Data Wrangling II
Create an “Academic performance” dataset of students and perform the following operations using
Python.
1. Scan all variables for missing values and inconsistencies. If there are missing values and/or
inconsistencies, use any of the suitable techniques to deal with them.
2. Scan all numeric variables for outliers. If there are outliers, use any of the suitable
techniques to deal with them.
3. Apply data transformations on at least one of the variables. The purpose of this
transformation should be one of the following reasons: to change the scale for better
understanding of the variable, to convert a non-linear relation into a linear one, or to
decrease the skewness and convert the distribution into a normal distribution.

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path


In [10]:
# Location of the raw dataset, kept in one named constant so it is easy to
# repoint. NOTE(review): '/content' is presumably the Colab working directory;
# confirm before running elsewhere.
DATA_PATH = Path('/content/StudentPerformanceFactors.csv')

# Read the CSV into the working DataFrame used by every later cell.
df = pd.read_csv(DATA_PATH)

# Bare last expression: rich (truncated) display of the loaded frame.
df

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
0,23,84,Low,High,No,7,73,Low,Yes,0,Low,Medium,Public,Positive,3,No,High School,Near,Male,67
1,19,64,Low,Medium,No,8,59,Low,Yes,2,Medium,Medium,Public,Negative,4,No,College,Moderate,Female,61
2,24,98,Medium,Medium,Yes,7,91,Medium,Yes,2,Medium,Medium,Public,Neutral,4,No,Postgraduate,Near,Male,74
3,29,89,Low,Medium,Yes,8,98,Medium,Yes,1,Medium,Medium,Public,Negative,4,No,High School,Moderate,Male,71
4,19,92,Medium,Medium,Yes,6,65,Medium,Yes,3,Medium,High,Public,Neutral,4,No,College,Near,Female,70
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6602,25,69,High,Medium,No,7,76,Medium,Yes,1,High,Medium,Public,Positive,2,No,High School,Near,Female,68
6603,23,76,High,Medium,No,8,81,Medium,Yes,3,Low,High,Public,Positive,2,No,High School,Near,Female,69
6604,20,90,Medium,Low,Yes,6,65,Low,Yes,3,Low,Medium,Public,Negative,2,No,Postgraduate,Near,Female,68
6605,10,86,High,High,Yes,6,91,High,Yes,2,Low,Medium,Private,Positive,3,No,High School,Far,Female,68


In [11]:
# 1. SCAN & DEAL WITH MISSING VALUES AND INCONSISTENCIES
print("--- Step 1: Missing Values ---")
print(df.isnull().sum())  # Scan for nulls (reported BEFORE any imputation)

# Handle Inconsistencies: standardizing categorical text (strip spaces & fix casing).
# This runs before imputation so that variants such as " high" and "High" are
# counted as one category when the most frequent value is computed below.
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].str.strip().str.title()
print("\nInconsistencies handled (Text standardized to Title Case).")

# Handle Missing Values: the scan above only reports nulls, so impute them here.
#   - numeric columns     -> median (not pulled around by extreme values)
#   - non-numeric columns -> mode   (most frequent category)
for col in df.columns[df.isnull().any()]:
    observed = df[col].dropna()
    if observed.empty:
        # No observed values to derive a fill value from; leave the column
        # untouched rather than inventing data.
        continue
    if pd.api.types.is_numeric_dtype(df[col]):
        fill_value = observed.median()
    else:
        # mode() returns its values sorted, so .iloc[0] is deterministic on ties.
        fill_value = observed.mode().iloc[0]
    df[col] = df[col].fillna(fill_value)
    assert df[col].notna().all(), f"{col} still contains missing values"

--- Step 1: Missing Values ---
Hours_Studied                  0
Attendance                     0
Parental_Involvement           0
Access_to_Resources            0
Extracurricular_Activities     0
Sleep_Hours                    0
Previous_Scores                0
Motivation_Level               0
Internet_Access                0
Tutoring_Sessions              0
Family_Income                  0
Teacher_Quality               78
School_Type                    0
Peer_Influence                 0
Physical_Activity              0
Learning_Disabilities          0
Parental_Education_Level      90
Distance_from_Home            67
Gender                         0
Exam_Score                     0
dtype: int64

Inconsistencies handled (Text standardized to Title Case).


In [12]:
# 2. SCAN & DEAL WITH OUTLIERS
# For every numeric column: build the 1.5 * IQR fences, report how many values
# fall outside them, then cap those values at the fences.
print("\n--- Step 2: Outliers (IQR Method) ---")
numeric_cols = df.select_dtypes(include=[np.number]).columns

for col in numeric_cols:
    # Quartiles and interquartile range of the current column.
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1

    # Fences: anything beyond 1.5 * IQR from the quartiles counts as an outlier.
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR

    # Count the values lying strictly outside the fences.
    below_fence = df[col].lt(lower)
    above_fence = df[col].gt(upper)
    outlier_count = (below_fence | above_fence).sum()
    print(f"{col}: {outlier_count} outliers found.")

    # Cap (winsorize) the column at the fences; values inside are unchanged.
    df[col] = df[col].clip(lower, upper)


--- Step 2: Outliers (IQR Method) ---
Hours_Studied: 43 outliers found.
Attendance: 0 outliers found.
Sleep_Hours: 0 outliers found.
Previous_Scores: 0 outliers found.
Tutoring_Sessions: 430 outliers found.
Physical_Activity: 0 outliers found.
Exam_Score: 104 outliers found.


In [15]:
# --- Step 3: Data Transformation ---
# (a) log(1 + x) transform of study hours, stored in a new column.
#     Intended to decrease skewness; log1p is also defined at x = 0.
hours_studied = df['Hours_Studied']
df['Hours_Studied_log'] = np.log1p(hours_studied)

# (b) Z-score standardization of the exam score, stored in a new column:
#     subtract the column mean, then divide by the column standard deviation.
exam_score = df['Exam_Score']
centered_score = exam_score - exam_score.mean()
df['Exam_Score_scaled'] = centered_score / exam_score.std()

# Show the original and transformed columns side by side for the first rows.
preview_cols = ['Hours_Studied', 'Hours_Studied_log', 'Exam_Score', 'Exam_Score_scaled']
print("--- Step 3: Transformation Results ---")
print(df[preview_cols].head())

--- Step 3: Transformation Results ---
   Hours_Studied  Hours_Studied_log  Exam_Score  Exam_Score_scaled
0             23           3.178054          67          -0.037694
1             19           2.995732          61          -1.816574
2             24           3.218876          74           2.037667
3             29           3.401197          71           1.148227
4             19           2.995732          70           0.851746
