In [2]:
import pandas as pd
import os

In [13]:
try:
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
except NameError:
    # Fallback for Jupyter or interactive mode
    BASE_DIR = os.getcwd()

file_path = os.path.join(BASE_DIR, "data", "StudentPerformanceFactors.csv")
df = pd.read_csv(file_path)


In [14]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6607 entries, 0 to 6606
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Hours_Studied               6607 non-null   int64 
 1   Attendance                  6607 non-null   int64 
 2   Parental_Involvement        6607 non-null   object
 3   Access_to_Resources         6607 non-null   object
 4   Extracurricular_Activities  6607 non-null   object
 5   Sleep_Hours                 6607 non-null   int64 
 6   Previous_Scores             6607 non-null   int64 
 7   Motivation_Level            6607 non-null   object
 8   Internet_Access             6607 non-null   object
 9   Tutoring_Sessions           6607 non-null   int64 
 10  Family_Income               6607 non-null   object
 11  Teacher_Quality             6529 non-null   object
 12  School_Type                 6607 non-null   object
 13  Peer_Influence              6607 non-null   obje

In [15]:
print(df.head())

   Hours_Studied  Attendance Parental_Involvement Access_to_Resources  \
0             23          84                  Low                High   
1             19          64                  Low              Medium   
2             24          98               Medium              Medium   
3             29          89                  Low              Medium   
4             19          92               Medium              Medium   

  Extracurricular_Activities  Sleep_Hours  Previous_Scores Motivation_Level  \
0                         No            7               73              Low   
1                         No            8               59              Low   
2                        Yes            7               91           Medium   
3                        Yes            8               98           Medium   
4                        Yes            6               65           Medium   

  Internet_Access  Tutoring_Sessions Family_Income Teacher_Quality  \
0             Ye

In [12]:
rows, cols = df.shape
print(f"The dataset contains {rows} rows and {cols} columns.")

The dataset contains 6607 rows and 20 columns.


In [19]:
df.dtypes

Hours_Studied                  int64
Attendance                     int64
Parental_Involvement          object
Access_to_Resources           object
Extracurricular_Activities    object
Sleep_Hours                    int64
Previous_Scores                int64
Motivation_Level              object
Internet_Access               object
Tutoring_Sessions              int64
Family_Income                 object
Teacher_Quality               object
School_Type                   object
Peer_Influence                object
Physical_Activity              int64
Learning_Disabilities         object
Parental_Education_Level      object
Distance_from_Home            object
Gender                        object
Exam_Score                     int64
dtype: object

In [20]:
categorical_cols = df.select_dtypes(include='object').columns
numerical_cols = df.select_dtypes(include='number').columns

print("Categorical columns:", categorical_cols)
print("Numerical columns:", numerical_cols)


Categorical columns: Index(['Parental_Involvement', 'Access_to_Resources',
       'Extracurricular_Activities', 'Motivation_Level', 'Internet_Access',
       'Family_Income', 'Teacher_Quality', 'School_Type', 'Peer_Influence',
       'Learning_Disabilities', 'Parental_Education_Level',
       'Distance_from_Home', 'Gender'],
      dtype='object')
Numerical columns: Index(['Hours_Studied', 'Attendance', 'Sleep_Hours', 'Previous_Scores',
       'Tutoring_Sessions', 'Physical_Activity', 'Exam_Score'],
      dtype='object')


# Data cleaning

In [16]:
# check missing data
df.isnull().sum()


Hours_Studied                  0
Attendance                     0
Parental_Involvement           0
Access_to_Resources            0
Extracurricular_Activities     0
Sleep_Hours                    0
Previous_Scores                0
Motivation_Level               0
Internet_Access                0
Tutoring_Sessions              0
Family_Income                  0
Teacher_Quality               78
School_Type                    0
Peer_Influence                 0
Physical_Activity              0
Learning_Disabilities          0
Parental_Education_Level      90
Distance_from_Home            67
Gender                         0
Exam_Score                     0
dtype: int64

In [21]:
df['Teacher_Quality'].fillna(df['Teacher_Quality'].mode()[0], inplace=True)
df['Parental_Education_Level'].fillna(df['Parental_Education_Level'].mode()[0], inplace=True)
df['Distance_from_Home'].fillna(df['Distance_from_Home'].mode()[0], inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Teacher_Quality'].fillna(df['Teacher_Quality'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Parental_Education_Level'].fillna(df['Parental_Education_Level'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method wi

In [23]:
#label encoding
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for col in categorical_cols:
    df[col] = le.fit_transform(df[col])

In [24]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaled_df = df.copy()
scaled_df[df.columns] = scaler.fit_transform(df[df.columns])


In [25]:
df.drop_duplicates(inplace=True)


In [26]:
print(scaled_df.describe().T)


                             count          mean       std       min  \
Hours_Studied               6607.0 -1.161474e-16  1.000076 -3.167760   
Attendance                  6607.0 -3.089199e-16  1.000076 -1.730158   
Parental_Involvement        6607.0  2.823028e-17  1.000076 -1.409559   
Access_to_Resources         6607.0 -2.688598e-18  1.000076 -1.380517   
Extracurricular_Activities  6607.0  1.312036e-16  1.000076 -1.214685   
Sleep_Hours                 6607.0 -2.016449e-16  1.000076 -2.063380   
Previous_Scores             6607.0  2.048712e-16  1.000076 -1.741167   
Motivation_Level            6607.0 -2.054089e-16  1.000076 -1.671088   
Internet_Access             6607.0 -8.818602e-17  1.000076 -3.498640   
Tutoring_Sessions           6607.0  7.850707e-17  1.000076 -1.213934   
Family_Income               6607.0 -4.194213e-17  1.000076 -1.632214   
Teacher_Quality             6607.0  9.463866e-17  1.000076 -1.462550   
School_Type                 6607.0  1.226001e-16  1.000076 -1.51

In [27]:
X = df.drop('Exam_Score', axis=1)
y = df['Exam_Score']
X_scaled = scaler.fit_transform(X)
scaled_df = pd.DataFrame(X_scaled, columns=X.columns)
scaled_df['Exam_Score'] = y  # reattach target


In [28]:
output_path = os.path.join(BASE_DIR, "data", "StudentPerformanceFactors_Cleaned.csv")
scaled_df.to_csv(output_path, index=False)
print(f"✅ Cleaned dataset saved to: {output_path}")


✅ Cleaned dataset saved to: f:\Omar 3amora\Elevvo Tech\Task 1 Student Score Prediction\data\StudentPerformanceFactors_Cleaned.csv
