**Loading the prerequisites**

In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

**Loading the dataset**

In [9]:
# Keep the input location in one named constant so it is easy to find and change.
# NOTE(review): this is an absolute path — presumably the notebook's upload
# location; confirm it exists wherever the notebook is re-run.
RAW_CSV_PATH = '/content/Raw_student_performance.csv'
df = pd.read_csv(RAW_CSV_PATH)
# Preview the first two rows to sanity-check the parsed columns.
df.head(n=2)

Unnamed: 0,StudentID,Name,Gender,AttendanceRate,StudyHoursPerWeek,PreviousGrade,ExtracurricularActivities,ParentalSupport,FinalGrade,Study Hours,Attendance (%),Online Classes Taken
0,1.0,John,Male,85.0,15.0,78.0,1.0,High,80.0,4.8,59.0,False
1,2.0,Sarah,Female,90.0,20.0,85.0,2.0,Medium,87.0,2.2,70.0,True


## Preprocessing
1. Dropping the columns that are not required

In [10]:
# Remove the identifier columns (student ID and name) from the frame.
# Reassigning with errors="ignore" instead of using inplace=True keeps this cell
# safe to re-run: a second execution is a no-op rather than a KeyError for
# columns that are already gone.
df = df.drop(columns=["StudentID", "Name"], errors="ignore")
df.head(2)

Unnamed: 0,Gender,AttendanceRate,StudyHoursPerWeek,PreviousGrade,ExtracurricularActivities,ParentalSupport,FinalGrade,Study Hours,Attendance (%),Online Classes Taken
0,Male,85.0,15.0,78.0,1.0,High,80.0,4.8,59.0,False
1,Female,90.0,20.0,85.0,2.0,Medium,87.0,2.2,70.0,True


In [21]:
# Discard the "Attendance (%)" and "Study Hours" columns.
# NOTE(review): the reason for dropping them is not stated — presumably they
# overlap with AttendanceRate / StudyHoursPerWeek; confirm and document it.
# NOTE(review): this cell was executed after the imputation/encoding cells
# (see its execution count), so the stored outputs further down still list
# these two columns; re-run the notebook top to bottom to refresh them.
# errors="ignore" keeps the cell re-runnable once the columns are already gone.
df = df.drop(columns=["Attendance (%)", "Study Hours"], errors="ignore")

In [12]:
# Dimensions of the frame as (row count, column count).
(len(df.index), len(df.columns))

(1000, 10)

In [11]:
# Check for missing values: df.info() lists the non-null count and dtype of
# every column, so a count below the number of rows means entries are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Gender                     952 non-null    object 
 1   AttendanceRate             960 non-null    float64
 2   StudyHoursPerWeek          950 non-null    float64
 3   PreviousGrade              967 non-null    float64
 4   ExtracurricularActivities  957 non-null    float64
 5   ParentalSupport            978 non-null    object 
 6   FinalGrade                 960 non-null    float64
 7   Study Hours                976 non-null    float64
 8   Attendance (%)             959 non-null    float64
 9   Online Classes Taken       975 non-null    object 
dtypes: float64(7), object(3)
memory usage: 78.3+ KB


There are many missing values in the dataset: every column has some, with non-null counts ranging from 950 to 978 out of 1000 rows.

**2. Imputing missing values with the mean for numerical columns**

In [13]:
# Fill the gaps in each numeric column with that column's own mean.
# Non-numeric columns have no entry in `column_means`, so they are left as-is.
# NOTE(review): this also imputes FinalGrade — if that is the prediction target,
# consider whether mean-filling it is intended.
column_means = df.mean(numeric_only=True)
df = df.fillna(column_means)

In [14]:
# Re-inspect the non-null counts to confirm the numeric columns are now complete;
# the object-dtype columns are handled in the next step.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Gender                     952 non-null    object 
 1   AttendanceRate             1000 non-null   float64
 2   StudyHoursPerWeek          1000 non-null   float64
 3   PreviousGrade              1000 non-null   float64
 4   ExtracurricularActivities  1000 non-null   float64
 5   ParentalSupport            978 non-null    object 
 6   FinalGrade                 1000 non-null   float64
 7   Study Hours                1000 non-null   float64
 8   Attendance (%)             1000 non-null   float64
 9   Online Classes Taken       975 non-null    object 
dtypes: float64(7), object(3)
memory usage: 78.3+ KB


**3. Imputing missing categorical values with the mode**

In [15]:
# Collect the object-dtype (categorical) columns; `cat_cols` is reused by the
# label-encoding step further down.
cat_cols = df.select_dtypes(include=["object"]).columns
for col in cat_cols:
    modes = df[col].mode()
    # mode() ignores NaN, so a column with no observed values yields an empty
    # Series and indexing its first element would raise; leave such a column
    # untouched instead of crashing.
    if modes.empty:
        continue
    # Replace the missing entries with the first reported most-frequent value.
    df[col] = df[col].fillna(modes.iloc[0])

  df[col] = df[col].fillna(df[col].mode()[0])


**4. Label encoding categorical columns**

In [16]:
# Encode each categorical column as integer codes.
# A separate LabelEncoder is fitted for every column and kept in `label_encoders`
# (column name -> fitted encoder). A single shared encoder would be refit on each
# column and only remember the last one, so the code -> label mapping
# (`classes_`) and inverse_transform() would be unavailable for the others.
# Note: LabelEncoder numbers the classes in sorted order, so the codes carry no
# meaningful ranking (e.g. ParentalSupport: 'High' -> 0, 'Medium' -> 2).
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [17]:
# Preview the first five rows to check the imputed and encoded values.
df.head(n=5)

Unnamed: 0,Gender,AttendanceRate,StudyHoursPerWeek,PreviousGrade,ExtracurricularActivities,ParentalSupport,FinalGrade,Study Hours,Attendance (%),Online Classes Taken
0,1,85.0,15.0,78.0,1.0,0,80.0,4.8,59.0,0
1,0,90.0,20.0,85.0,2.0,2,87.0,2.2,70.0,1
2,1,78.0,10.0,65.0,0.0,1,68.0,4.6,92.0,0
3,1,92.0,25.0,90.0,3.0,0,92.0,2.9,96.0,0
4,0,85.510417,18.0,82.0,2.0,2,85.0,4.1,97.0,1


In [22]:
# Show the remaining column names as a plain Python list.
print(list(df.columns))


['Gender', 'AttendanceRate', 'StudyHoursPerWeek', 'PreviousGrade', 'ExtracurricularActivities', 'ParentalSupport', 'FinalGrade', 'Online Classes Taken']


In [23]:
# Persist the preprocessed frame; index=False keeps the row index out of the CSV.
df.to_csv(path_or_buf="cleaned_student_data.csv", index=False)