# Imports

In [9]:
import os

import pandas as pd

# Downloading and Reading Data

In [10]:
import kagglehub

# Fetch the dataset through kagglehub; the handle pins no version, and the call
# returns the location of the downloaded files.
DATASET_HANDLE = "lainguyn123/student-performance-factors"
path = kagglehub.dataset_download(DATASET_HANDLE)
print("Path to dataset files:", path)

Path to dataset files: C:\Users\Sarvamm\.cache\kagglehub\datasets\lainguyn123\student-performance-factors\versions\8


In [11]:
# Load the raw CSV from <project root>/data/raw instead of an absolute path that
# only exists on one machine. The project root is taken to be the parent of the
# current working directory, i.e. the notebook is assumed to run from a folder
# one level below the root.
raw_csv_path = os.path.join(
    os.path.dirname(os.getcwd()), "data", "raw", "StudentPerformanceFactors.csv"
)
data = pd.read_csv(raw_csv_path)

# Memory Usage

In [12]:
# Baseline footprint of the raw frame. deep=True makes pandas measure the actual
# contents of object columns rather than only their pointers.
bytes_before = data.memory_usage(deep=True).sum()
print("Memory usage before optimization:")
print(bytes_before / 1024**2, "MB")

Memory usage before optimization:
4.772937774658203 MB


# Data Cleaning

In [None]:

# Shared label -> code tables for the categorical columns.
_LOW_MEDIUM_HIGH = {'Low': 0, 'Medium': 1, 'High': 2}
_YES_NO = {'Yes': True, 'No': False}


def _encode_labels(column: pd.Series, mapping: dict, dtype: str) -> pd.Series:
    """Map every label in ``column`` through ``mapping`` and cast the result to ``dtype``.

    Raises:
        ValueError: if ``column`` contains a value that is not a key of ``mapping``.
    """
    encoded = column.map(mapping)
    # Series.map leaves NaN wherever a label has no entry in the mapping; fail
    # loudly here instead of letting the cast below hide or mangle it.
    unknown = column[encoded.isna()]
    if not unknown.empty:
        raise ValueError(
            f"Unexpected value(s) in column {column.name!r}: {unknown.unique().tolist()}"
        )
    return encoded.astype(dtype)


def _insert_after(frame: pd.DataFrame, anchor: str, name: str, values: pd.Series) -> None:
    """Insert ``values`` into ``frame`` (in place) as column ``name`` right after ``anchor``."""
    frame.insert(frame.columns.get_loc(anchor) + 1, name, values)


def clean_data(data: pd.DataFrame) -> pd.DataFrame:
    """Deduplicate, drop incomplete rows, downcast and encode the student-performance frame.

    Steps, in order:
      1. Drop fully duplicated rows, then rows with any missing value.
      2. Cast the seven integer columns to ``int8``.
      3. Append ``numeric_<col>`` (``int8``, Low/Medium/High -> 0/1/2) for
         Parental_Involvement, Access_to_Resources, Motivation_Level,
         Family_Income and Teacher_Quality.
      4. Insert ``bool_Extracurricular_Activities`` and ``bool_Internet_Access``
         (Yes/No -> True/False) directly after their source columns.
      5. Replace ``School_Type`` in place with its one-hot columns.
      6. Append ``numeric_Peer_Influence`` (``int8``, Negative/Neutral/Positive ->
         -1/0/1) and ``numeric_Learning_Disabilities`` (Yes/No -> True/False).
      7. Insert ``numeric_Parental_Education_Level`` and
         ``numeric_Distance_from_Home`` (``int8``, 0/1/2) directly after their
         source columns.

    The original categorical columns are kept, except ``School_Type``.

    Args:
        data: Raw frame containing the columns named above. It is not modified;
            every step works on a new frame.

    Returns:
        The cleaned and encoded DataFrame.

    Raises:
        ValueError: if an encoded categorical column contains a label that is
            not part of its mapping.
    """
    # drop_duplicates / dropna / astype each return a new frame, so the
    # caller's object is left untouched.
    data = data.drop_duplicates().dropna()

    # NOTE: int8 only spans -128..127; this assumes every column listed here
    # stays inside that range.
    int8_columns = ['Sleep_Hours', 'Hours_Studied', 'Attendance', 'Previous_Scores',
                    'Tutoring_Sessions', 'Physical_Activity', 'Exam_Score']
    data = data.astype({name: 'int8' for name in int8_columns})

    # Ordinal Low/Medium/High columns, appended at the end in this order.
    for source in ['Parental_Involvement', 'Access_to_Resources', 'Motivation_Level',
                   'Family_Income', 'Teacher_Quality']:
        data['numeric_' + source] = _encode_labels(data[source], _LOW_MEDIUM_HIGH, 'int8')

    # Yes/No flags, placed next to the column they are derived from. This is
    # position-independent, unlike fixed insert indices, which silently put the
    # new column in the wrong place if the input layout shifts.
    _insert_after(data, 'Extracurricular_Activities', 'bool_Extracurricular_Activities',
                  _encode_labels(data['Extracurricular_Activities'], _YES_NO, 'bool'))
    _insert_after(data, 'Internet_Access', 'bool_Internet_Access',
                  _encode_labels(data['Internet_Access'], _YES_NO, 'bool'))

    # One-hot encode 'School_Type', splicing the dummy columns in where the
    # original column was.
    insert_loc = data.columns.get_loc('School_Type')
    data = pd.concat(
        [
            data.iloc[:, :insert_loc],
            pd.get_dummies(data.loc[:, ['School_Type']]),
            data.iloc[:, insert_loc + 1:],
        ],
        axis=1,
    )

    data['numeric_Peer_Influence'] = _encode_labels(
        data['Peer_Influence'], {'Negative': -1, 'Neutral': 0, 'Positive': 1}, 'int8')
    data['numeric_Learning_Disabilities'] = _encode_labels(
        data['Learning_Disabilities'], _YES_NO, 'bool')

    # Explicit label tables: "High School" is matched by name, not by counting
    # the spaces in the label.
    _insert_after(data, 'Parental_Education_Level', 'numeric_Parental_Education_Level',
                  _encode_labels(data['Parental_Education_Level'],
                                 {'High School': 0, 'College': 1, 'Postgraduate': 2}, 'int8'))
    _insert_after(data, 'Distance_from_Home', 'numeric_Distance_from_Home',
                  _encode_labels(data['Distance_from_Home'],
                                 {'Near': 0, 'Moderate': 1, 'Far': 2}, 'int8'))
    return data

# Hand the pipeline its own copy of the raw frame, then preview the first rows
# of the cleaned result.
raw_copy = data.copy()
df = clean_data(raw_copy)
df.head()

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,bool_Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,...,numeric_Distance_from_Home,Gender,Exam_Score,numeric_Parental_Involvement,numeric_Access_to_Resources,numeric_Motivation_Level,numeric_Family_Income,numeric_Teacher_Quality,numeric_Peer_Influence,numeric_Learning_Disabilities
0,23,84,Low,High,No,False,7,73,Low,Yes,...,0,Male,67,0,2,0,0,1,1,False
1,19,64,Low,Medium,No,False,8,59,Low,Yes,...,1,Female,61,0,1,0,1,1,-1,False
2,24,98,Medium,Medium,Yes,True,7,91,Medium,Yes,...,0,Male,74,1,1,1,1,1,0,False
3,29,89,Low,Medium,Yes,True,8,98,Medium,Yes,...,1,Male,71,0,1,1,1,1,-1,False
4,19,92,Medium,Medium,Yes,True,6,65,Medium,Yes,...,0,Female,70,1,1,1,1,2,0,False


# Saved some memory

In [15]:
# Footprint of the cleaned frame, measured the same way as the baseline
# (deep=True, reported in units of 1024**2 bytes).
bytes_after = df.memory_usage(deep=True).sum()
print("Memory usage after optimization:")
print(bytes_after / 1024**2, "MB")

Memory usage after optimization:
4.105851173400879 MB


# Saving the data

In [None]:
import os

# Parent folder of the current working directory. The value is bound to `_`
# and echoed as the cell output by the bare expression on the last line.
_ = os.path.split(os.getcwd())[0]
_

In [24]:
# Write the cleaned frame to <parent of cwd>/data/processed/data.csv.
# The target directory is computed here rather than read from `_`, which
# IPython normally uses for the last cell output.
processed_dir = os.path.join(os.path.dirname(os.getcwd()), "data", "processed")
# to_csv does not create missing folders, so make sure the directory exists.
os.makedirs(processed_dir, exist_ok=True)
df.to_csv(os.path.join(processed_dir, "data.csv"), index=False)