In [98]:
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt 

In [99]:
# Load the post-EDA dataset from the project's data directory.
data_file_path = "data/data_after_eda.csv"
df = pd.read_csv(filepath_or_buffer=data_file_path)

In [100]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score,total_score,avg_score
0,female,group B,bachelor's degree,standard,none,72,72,74,218,72.666667
1,female,group C,some college,standard,completed,69,90,88,247,82.333333
2,female,group B,master's degree,standard,none,90,95,93,278,92.666667
3,male,group A,associate's degree,free/reduced,none,47,57,44,148,49.333333
4,male,group C,some college,standard,none,76,78,75,229,76.333333


In [101]:
# List the column labels as loaded (snake_case names at this point).
df.columns

Index(['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch',
       'test_preparation_course', 'math_score', 'reading_score',
       'writing_score', 'total_score', 'avg_score'],
      dtype='object')

In [102]:
# Make the column labels human-readable: swap every underscore for a space.
df.columns = [label.replace('_', ' ') for label in df.columns]

In [103]:
# Confirm the labels now use spaces instead of underscores.
df.columns

Index(['gender', 'race ethnicity', 'parental level of education', 'lunch',
       'test preparation course', 'math score', 'reading score',
       'writing score', 'total score', 'avg score'],
      dtype='object')

In [104]:
# Inspect the distinct race/ethnicity labels before simplifying them.
pd.unique(df['race ethnicity'])

array(['group B', 'group C', 'group A', 'group D', 'group E'],
      dtype=object)

In [105]:
# Strip the "group " prefix so each race/ethnicity value is just its letter.
race_map = {
    'group A': 'A',
    'group B': 'B',
    'group C': 'C',
    'group D': 'D',
    'group E': 'E',
}

# replace() is used instead of map(): map() converts every value that is not a
# key of the dict into NaN, so running this cell a second time (when the values
# are already 'A'..'E') or meeting an unlisted label would silently blank the
# column. replace() leaves unmatched values untouched.
df['race ethnicity'] = df['race ethnicity'].replace(race_map)

In [106]:
# Preview the first five rows to check the shortened race ethnicity values.
df.head(n=5)

Unnamed: 0,gender,race ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,total score,avg score
0,female,B,bachelor's degree,standard,none,72,72,74,218,72.666667
1,female,C,some college,standard,completed,69,90,88,247,82.333333
2,female,B,master's degree,standard,none,90,95,93,278,92.666667
3,male,A,associate's degree,free/reduced,none,47,57,44,148,49.333333
4,male,C,some college,standard,none,76,78,75,229,76.333333


In [107]:
# The values are now bare group letters, so make the column name say "group".
df = df.rename(columns={'race ethnicity': 'race ethnicity group'})

In [108]:
# Preview the first five rows to confirm the column is now 'race ethnicity group'.
df.head(n=5)

Unnamed: 0,gender,race ethnicity group,parental level of education,lunch,test preparation course,math score,reading score,writing score,total score,avg score
0,female,B,bachelor's degree,standard,none,72,72,74,218,72.666667
1,female,C,some college,standard,completed,69,90,88,247,82.333333
2,female,B,master's degree,standard,none,90,95,93,278,92.666667
3,male,A,associate's degree,free/reduced,none,47,57,44,148,49.333333
4,male,C,some college,standard,none,76,78,75,229,76.333333


## Fixing parental level of education

In [109]:
# Inspect the distinct parental education labels before shortening them.
pd.unique(df['parental level of education'])

array(["bachelor's degree", 'some college', "master's degree",
       "associate's degree", 'high school', 'some high school'],
      dtype=object)

In [110]:
# Shorten the parental education labels to compact, apostrophe-free tokens.
education_map = {
    "bachelor's degree": 'bachelor',
    "some college": 'some_college',
    "master's degree": 'masters',
    "associate's degree": 'associates',
    "high school": 'high_school',
    "some high school": 'some_high_school',
}

# replace() is used instead of map(): map() converts every value that is not a
# key of the dict into NaN, so running this cell twice (the values are already
# shortened the second time) would silently turn the whole column into NaN.
# replace() leaves unmatched values untouched, making the cell safe to re-run.
df['parental level of education'] = df['parental level of education'].replace(education_map)


In [111]:
# Preview the first five rows to check the shortened parental education labels.
df.head(n=5)

Unnamed: 0,gender,race ethnicity group,parental level of education,lunch,test preparation course,math score,reading score,writing score,total score,avg score
0,female,B,bachelor,standard,none,72,72,74,218,72.666667
1,female,C,some_college,standard,completed,69,90,88,247,82.333333
2,female,B,masters,standard,none,90,95,93,278,92.666667
3,male,A,associates,free/reduced,none,47,57,44,148,49.333333
4,male,C,some_college,standard,none,76,78,75,229,76.333333


In [112]:
# Inspect the distinct values of this column; it is left unchanged here and
# will be handled in the preprocessing phase.
pd.unique(df['test preparation course'])

array(['none', 'completed'], dtype=object)

In [113]:
# Preview the first five rows (the frame has not changed since the last preview).
df.head(n=5)

Unnamed: 0,gender,race ethnicity group,parental level of education,lunch,test preparation course,math score,reading score,writing score,total score,avg score
0,female,B,bachelor,standard,none,72,72,74,218,72.666667
1,female,C,some_college,standard,completed,69,90,88,247,82.333333
2,female,B,masters,standard,none,90,95,93,278,92.666667
3,male,A,associates,free/reduced,none,47,57,44,148,49.333333
4,male,C,some_college,standard,none,76,78,75,229,76.333333


In [114]:
# Preview the first five rows once more (no changes since the previous cell).
df.head(n=5)

Unnamed: 0,gender,race ethnicity group,parental level of education,lunch,test preparation course,math score,reading score,writing score,total score,avg score
0,female,B,bachelor,standard,none,72,72,74,218,72.666667
1,female,C,some_college,standard,completed,69,90,88,247,82.333333
2,female,B,masters,standard,none,90,95,93,278,92.666667
3,male,A,associates,free/reduced,none,47,57,44,148,49.333333
4,male,C,some_college,standard,none,76,78,75,229,76.333333


In [117]:
# Drop the aggregate score columns.
# NOTE(review): presumably removed because they are derived from math score
# (the prediction target) and would leak it into the features — confirm.
df = df.drop(['total score', 'avg score'], axis='columns')

In [118]:
# Preview the first five rows to confirm 'total score' and 'avg score' are gone.
df.head(n=5)

Unnamed: 0,gender,race ethnicity group,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,B,bachelor,standard,none,72,72,74
1,female,C,some_college,standard,completed,69,90,88
2,female,B,masters,standard,none,90,95,93
3,male,A,associates,free/reduced,none,47,57,44
4,male,C,some_college,standard,none,76,78,75


In [119]:
# Persist the cleaned frame next to the input data; index=False keeps the
# row index out of the CSV so no extra unnamed column is written.
cleaned_data_file_path = "data/cleaned_students.csv"
df.to_csv(path_or_buf=cleaned_data_file_path, index=False)

## Final Task

- We are going to build a model that predicts the math score from gender, race/ethnicity group, parental level of education, lunch, test preparation course, reading score, and writing score.