In [9]:
import pandas as pd
from sklearn import preprocessing

In [2]:
print(pd.__version__)

1.1.0


In [12]:
exam_data = pd.read_csv('data/exams.csv', quotechar='"')
exam_data

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,male,group E,associate's degree,standard,completed,79,75,81
1,female,group C,associate's degree,free/reduced,none,56,65,64
2,male,group D,bachelor's degree,standard,none,86,68,74
3,female,group A,bachelor's degree,standard,none,68,78,76
4,female,group D,high school,free/reduced,none,49,68,61
...,...,...,...,...,...,...,...,...
95,female,group A,master's degree,free/reduced,completed,59,76,80
96,female,group D,high school,free/reduced,none,60,62,61
97,female,group C,some college,standard,none,69,82,83
98,male,group D,some high school,standard,none,89,83,79


In [13]:
math_average = exam_data["math score"].mean()
reading_average = exam_data["reading score"].mean()
writing_average = exam_data["writing score"].mean()

print(f'Math Average {math_average}')
print(f'Reading Average {reading_average}')
print(f'Writing Average {writing_average}')

Math Average 65.06
Reading Average 67.28
Writing Average 66.47


In [15]:
# standardadization
exam_data[["math_score"]] = preprocessing.scale(exam_data[["math score"]])
exam_data[["reading_score"]] = preprocessing.scale(exam_data[["reading score"]])
exam_data[["writing_score"]] = preprocessing.scale(exam_data[["writing score"]])


Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,math_score,reading_score,writing_score
0,male,group E,associate's degree,standard,completed,79,75,81,0.994557,0.574138,1.049901
1,female,group C,associate's degree,free/reduced,none,56,65,64,-0.646391,-0.169564,-0.178476
2,male,group D,bachelor's degree,standard,none,86,68,74,1.493976,0.053547,0.544099
3,female,group A,bachelor's degree,standard,none,68,78,76,0.209756,0.797248,0.688613
4,female,group D,high school,free/reduced,none,49,68,61,-1.145810,0.053547,-0.395248
...,...,...,...,...,...,...,...,...,...,...,...
95,female,group A,master's degree,free/reduced,completed,59,76,80,-0.432354,0.648508,0.977643
96,female,group D,high school,free/reduced,none,60,62,61,-0.361008,-0.392675,-0.395248
97,female,group C,some college,standard,none,69,82,83,0.281101,1.094729,1.194416
98,male,group D,some high school,standard,none,89,83,79,1.708013,1.169099,0.905386


In [18]:
le = preprocessing.LabelEncoder()
exam_data["gender"] = le.fit_transform(exam_data['gender'].astype(str))
exam_data.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,math_score,reading_score,writing_score
0,1,group E,associate's degree,standard,completed,79,75,81,0.994557,0.574138,1.049901
1,0,group C,associate's degree,free/reduced,none,56,65,64,-0.646391,-0.169564,-0.178476
2,1,group D,bachelor's degree,standard,none,86,68,74,1.493976,0.053547,0.544099
3,0,group A,bachelor's degree,standard,none,68,78,76,0.209756,0.797248,0.688613
4,0,group D,high school,free/reduced,none,49,68,61,-1.14581,0.053547,-0.395248


In [19]:
le.classes_

array(['0', '1'], dtype=object)

In [20]:
pd.get_dummies(exam_data['race/ethnicity'])

Unnamed: 0,group A,group B,group C,group D,group E
0,0,0,0,0,1
1,0,0,1,0,0
2,0,0,0,1,0
3,1,0,0,0,0
4,0,0,0,1,0
...,...,...,...,...,...
95,1,0,0,0,0
96,0,0,0,1,0
97,0,0,1,0,0
98,0,0,0,1,0


In [21]:
exam_data = pd.get_dummies(exam_data, columns=['race/ethnicity'])

In [22]:
exam_data

Unnamed: 0,gender,parental level of education,lunch,test preparation course,math score,reading score,writing score,math_score,reading_score,writing_score,race/ethnicity_group A,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E
0,1,associate's degree,standard,completed,79,75,81,0.994557,0.574138,1.049901,0,0,0,0,1
1,0,associate's degree,free/reduced,none,56,65,64,-0.646391,-0.169564,-0.178476,0,0,1,0,0
2,1,bachelor's degree,standard,none,86,68,74,1.493976,0.053547,0.544099,0,0,0,1,0
3,0,bachelor's degree,standard,none,68,78,76,0.209756,0.797248,0.688613,1,0,0,0,0
4,0,high school,free/reduced,none,49,68,61,-1.145810,0.053547,-0.395248,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0,master's degree,free/reduced,completed,59,76,80,-0.432354,0.648508,0.977643,1,0,0,0,0
96,0,high school,free/reduced,none,60,62,61,-0.361008,-0.392675,-0.395248,0,0,0,1,0
97,0,some college,standard,none,69,82,83,0.281101,1.094729,1.194416,0,0,1,0,0
98,1,some high school,standard,none,89,83,79,1.708013,1.169099,0.905386,0,0,0,1,0


In [23]:
exam_data = pd.get_dummies(exam_data, columns=['parental level of education', 'lunch', 'test preparation course'])
exam_data.head()

Unnamed: 0,gender,math score,reading score,writing score,math_score,reading_score,writing_score,race/ethnicity_group A,race/ethnicity_group B,race/ethnicity_group C,...,parental level of education_associate's degree,parental level of education_bachelor's degree,parental level of education_high school,parental level of education_master's degree,parental level of education_some college,parental level of education_some high school,lunch_free/reduced,lunch_standard,test preparation course_completed,test preparation course_none
0,1,79,75,81,0.994557,0.574138,1.049901,0,0,0,...,1,0,0,0,0,0,0,1,1,0
1,0,56,65,64,-0.646391,-0.169564,-0.178476,0,0,1,...,1,0,0,0,0,0,1,0,0,1
2,1,86,68,74,1.493976,0.053547,0.544099,0,0,0,...,0,1,0,0,0,0,0,1,0,1
3,0,68,78,76,0.209756,0.797248,0.688613,1,0,0,...,0,1,0,0,0,0,0,1,0,1
4,0,49,68,61,-1.14581,0.053547,-0.395248,0,0,0,...,0,0,1,0,0,0,1,0,0,1


In [None]:
# preprocessing text data