In [7]:
import pandas as pd
import numpy as np
import scipy
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing

In [8]:
df = pd.read_csv("StudentsPerformance.csv")


In [9]:
df

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72.0,72.0,74.0
1,female,group C,some college,standard,completed,69.0,90.0,88.0
2,female,group B,master's degree,standard,none,90.0,95.0,93.0
3,male,group A,associate's degree,free/reduced,none,47.0,57.0,
4,male,group C,some college,standard,none,76.0,78.0,75.0
...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88.0,99.0,95.0
996,male,group C,high school,free/reduced,none,62.0,55.0,55.0
997,female,group C,high school,free/reduced,completed,59.0,71.0,65.0
998,female,group D,some college,standard,completed,68.0,78.0,77.0


In [10]:
df.describe()

Unnamed: 0,math score,reading score,writing score
count,994.0,994.0,997.0
mean,66.230382,69.482897,68.402207
std,16.054484,18.283165,18.123287
min,0.0,11.0,0.0
25%,57.0,59.0,58.0
50%,66.0,70.0,69.0
75%,77.0,79.75,79.0
max,200.0,400.0,350.0


In [11]:
df.head() # describes first 5 records

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72.0,72.0,74.0
1,female,group C,some college,standard,completed,69.0,90.0,88.0
2,female,group B,master's degree,standard,none,90.0,95.0,93.0
3,male,group A,associate's degree,free/reduced,none,47.0,57.0,
4,male,group C,some college,standard,none,76.0,78.0,75.0


In [12]:
df.tail()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
995,female,group E,master's degree,standard,completed,88.0,99.0,95.0
996,male,group C,high school,free/reduced,none,62.0,55.0,55.0
997,female,group C,high school,free/reduced,completed,59.0,71.0,65.0
998,female,group D,some college,standard,completed,68.0,78.0,77.0
999,female,group D,some college,free/reduced,none,77.0,86.0,86.0


In [13]:
df.shape

(1000, 8)

In [14]:
df.columns

Index(['gender', 'race/ethnicity', 'parental level of education', 'lunch',
       'test preparation course', 'math score', 'reading score',
       'writing score'],
      dtype='object')

In [15]:
df.dtypes

gender                          object
race/ethnicity                  object
parental level of education     object
lunch                           object
test preparation course         object
math score                     float64
reading score                  float64
writing score                  float64
dtype: object

# Check missing values

In [16]:
df.isnull().sum()


gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     6
reading score                  6
writing score                  3
dtype: int64

# Fill null values with 0


In [21]:
df1 = df.fillna(0)

In [22]:
df3 = df.dropna()

In [23]:
df3

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72.0,72.0,74.0
1,female,group C,some college,standard,completed,69.0,90.0,88.0
2,female,group B,master's degree,standard,none,90.0,95.0,93.0
4,male,group C,some college,standard,none,76.0,78.0,75.0
5,female,group B,associate's degree,standard,none,71.0,83.0,78.0
...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88.0,99.0,95.0
996,male,group C,high school,free/reduced,none,62.0,55.0,55.0
997,female,group C,high school,free/reduced,completed,59.0,71.0,65.0
998,female,group D,some college,standard,completed,68.0,78.0,77.0


In [24]:
norm = stats.zscore(df1['math score'])

In [25]:
norm

array([ 0.36717239,  0.18855764,  1.43886088, -1.12128385,  0.60532538,
        0.30763414,  1.31978438,  3.22500836, -0.10913361, -1.6571281 ,
       -0.46636311, -1.5380516 , -3.91958158,  0.72440188, -0.94266911,
        0.18855764,  1.31978438, -2.84789309, -1.1808221 , -0.70451611,
        0.00994289, -0.04959536, -1.2998986 ,  0.18855764,  0.48624888,
        0.42671063,  0.18855764,  0.06948114,  0.24809589, -0.22821011,
        0.18855764, -0.16867186, -0.58543961, -1.5380516 ,  1.85562862,
        0.90301663,  0.48624888, -0.94266911,  0.54578713, -3.32419909,
       -0.64497786, -0.46636311, -0.76405436, -0.40682486, -0.94266911,
       -0.04959536, -0.64497786,  0.00994289, -0.52590136,  0.96255488,
       -0.76405436,  0.66486363, -0.76405436,  1.31978438,  0.30763414,
       -1.95481935,  0.96255488, -0.82359261, -0.46636311, -3.91958158,
        0.78394013, -1.59758985, -0.22821011,  0.18855764, -0.40682486,
        0.06948114, -1.24036035, -0.34728661, -0.28774836, -1.59

In [36]:
dm = pd.get_dummies(df1[['gender', 'race/ethnicity', 'lunch']])

In [37]:
dm

Unnamed: 0,gender_female,gender_male,race/ethnicity_group A,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E,lunch_free/reduced,lunch_standard
0,True,False,False,True,False,False,False,False,True
1,True,False,False,False,True,False,False,False,True
2,True,False,False,True,False,False,False,False,True
3,False,True,True,False,False,False,False,True,False
4,False,True,False,False,True,False,False,False,True
...,...,...,...,...,...,...,...,...,...
995,True,False,False,False,False,False,True,False,True
996,False,True,False,False,True,False,False,True,False
997,True,False,False,False,True,False,False,True,False
998,True,False,False,False,False,True,False,False,True
