In [1]:
# Import the packages used for statistical analysis and data plotting, printing each version for reproducibility
import pandas as pd
print("Pandas imported successfully, version:"+pd.__version__)
import statsmodels as sm
print("Statsmodels imported successfully, version:"+sm.__version__)

import scipy
# NOTE(review): later cells call `pearsonr` and `stats.<function>` unqualified; both names appear to come from this star import
from scipy.stats import *
print("SciPy imported successfully, version: "+scipy.__version__)

import matplotlib
import plotly
print("Plotly imported successfully, version: "+plotly.__version__)
from plotly.graph_objs import * # brings the plot classes into scope (later cells use Figure and Scatter3d from here)

from sklearn.decomposition import PCA # For performing principal component analysis

# Set plotly up for offline use inside the notebook
from plotly.graph_objs import Scatter, Layout
plotly.offline.init_notebook_mode(connected=True)
from plotly.graph_objs import Box
from plotly.graph_objs import Histogram

Pandas imported successfully, version:0.25.1
Statsmodels imported successfully, version:0.10.1
SciPy imported successfully, version: 1.3.1
Plotly imported successfully, version: 4.4.1


In [2]:
# Import the dataset.
# The original location is written as a raw string: the previous literal mixed an escaped "\\"
# with unescaped sequences such as "\R" and "\D", which are invalid escapes that Python warns about.
import os

DATA_FILE_NAME = 'StudentsPerformance.csv'
DATA_PATH = r'C:\Users\Rimante\Documents\GitHub\Pirmas_projektas\StudentsPerformance.csv'

# Fall back to a copy in the working directory so the notebook can also run on other machines.
if not os.path.exists(DATA_PATH):
    DATA_PATH = DATA_FILE_NAME

data_all = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first five rows to check that the file was parsed as expected
data_all.head(n=5)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [4]:
# Preview the last five rows as well
data_all.tail(n=5)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77
999,female,group D,some college,free/reduced,none,77,86,86


In [5]:
# More detailed info about the dataset (column dtypes, non-null counts, memory usage).
# DataFrame.info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() only appended a stray "None" line to the output.
data_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
gender                         1000 non-null object
race/ethnicity                 1000 non-null object
parental level of education    1000 non-null object
lunch                          1000 non-null object
test preparation course        1000 non-null object
math score                     1000 non-null int64
reading score                  1000 non-null int64
writing score                  1000 non-null int64
dtypes: int64(3), object(5)
memory usage: 62.6+ KB
None


8 columns:
5 of discrete nominal data (object),
3 of continuous data (int64). No missing values.

In [6]:
# Print the distinct values of every discrete nominal column, one array per column
nominal_columns = [
    'gender',
    'race/ethnicity',
    'parental level of education',
    'lunch',
    'test preparation course',
]
for column_name in nominal_columns:
    print(data_all[column_name].unique())

['female' 'male']
['group B' 'group C' 'group A' 'group D' 'group E']
["bachelor's degree" 'some college' "master's degree" "associate's degree"
 'high school' 'some high school']
['standard' 'free/reduced']
['none' 'completed']


Variables "gender", "lunch" and "test preparation course" have 2 unique values each;
variable "race/ethnicity" has 5 and "parental level of education" - 6 unique values.

In [7]:
# Print how often each distinct value occurs in every discrete nominal column
nominal_columns = [
    'gender',
    'race/ethnicity',
    'parental level of education',
    'lunch',
    'test preparation course',
]
for column_name in nominal_columns:
    print(data_all[column_name].value_counts())

female    518
male      482
Name: gender, dtype: int64
group C    319
group D    262
group B    190
group E    140
group A     89
Name: race/ethnicity, dtype: int64
some college          226
associate's degree    222
high school           196
some high school      179
bachelor's degree     118
master's degree        59
Name: parental level of education, dtype: int64
standard        645
free/reduced    355
Name: lunch, dtype: int64
none         642
completed    358
Name: test preparation course, dtype: int64


In [58]:
# Summary statistics of the numeric (exam score) columns, rounded to one decimal place
score_summary = data_all.describe()
score_summary.round(1)

Unnamed: 0,math score,reading score,writing score
count,1000.0,1000.0,1000.0
mean,66.1,69.2,68.1
std,15.2,14.6,15.2
min,0.0,17.0,10.0
25%,57.0,59.0,57.8
50%,66.0,70.0,69.0
75%,77.0,79.0,79.0
max,100.0,100.0,100.0


In [98]:
# Box plot showing the score distribution of all three exams.
# One Box trace is built per exam; the label is the trace name shown in the legend.
exam_columns = [
    ("Math", 'math score'),
    ("Reading", 'reading score'),
    ("Writing", 'writing score'),
]
to_plot = [
    Box(y=data_all[column], name=label, showlegend=True)
    for label, column in exam_columns
]

# Layout: descriptions of the plot's style.
# Axis title fonts are given as title=dict(text=..., font=...) instead of the
# deprecated `titlefont` attribute.
layout = Layout(
    title="Total exam scores",
    xaxis=dict(
        title=dict(text="Exam type", font=dict(size=18, color='#7f0000'))
    ),
    yaxis=dict(
        title=dict(text="Score", font=dict(size=18, color='#007f7f'))
    ),
)

plotly.offline.iplot({
    "data": to_plot,
    "layout": layout
})

In [99]:
# Histogram of all exam scores (to check whether the data is normal).
# One Histogram trace is built per exam; the label is the trace name shown in the legend.
exam_columns = [
    ("Math", 'math score'),
    ("Reading", 'reading score'),
    ("Writing", 'writing score'),
]
to_plot = [
    Histogram(x=data_all[column], name=label, showlegend=True)
    for label, column in exam_columns
]

# Layout: descriptions of the plot's style.
# The x axis carries the scores of all three exams, so it is titled "Score"
# (it was previously labelled "Writing score").
# Axis title fonts are given as title=dict(text=..., font=...) instead of the
# deprecated `titlefont` attribute.
layout = Layout(
    title="Histogram of student scores",
    xaxis=dict(
        title=dict(text="Score", font=dict(size=18, color='#7f0000'))
    ),
    yaxis=dict(
        title=dict(text="Counts", font=dict(size=18, color='#007f7f'))
    ),
)

plotly.offline.iplot({
    "data": to_plot,
    "layout": layout
})

In [10]:
# Pearson correlation between the math and writing scores
(r, p) = scipy.stats.pearsonr(data_all["math score"], data_all["writing score"])
correlation_report = "Pearson R value is: %.5f"%r
correlation_report += " with a p-value of: %.5e"%p
print(correlation_report)

Pearson R value is: 0.80264 with a p-value of: 3.37603e-226


In [11]:
# Linear regression of the writing score on the math score, drawn over the raw data.
# linregress is called through scipy.stats (as the t-test cells of this notebook do) rather than
# through the bare `stats` name, which is only in scope as a by-product of `from scipy.stats import *`.
x_column = "math score"
y_column = "writing score"
slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(data_all[x_column], data_all[y_column])
print("Pearson R value is: %.5f"%r_value + " with a p-value of: %.5e"%p_value)

# trace0: the observed (x, y) pairs, drawn as markers
trace0 = Scatter(
    x = data_all[x_column],
    y = data_all[y_column],
    mode = 'markers',
    name = 'Math and writing scores relationship',
)

# Fitted value of the regression line at every observed x
line = slope*data_all[x_column]+intercept
# trace1: the fitted line
trace1 = Scatter(
    x = data_all[x_column],
    y = line,
    mode = 'lines',
    name = 'Line fit'
)

# layout: titles of the figure and of both axes
layout = Layout(
    title="Math and writing scores relationship",
    xaxis=dict(title="math score"),
    yaxis=dict(title="writing score"),
)
plotly.offline.iplot({"data": [trace0, trace1],"layout": layout})

Pearson R value is: 0.80264 with a p-value of: 3.37603e-226


In [12]:
# Pearson correlation between the math and reading scores
(r, p) = scipy.stats.pearsonr(data_all["math score"], data_all["reading score"])
correlation_report = "Pearson R value is: %.5f"%r
correlation_report += " with a p-value of: %.5e"%p
print(correlation_report)

Pearson R value is: 0.81758 with a p-value of: 1.78775e-241


In [13]:
# Linear regression of the reading score on the math score, drawn over the raw data.
# linregress is called through scipy.stats (as the t-test cells of this notebook do) rather than
# through the bare `stats` name, which is only in scope as a by-product of `from scipy.stats import *`.
x_column = "math score"
y_column = "reading score"
slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(data_all[x_column], data_all[y_column])
print("Pearson R value is: %.5f"%r_value + " with a p-value of: %.5e"%p_value)

# trace0: the observed (x, y) pairs, drawn as markers
trace0 = Scatter(
    x = data_all[x_column],
    y = data_all[y_column],
    mode = 'markers',
    name = 'Math and reading scores relationship',
)

# Fitted value of the regression line at every observed x
line = slope*data_all[x_column]+intercept
# trace1: the fitted line
trace1 = Scatter(
    x = data_all[x_column],
    y = line,
    mode = 'lines',
    name = 'Line fit'
)

# layout: titles of the figure and of both axes
layout = Layout(
    title="Math and reading scores relationship",
    xaxis=dict(title="math score"),
    yaxis=dict(title="reading score"),
)
plotly.offline.iplot({"data": [trace0, trace1],"layout": layout})

Pearson R value is: 0.81758 with a p-value of: 1.78775e-241


In [14]:
# Pearson correlation between the reading and writing scores
(r, p) = scipy.stats.pearsonr(data_all["reading score"], data_all["writing score"])
correlation_report = "Pearson R value is: %.5f"%r
correlation_report += " with a p-value of: %.5e"%p
print(correlation_report)

Pearson R value is: 0.95460 with a p-value of: 0.00000e+00


In [15]:
# Linear regression of the writing score on the reading score, drawn over the raw data.
# linregress is called through scipy.stats (as the t-test cells of this notebook do) rather than
# through the bare `stats` name, which is only in scope as a by-product of `from scipy.stats import *`.
x_column = "reading score"
y_column = "writing score"
slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(data_all[x_column], data_all[y_column])
print("Pearson R value is: %.5f"%r_value + " with a p-value of: %.5e"%p_value)

# trace0: the observed (x, y) pairs, drawn as markers
trace0 = Scatter(
    x = data_all[x_column],
    y = data_all[y_column],
    mode = 'markers',
    name = 'Reading and writing scores relationship',
)

# Fitted value of the regression line at every observed x
line = slope*data_all[x_column]+intercept
# trace1: the fitted line
trace1 = Scatter(
    x = data_all[x_column],
    y = line,
    mode = 'lines',
    name = 'Line fit'
)

# layout: titles of the figure and of both axes
layout = Layout(
    title="Reading and writing scores relationship",
    xaxis=dict(title="reading score"),
    yaxis=dict(title="writing score"),
)
plotly.offline.iplot({"data": [trace0, trace1],"layout": layout})

Pearson R value is: 0.95460 with a p-value of: 0.00000e+00


The correlations between all pairs of exams are quite high, meaning that better-prepared students tend to get higher grades in all exams. Moreover, there is a very strong correlation between reading and writing scores.

In [16]:
# Compare the exam scores of the two lunch types
# Rows of students with the "standard" lunch
lunch_st_rows = data_all["lunch"].eq("standard")
lunch_st = data_all.loc[lunch_st_rows]

# Rows of students with the "free/reduced" lunch
lunch_red_rows = data_all["lunch"].eq("free/reduced")
lunch_red = data_all.loc[lunch_red_rows]

print("Exams results comparison based on lunch type:")
# Independent two-sample t-test per exam; only the p-value is reported
p_value_labels = [
    ('math score', "P-Value for the Math score: "),
    ('reading score', "P-Value for the Reading score: "),
    ('writing score', "P-Value for the Writing score: "),
]
for score_column, label in p_value_labels:
    (t, p) = scipy.stats.ttest_ind(lunch_st[score_column], lunch_red[score_column])
    print(label+str(p))

Exams results comparison based on lunch type:
P-Value for the Math score: 2.4131955993137074e-30
P-Value for the Reading score: 2.0027966545279011e-13
P-Value for the Writing score: 3.186189583166477e-15


As all P-Values are very small, let's draw a boxplot to see the visual distribution.

In [100]:
# Box plots of every exam score, split by lunch type.
# For each exam two Box traces are appended: standard lunch (dark red) and free/reduced lunch (teal).
to_plot = []
for exam_label, score_column in [("Math", 'math score'), ("Reading", 'reading score'), ("Writing", 'writing score')]:
    to_plot.append(Box( y=lunch_st[score_column], name=exam_label+"; standard lunch", showlegend=False, marker_color = '#7f0000' ) )
    to_plot.append(Box( y=lunch_red[score_column], name=exam_label+"; free/reduced lunch", showlegend=False, marker_color = '#007f7f' ) )

# Layout: descriptions of the plot's style.
# Axis title fonts are given as title=dict(text=..., font=...) instead of the
# deprecated `titlefont` attribute.
layout = Layout(
    title="Exam scores based on lunch type",
    xaxis=dict(
        title=dict(text="Exam type", font=dict(size=18))
    ),
    yaxis=dict(
        title=dict(text="Score", font=dict(size=18, color='#007f7f'))
    ),
)

plotly.offline.iplot({
    "data": to_plot,
    "layout": layout
})

Students with a standard lunch show better results in all exams.

In [18]:
# Compare the exam scores of male and female students
# Rows of male students
male_rows = data_all["gender"].eq("male")
male = data_all.loc[male_rows]

# Rows of female students
female_rows = data_all["gender"].eq("female")
female = data_all.loc[female_rows]

print("Exams results comparison based on gender:")
# Independent two-sample t-test per exam (male vs. female); only the p-value is reported
p_value_labels = [
    ('math score', "P-Value for the Math score: "),
    ('reading score', "P-Value for the Reading score: "),
    ('writing score', "P-Value for the Writing score: "),
]
for score_column, label in p_value_labels:
    (t, p) = scipy.stats.ttest_ind(male[score_column], female[score_column])
    print(label+str(p))

Exams results comparison based on gender:
P-Value for the Math score: 9.120185549328822e-08
P-Value for the Reading score: 4.680538743933289e-15
P-Value for the Writing score: 2.019877706867934e-22


In [101]:
# Box plots of every exam score, split by gender.
# For each exam two Box traces are appended: male (dark red) and female (teal).
to_plot = []
for exam_label, score_column in [("Math", 'math score'), ("Reading", 'reading score'), ("Writing", 'writing score')]:
    to_plot.append(Box( y=male[score_column], name=exam_label+"; male", showlegend=False, marker_color = '#7f0000' ) )
    to_plot.append(Box( y=female[score_column], name=exam_label+"; female", showlegend=False, marker_color = '#007f7f' ) )

# Layout: descriptions of the plot's style.
# Axis title fonts are given as title=dict(text=..., font=...) instead of the
# deprecated `titlefont` attribute.
layout = Layout(
    title="Exam scores based on gender",
    xaxis=dict(
        title=dict(text="Exam type", font=dict(size=18))
    ),
    yaxis=dict(
        title=dict(text="Score", font=dict(size=18, color='#007f7f'))
    ),
)

plotly.offline.iplot({
    "data": to_plot,
    "layout": layout
})

The P-Values show significant differences between genders in all exams. Interestingly, males show better results in Math, while females do better in the reading and writing exams.

In [20]:
# Compare the exam scores by completion of the test preparation course
# Rows of students who did not take the course ("none")
test_none_rows = data_all["test preparation course"].eq("none")
test_none = data_all.loc[test_none_rows]

# Rows of students who completed the course ("completed")
test_comp_rows = data_all["test preparation course"].eq("completed")
test_comp = data_all.loc[test_comp_rows]

print("Exams results comparison based on completion of a preparation test:")
# Independent two-sample t-test per exam; only the p-value is reported
p_value_labels = [
    ('math score', "P-Value for the Math score: "),
    ('reading score', "P-Value for the Reading score: "),
    ('writing score', "P-Value for the Writing score: "),
]
for score_column, label in p_value_labels:
    (t, p) = scipy.stats.ttest_ind(test_none[score_column], test_comp[score_column])
    print(label+str(p))

Exams results comparison based on completion of a preparation test:
P-Value for the Math score: 1.5359134607147415e-08
P-Value for the Reading score: 9.081783336892205e-15
P-Value for the Writing score: 3.68529173524572e-24


In [102]:
# Box plots of every exam score, split by completion of the test preparation course.
# For each exam two Box traces are appended: no preparation (dark red) and completed preparation (teal).
# All traces follow the same "<Exam>; -preparation" / "<Exam>; +preparation" naming; the Math
# trace was previously labelled "Math; test: +preparation", unlike the other five.
to_plot = []
for exam_label, score_column in [("Math", 'math score'), ("Reading", 'reading score'), ("Writing", 'writing score')]:
    to_plot.append(Box( y=test_none[score_column], name=exam_label+"; -preparation", showlegend=False, marker_color = '#7f0000' ) )
    to_plot.append(Box( y=test_comp[score_column], name=exam_label+"; +preparation", showlegend=False, marker_color = '#007f7f' ) )

# Layout: descriptions of the plot's style.
# Axis title fonts are given as title=dict(text=..., font=...) instead of the
# deprecated `titlefont` attribute.
layout = Layout(
    title="Exam scores based on completion of test preparation course",
    xaxis=dict(
        title=dict(text="Exam type", font=dict(size=18))
    ),
    yaxis=dict(
        title=dict(text="Score", font=dict(size=18, color='#007f7f'))
    ),
)

plotly.offline.iplot({
    "data": to_plot,
    "layout": layout
})

Students who completed the test preparation course get higher scores in all exams, with the biggest score increase in the writing exam.

In [84]:
# Exam scores (mean, sd) by parental education level.
# The score columns are selected with a list: indexing a groupby with a bare tuple of
# column names ("['a', 'b', 'c']" without the inner brackets) is deprecated in pandas.
score_columns = ['math score', 'reading score', 'writing score']
data_all.groupby(['parental level of education'])[score_columns].agg(['mean', 'std']).round(1)

Unnamed: 0_level_0,math score,math score,reading score,reading score,writing score,writing score
Unnamed: 0_level_1,mean,std,mean,std,mean,std
parental level of education,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
associate's degree,67.9,15.1,70.9,13.9,69.9,14.3
bachelor's degree,69.4,14.9,73.0,14.3,73.4,14.7
high school,62.1,14.5,64.7,14.1,62.4,14.1
master's degree,69.7,15.2,75.4,13.8,75.7,13.7
some college,67.1,14.3,69.5,14.1,68.8,15.0
some high school,63.5,15.9,66.9,15.5,64.9,15.7


Largest difference can be seen in writing scores. Let's check if these differences are statistically significant.

In [79]:
# Histogram of students' writing scores per parental education level (to check whether the data is normal).
# One Histogram trace is built per education level, in the listed order; the level is the trace name.
education_levels = [
    "associate's degree",
    "bachelor's degree",
    "high school",
    "master's degree",
    "some college",
    "some high school",
]
to_plot = [
    Histogram(
        x=data_all.loc[data_all['parental level of education'] == level, 'writing score'],
        name=level,
        showlegend=True,
    )
    for level in education_levels
]

# Layout: descriptions of the plot's style.
# Axis title fonts are given as title=dict(text=..., font=...) instead of the
# deprecated `titlefont` attribute.
layout = Layout(
    title="Histogram of students writing scores based on their parents education level",
    xaxis=dict(
        title=dict(text="Writing score", font=dict(size=18, color='#7f0000'))
    ),
    yaxis=dict(
        title=dict(text="Counts", font=dict(size=18, color='#007f7f'))
    ),
)

plotly.offline.iplot({
    "data": to_plot,
    "layout": layout
})

In [71]:
# The histograms were judged to be normally distributed, so a one-way ANOVA is run
# on the writing scores of the six parental-education groups.
# f_oneway is called through scipy.stats (as the t-test cells of this notebook do) rather than
# through the bare `stats` name, which is only in scope as a by-product of `from scipy.stats import *`.
education_levels = [
    "associate's degree",
    "bachelor's degree",
    "high school",
    "master's degree",
    "some college",
    "some high school",
]
writing_by_education = [
    data_all.loc[data_all['parental level of education'] == level, 'writing score']
    for level in education_levels
]
scipy.stats.f_oneway(*writing_by_education)

F_onewayResult(statistic=14.442416127574988, pvalue=1.1202799969771148e-13)

In [97]:
# The ANOVA is significant; a Tukey HSD test shows between which education groups the writing scores differ
from statsmodels.stats.multicomp import MultiComparison, pairwise_tukeyhsd

mc = MultiComparison(
    data=data_all['writing score'],
    groups=data_all['parental level of education'],
)
mc_results = mc.tukeyhsd()
print(mc_results)

            Multiple Comparison of Means - Tukey HSD, FWER=0.05             
      group1             group2      meandiff p-adj   lower    upper  reject
----------------------------------------------------------------------------
associate's degree bachelor's degree    3.485 0.2988  -1.2997  8.2696  False
associate's degree       high school  -7.4474  0.001 -11.5637 -3.3311   True
associate's degree   master's degree   5.7816 0.0797  -0.3699  11.933  False
associate's degree      some college  -1.0557    0.9  -5.0243  2.9129  False
associate's degree  some high school  -5.0081 0.0095   -9.227 -0.7893   True
 bachelor's degree       high school -10.9324  0.001 -15.8259 -6.0389   True
 bachelor's degree   master's degree   2.2966    0.9  -4.3998  8.9931  False
 bachelor's degree      some college  -4.5406 0.0728  -9.3105  0.2293  False
 bachelor's degree  some high school  -8.4931  0.001 -13.4732  -3.513   True
       high school   master's degree   13.229  0.001   6.9925 19.4655   True

Based on given P-Values we can see that students whose parents did not have any additional education after high school (categories "high school" and "some high school") got lower grades during writing exams compared to almost all other groups.

In [85]:
# Exam scores (mean, sd) by students' race/ethnicity.
# The score columns are selected with a list: indexing a groupby with a bare tuple of
# column names ("['a', 'b', 'c']" without the inner brackets) is deprecated in pandas.
score_columns = ['math score', 'reading score', 'writing score']
data_all.groupby(['race/ethnicity'])[score_columns].agg(['mean', 'std']).round(1)

Unnamed: 0_level_0,math score,math score,reading score,reading score,writing score,writing score
Unnamed: 0_level_1,mean,std,mean,std,mean,std
race/ethnicity,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
group A,61.6,14.5,64.7,15.5,62.7,15.5
group B,63.5,15.5,67.4,15.2,65.6,15.6
group C,64.5,14.9,69.1,14.0,67.8,15.0
group D,67.4,13.8,70.0,13.9,70.1,14.4
group E,73.8,15.5,73.0,14.9,71.4,15.1


This time largest difference can be seen in Math scores. Let's check if these differences are statistically significant.

In [80]:
# Histogram of math scores for each race/ethnicity group.
# One Histogram trace is built per group, in the listed order; the group is the trace name.
ethnicity_groups = ['group A', 'group B', 'group C', 'group D', 'group E']
to_plot = [
    Histogram(
        x=data_all.loc[data_all['race/ethnicity'] == group, 'math score'],
        name=group,
        showlegend=True,
    )
    for group in ethnicity_groups
]

# Layout: descriptions of the plot's style.
# Axis title fonts are given as title=dict(text=..., font=...) instead of the
# deprecated `titlefont` attribute.
layout = Layout(
    title="Histogram plot of Math score based on students race/ethnicity",
    xaxis=dict(
        title=dict(text="Math score", font=dict(size=18, color='#7f0000'))
    ),
    yaxis=dict(
        title=dict(text="Counts", font=dict(size=18, color='#007f7f'))
    ),
)

plotly.offline.iplot({
    "data": to_plot,
    "layout": layout
})

In [81]:
# All groups were judged to be normally distributed, so a one-way ANOVA is run
# on the math scores of the five race/ethnicity groups.
# f_oneway is called through scipy.stats (as the t-test cells of this notebook do) rather than
# through the bare `stats` name, which is only in scope as a by-product of `from scipy.stats import *`.
ethnicity_groups = ['group A', 'group B', 'group C', 'group D', 'group E']
math_by_ethnicity = [
    data_all.loc[data_all['race/ethnicity'] == group, 'math score']
    for group in ethnicity_groups
]
scipy.stats.f_oneway(*math_by_ethnicity)

F_onewayResult(statistic=14.593885166332635, pvalue=1.3732194030370688e-11)

In [70]:
# The ANOVA is significant; a Tukey HSD test shows between which race/ethnicity groups the Math scores differ
mc = MultiComparison(
    data=data_all['math score'],
    groups=data_all['race/ethnicity'],
)
mc_results = mc.tukeyhsd()
print(mc_results)

 Multiple Comparison of Means - Tukey HSD, FWER=0.05  
 group1  group2 meandiff p-adj   lower   upper  reject
------------------------------------------------------
group A group B   1.8234 0.8597   -3.36  7.0068  False
group A group C   2.8347 0.4966 -2.0028  7.6723  False
group A group D   5.7334 0.0138  0.7824 10.6844   True
group A group E  12.1922  0.001  6.7215 17.6629   True
group B group C   1.0113    0.9 -2.6867  4.7094  False
group B group D     3.91 0.0441  0.0647  7.7552   True
group B group E  10.3688  0.001  5.8741 14.8635   True
group C group D   2.8986 0.1287 -0.4659  6.2632  False
group C group E   9.3575  0.001  5.2665 13.4485   True
group D group E   6.4588  0.001  2.2343 10.6834   True
------------------------------------------------------


We can see that students from group E have significantly higher Math scores when compared to all other groups, as well as students from group D (with the exception when compared to group C students).

In [94]:
# In the bivariate analysis of all categorical variables, clusters could be seen only when the
# separation was based on "gender". Therefore the PCA analysis is shown for this variable.

# Before doing a PCA, recall the mean and sd of every exam for males and females separately.
# The score columns are selected with a list: indexing a groupby with a bare tuple of
# column names ("['a', 'b', 'c']" without the inner brackets) is deprecated in pandas.
score_columns = ['math score', 'reading score', 'writing score']
data_all.groupby(['gender'])[score_columns].agg(['mean', 'std']).round(1)

Unnamed: 0_level_0,math score,math score,reading score,reading score,writing score,writing score
Unnamed: 0_level_1,mean,std,mean,std,mean,std
gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
female,63.6,15.5,72.6,14.4,72.5,14.8
male,68.7,14.4,65.5,13.9,63.3,14.1


In [60]:
# Create a smaller dataset holding only the gender column and the three exam scores.
# Columns are selected by name (the same four columns as positions 0, 5, 6 and 7), and
# .copy() makes the result an independent DataFrame, so that the later
# `data_cluster['gender'] = ...` assignments do not trigger pandas' SettingWithCopyWarning.
cluster_columns = ['gender', 'math score', 'reading score', 'writing score']
data_cluster = data_all[cluster_columns].copy()
data_cluster.head()

Unnamed: 0,gender,math score,reading score,writing score
0,female,72,72,74
1,female,69,90,88
2,female,90,95,93
3,male,47,57,44
4,male,76,78,75


In [87]:
import plotly.figure_factory as ff

# Scatter-plot matrix of the columns of data_cluster, indexed by gender, with histograms on the diagonal
fig = ff.create_scatterplotmatrix(
    data_cluster,
    index='gender',
    diag='histogram',
    height=800,
    width=800,
)
plotly.offline.iplot(fig, filename='Histograms along Diagonal Subplots')

In [93]:
# Two-dimensional PCA plot of the exam scores, coloured by gender
# The first column (gender) contains text, so only the columns after it go into the PCA
X_reduced = PCA(n_components=2).fit_transform(data_cluster.iloc[:, 1:])

# Integer code per gender value, used only to colour the markers.
# The codes are computed without writing back into data_cluster, so the cell does not
# modify a frame that other cells display.
cols = pd.Categorical(data_cluster['gender']).codes

trace = Scatter(
    x=X_reduced[:, 0],
    y=X_reduced[:, 1],
    mode='markers',
    marker=dict(size=6, color=cols, opacity=0.8),
)

# A 2-D Scatter trace takes its axis titles from layout.xaxis / layout.yaxis;
# layout.scene only applies to 3-D traces, so titles placed there were not drawn.
layout = Layout(
    title='First two PCA directions',
    xaxis=dict(title='1st eigenvector'),
    yaxis=dict(title='2nd eigenvector'),
)
fig = Figure(data=[trace], layout=layout)
plotly.offline.iplot(fig)

In [56]:
# Three-dimensional PCA plot of the exam scores, coloured by gender
# The first column (gender) contains text, so only the columns after it go into the PCA
X_reduced = PCA(n_components=3).fit_transform(data_cluster.iloc[:, 1:])

# Integer code per gender value, used only to colour the markers.
# The codes are computed without writing back into data_cluster: assigning into that
# sliced frame can raise pandas' SettingWithCopyWarning and changes a frame other cells display.
cols = pd.Categorical(data_cluster['gender']).codes

trace = Scatter3d(
    x=X_reduced[:, 0],
    y=X_reduced[:, 1],
    z=X_reduced[:, 2],
    mode='markers',
    marker=dict(
        size=6,
        color=cols,
        colorscale='Viridis',
        opacity=0.8,
    ),
)

# 3-D traces are drawn in a scene, so the axis titles are set under layout.scene
layout = Layout(
    title='First three PCA directions',
    scene=dict(
        xaxis=dict(title='1st eigenvector'),
        yaxis=dict(title='2nd eigenvector'),
        zaxis=dict(title='3rd eigenvector'),
    ),
)
fig = Figure(data=[trace], layout=layout)

plotly.offline.iplot(fig)


Two clusters can be seen when comparing all three tests results for males and females separately.