In [46]:
# Importing the packages used for statistical analysis and data plotting.
# The version of each main package is printed so the environment can be reproduced.
import pandas as pd
print("Pandas imported successfully, version:"+pd.__version__)
import statsmodels as sm
print("Statsmodels imported successfully, version:"+sm.__version__)

import scipy
# NOTE(review): the star import exposes the scipy.stats functions as bare names in the notebook namespace
from scipy.stats import *
print("SciPy imported successfully, version: "+scipy.__version__)

import matplotlib
import plotly
print("Plotly imported successfully, version: "+plotly.__version__)
from plotly.graph_objs import * # star import of the plot types (e.g. Figure, Scatter3d)

from sklearn.decomposition import PCA # For performing principal component analysis

# The following are necessary to run plotly in offline mode, because it is otherwise an online plotting library
from plotly.graph_objs import Scatter, Layout
plotly.offline.init_notebook_mode(connected=True)
from plotly.graph_objs import Box
from plotly.graph_objs import Histogram

Pandas imported successfully, version:0.25.1
Statsmodels imported successfully, version:0.10.1
SciPy imported successfully, version: 1.3.1
Plotly imported successfully, version: 4.4.1


In [34]:
# Importing the dataset.
# A raw string keeps the Windows backslashes literal: sequences such as "\R" or "\D"
# are invalid escapes in a normal string literal and trigger warnings on recent Python versions.
# NOTE(review): this is an absolute, machine-specific path - adjust DATA_PATH when running elsewhere.
DATA_PATH = r'C:\Users\Rimante\Documents\GitHub\Pirmas_projektas\StudentsPerformance.csv'
data_all = pd.read_csv(DATA_PATH)

In [35]:
# Preview of the first five rows of the dataset
data_all.head(n=5)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [36]:
# Preview of the last five rows of the dataset
data_all.tail(n=5)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77
999,female,group D,some college,free/reduced,none,77,86,86


In [37]:
# More detailed info about the dataset (column names, non-null counts, dtypes).
# DataFrame.info() prints its summary itself and returns None, so it must not be
# wrapped in print() - that only appends a stray "None" line to the output.
data_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
gender                         1000 non-null object
race/ethnicity                 1000 non-null object
parental level of education    1000 non-null object
lunch                          1000 non-null object
test preparation course        1000 non-null object
math score                     1000 non-null int64
reading score                  1000 non-null int64
writing score                  1000 non-null int64
dtypes: int64(3), object(5)
memory usage: 62.6+ KB
None


The dataset has 8 columns:
5 hold discrete nominal data (object),
3 hold numeric exam scores (int64). There are no missing values.

In [38]:
# Listing the distinct categories of every discrete nominal variable
nominal_columns = [
    'gender',
    'race/ethnicity',
    'parental level of education',
    'lunch',
    'test preparation course',
]
for column in nominal_columns:
    print(data_all[column].unique())

['female' 'male']
['group B' 'group C' 'group A' 'group D' 'group E']
["bachelor's degree" 'some college' "master's degree" "associate's degree"
 'high school' 'some high school']
['standard' 'free/reduced']
['none' 'completed']


Variables "gender", "lunch" and "test preparation course" have 2 unique values each;
variable "race/ethnicity" has 5 and "parental level of education" - 6 unique values.

In [39]:
# Counting how often each category occurs, for every discrete nominal variable
nominal_columns = [
    'gender',
    'race/ethnicity',
    'parental level of education',
    'lunch',
    'test preparation course',
]
for column in nominal_columns:
    print(data_all[column].value_counts())

female    518
male      482
Name: gender, dtype: int64
group C    319
group D    262
group B    190
group E    140
group A     89
Name: race/ethnicity, dtype: int64
some college          226
associate's degree    222
high school           196
some high school      179
bachelor's degree     118
master's degree        59
Name: parental level of education, dtype: int64
standard        645
free/reduced    355
Name: lunch, dtype: int64
none         642
completed    358
Name: test preparation course, dtype: int64


In [40]:
# Description of the numeric variables in the dataset:
# count, mean, standard deviation, minimum, quartiles and maximum of each exam score
data_all.describe()

Unnamed: 0,math score,reading score,writing score
count,1000.0,1000.0,1000.0
mean,66.089,69.169,68.054
std,15.16308,14.600192,15.195657
min,0.0,17.0,10.0
25%,57.0,59.0,57.75
50%,66.0,70.0,69.0
75%,77.0,79.0,79.0
max,100.0,100.0,100.0


In [41]:
# Showing the values of all three exam scores in a single box plot.

# (trace name, dataset column) for every exam
exam_columns = [
    ("Math", "math score"),
    ("Reading", "reading score"),
    ("Writing", "writing score"),
]

# One Box trace per exam; the dataset is indexed and the trace is created in one step
to_plot = [
    Box(y=data_all[column], name=label, showlegend=True)
    for label, column in exam_columns
]

# Layout: description of the plot's style.
# Axis titles are given as title=dict(text=..., font=...); the older `titlefont`
# attribute is deprecated in plotly and no longer accepted by recent releases.
layout = Layout(
    title="Total exams scores",
    xaxis=dict(
        title=dict(
            text="Exam type",
            font=dict(size=18, color='#7f0000'),
        ),
    ),
    yaxis=dict(
        title=dict(
            text="Score",
            font=dict(size=18, color='#007f7f'),
        ),
    ),
)

# This is where the plotting happens: the offline plot module of plotly receives
# the traces (to_plot) and the style description (layout).
plotly.offline.iplot({
    "data": to_plot,
    "layout": layout
})

In [159]:
# Outlier filters read off the box plot of the raw data: a row passes a filter when
# its score is not below the given threshold (measurements outside the 'whiskers' fail)
math_corrected = data_all['math score'].ge(27)
read_corrected = data_all['reading score'].ge(29)
writing_corrected = data_all['writing score'].ge(27)

# Students with the "standard" lunch type, outlier rows omitted
lunch_st_rows = data_all["lunch"].eq("standard")
lunch_st = data_all[lunch_st_rows & math_corrected & read_corrected & writing_corrected]

# Students with the "free/reduced" lunch type, outlier rows omitted
lunch_red_rows = data_all["lunch"].eq("free/reduced")
lunch_red = data_all[lunch_red_rows & math_corrected & read_corrected & writing_corrected]

# Independent two-sample t-test per exam, comparing the two lunch types
for exam, column in (("Math", "math score"), ("Reading", "reading score"), ("Writing", "writing score")):
    t, p = scipy.stats.ttest_ind(lunch_st[column], lunch_red[column])
    print("P-Value for the " + exam + " score: " + str(p))

P-Value for the Math score: 1.3787735494512243e-27
P-Value for the Reading score: 3.8517295952555e-11
P-Value for the Writing score: 8.340265947726074e-13


As all P-Values are very small, let's draw a boxplot to see the visual distribution.

In [175]:
# Box plot of every exam score, split by lunch type.

# Colours used to tell the two lunch types apart
STANDARD_COLOR = '#7f0000'
REDUCED_COLOR = '#007f7f'

# Two Box traces per exam: standard lunch first, free/reduced lunch second
to_plot = []
for exam, column in (("Math", "math score"), ("Reading", "reading score"), ("Writing", "writing score")):
    to_plot.append(Box(y=lunch_st[column], name=exam + "; standard lunch",
                       showlegend=False, marker_color=STANDARD_COLOR))
    to_plot.append(Box(y=lunch_red[column], name=exam + "; free/reduced lunch",
                       showlegend=False, marker_color=REDUCED_COLOR))

# Layout: description of the plot's style.
# Axis titles are given as title=dict(text=..., font=...); the older `titlefont`
# attribute is deprecated in plotly and no longer accepted by recent releases.
layout = Layout(
    title="Exams scores based on lunch type",
    xaxis=dict(
        title=dict(
            text="Exam type",
            font=dict(size=18),
        ),
    ),
    yaxis=dict(
        title=dict(
            text="Score",
            font=dict(size=18, color='#007f7f'),
        ),
    ),
)

# This is where the plotting happens: the offline plot module of plotly receives
# the traces (to_plot) and the style description (layout).
plotly.offline.iplot({
    "data": to_plot,
    "layout": layout
})

Students with standard lunch show better results in all exams.

In [176]:
# Students who did not take the test preparation course ("none"), outlier rows omitted
test_none_rows = data_all["test preparation course"].eq("none")
test_none = data_all[test_none_rows & math_corrected & read_corrected & writing_corrected]

# Students who completed the test preparation course ("completed"), outlier rows omitted
test_comp_rows = data_all["test preparation course"].eq("completed")
test_comp = data_all[test_comp_rows & math_corrected & read_corrected & writing_corrected]

# Independent two-sample t-test per exam, comparing the two course groups
for exam, column in (("Math", "math score"), ("Reading", "reading score"), ("Writing", "writing score")):
    t, p = scipy.stats.ttest_ind(test_none[column], test_comp[column])
    print("P-Value for the " + exam + " score: " + str(p))

P-Value for the Math score: 1.2562834777074078e-07
P-Value for the Reading score: 1.2458590488047568e-13
P-Value for the Writing score: 2.867004157881123e-23


All P-Values are <0.05 - let's look at their graphical distribution.

In [178]:
# Box plot of every exam score, split by completion of the test preparation course.

# Colours used to tell the two groups apart
NONE_COLOR = '#7f0000'
COMPLETED_COLOR = '#007f7f'

# Two Box traces per exam: course not taken first, course completed second
to_plot = []
for exam, column in (("Math", "math score"), ("Reading", "reading score"), ("Writing", "writing score")):
    to_plot.append(Box(y=test_none[column], name=exam + "; test: none",
                       showlegend=False, marker_color=NONE_COLOR))
    to_plot.append(Box(y=test_comp[column], name=exam + "; test: completed",
                       showlegend=False, marker_color=COMPLETED_COLOR))

# Layout: description of the plot's style.
# Axis titles are given as title=dict(text=..., font=...); the older `titlefont`
# attribute is deprecated in plotly and no longer accepted by recent releases.
layout = Layout(
    title="Exams scores based on completion of preparation test",
    xaxis=dict(
        title=dict(
            text="Exam type",
            font=dict(size=18),
        ),
    ),
    yaxis=dict(
        title=dict(
            text="Score",
            font=dict(size=18, color='#007f7f'),
        ),
    ),
)

# This is where the plotting happens: the offline plot module of plotly receives
# the traces (to_plot) and the style description (layout).
plotly.offline.iplot({
    "data": to_plot,
    "layout": layout
})

There is a minor upward shift in the scores of students who completed the test preparation course before the exams.

In [192]:
# Row masks, one per parental level of education
parental_education = data_all["parental level of education"]
parent_1_rows = parental_education == "some high school"
parent_2_rows = parental_education == "high school"
parent_3_rows = parental_education == "some college"
parent_4_rows = parental_education == "associate's degree"
parent_5_rows = parental_education == "bachelor's degree"
parent_6_rows = parental_education == "master's degree"

# Rows that pass all three outlier filters
score_filter = math_corrected & read_corrected & writing_corrected

# Grouping the six levels into three categories (outlier rows omitted):
# 1 - high school education: "some high school" or "high school"
combined_data_1 = data_all[(parent_1_rows | parent_2_rows) & score_filter]
# 2 - higher (non-university) education: "some college" or "associate's degree"
combined_data_2 = data_all[(parent_3_rows | parent_4_rows) & score_filter]
# 3 - completed university education: "bachelor's degree" or "master's degree"
combined_data_3 = data_all[(parent_5_rows | parent_6_rows) & score_filter]

# Pairwise independent t-tests between the three categories, for every exam
groups = {1: combined_data_1, 2: combined_data_2, 3: combined_data_3}
for exam, column in (("Math", "math score"), ("Reading", "reading score"), ("Writing", "writing score")):
    for first, second in ((1, 3), (1, 2), (2, 3)):
        t, p = scipy.stats.ttest_ind(groups[first][column], groups[second][column])
        print("P-Value for the %s (%d,%d) score: %s" % (exam, first, second, str(p)))

P-Value for the Math (1,3) score: 8.64815359837507e-06
P-Value for the Math (1,2) score: 1.2941245150901778e-05
P-Value for the Math (2,3) score: 0.22852948185340025
P-Value for the Reading (1,3) score: 1.950664185579191e-08
P-Value for the Reading (1,2) score: 1.8815743123081115e-05
P-Value for the Reading (2,3) score: 0.009083570727990153
P-Value for the Writing (1,3) score: 1.511462725383079e-13
P-Value for the Writing (1,2) score: 5.5488742909521584e-08
P-Value for the Writing (2,3) score: 0.0006300338401750443


In [191]:
# Box plot of every exam score, split by the grouped parental level of education.

# (trace name suffix, subset, colour) for the three education categories
education_groups = [
    ("High school", combined_data_1, '#7f0000'),
    ("Higher (non-university) education", combined_data_2, '#007f7f'),
    ("University education", combined_data_3, '#3a88b5'),
]

# Three Box traces per exam; the trace name starts with the exam initial (M, R, W)
to_plot = []
for initial, column in (("M", "math score"), ("R", "reading score"), ("W", "writing score")):
    for group_name, subset, color in education_groups:
        to_plot.append(Box(y=subset[column], name=initial + ", " + group_name,
                           showlegend=False, marker_color=color))

# Layout: description of the plot's style.
# The title names the grouping shown here (it used to repeat the title of the
# overall score plot). Axis titles are given as title=dict(text=..., font=...);
# the older `titlefont` attribute is deprecated in plotly.
layout = Layout(
    title="Exams scores based on parental level of education",
    xaxis=dict(
        title=dict(
            text="Exam type",
            font=dict(size=18, color='#7f0000'),
        ),
    ),
    yaxis=dict(
        title=dict(
            text="Score",
            font=dict(size=18, color='#007f7f'),
        ),
    ),
)

# This is where the plotting happens: the offline plot module of plotly receives
# the traces (to_plot) and the style description (layout).
plotly.offline.iplot({
    "data": to_plot,
    "layout": layout
})

In [51]:
# Pearson correlation between the math and writing scores
r, p = scipy.stats.pearsonr(data_all["math score"], data_all["writing score"])
print("Pearson R value is: %.5f with a p-value of: %.5e" % (r, p))

Pearson R value is: 0.80264 with a p-value of: 3.37603e-226


In [52]:
# Drawing a linear regression plot: writing score as a function of math score.
# linregress is called through the public scipy.stats namespace; the bare name
# `stats` is only available as a side effect of the star import (it is the
# scipy.stats.stats submodule, which newer SciPy releases deprecate).
slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(data_all["math score"], data_all["writing score"])
print("Pearson R value is: %.5f"%r_value + " with a p-value of: %.5e"%p_value)

# trace0 holds the observed data points (x and y values) shown as markers
trace0 = Scatter(
    x = data_all["math score"],
    y = data_all["writing score"],
    mode = 'markers',
    name = 'Math and writing scores relationship',
)

# Fitted values of the regression line, evaluated at the observed x values
line = slope*data_all["math score"]+intercept
# trace1 holds the data for plotting the fitted line
trace1 = Scatter(
    x = data_all["math score"],
    y = line,
    mode = 'lines',
    name = 'Line fit'
)

# layout holds the description of the plot's style (title and axis labels)
layout = Layout(title="Math and writing scores relationship",xaxis=dict(title="math score"),yaxis=dict(title="writing score"),)
plotly.offline.iplot({"data": [trace0, trace1],"layout": layout})

Pearson R value is: 0.80264 with a p-value of: 3.37603e-226


In [55]:
# Pearson correlation between the math and reading scores
r, p = scipy.stats.pearsonr(data_all["math score"], data_all["reading score"])
print("Pearson R value is: %.5f with a p-value of: %.5e" % (r, p))

Pearson R value is: 0.81758 with a p-value of: 1.78775e-241


In [57]:
# Drawing a linear regression plot: reading score as a function of math score.
# linregress is called through the public scipy.stats namespace; the bare name
# `stats` is only available as a side effect of the star import (it is the
# scipy.stats.stats submodule, which newer SciPy releases deprecate).
slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(data_all["math score"], data_all["reading score"])
print("Pearson R value is: %.5f"%r_value + " with a p-value of: %.5e"%p_value)

# trace0 holds the observed data points (x and y values) shown as markers
trace0 = Scatter(
    x = data_all["math score"],
    y = data_all["reading score"],
    mode = 'markers',
    name = 'Math and reading scores relationship',
)

# Fitted values of the regression line, evaluated at the observed x values
line = slope*data_all["math score"]+intercept
# trace1 holds the data for plotting the fitted line
trace1 = Scatter(
    x = data_all["math score"],
    y = line,
    mode = 'lines',
    name = 'Line fit'
)

# layout holds the description of the plot's style (title and axis labels)
layout = Layout(title="Math and reading scores relationship",xaxis=dict(title="math score"),yaxis=dict(title="reading score"),)
plotly.offline.iplot({"data": [trace0, trace1],"layout": layout})

Pearson R value is: 0.81758 with a p-value of: 1.78775e-241


In [59]:
# Pearson correlation between the reading and writing scores
r, p = scipy.stats.pearsonr(data_all["reading score"], data_all["writing score"])
print("Pearson R value is: %.5f with a p-value of: %.5e" % (r, p))

Pearson R value is: 0.95460 with a p-value of: 0.00000e+00


In [61]:
# Drawing a linear regression plot: writing score as a function of reading score.
# linregress is called through the public scipy.stats namespace; the bare name
# `stats` is only available as a side effect of the star import (it is the
# scipy.stats.stats submodule, which newer SciPy releases deprecate).
slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(data_all["reading score"], data_all["writing score"])
print("Pearson R value is: %.5f"%r_value + " with a p-value of: %.5e"%p_value)

# trace0 holds the observed data points (x and y values) shown as markers
trace0 = Scatter(
    x = data_all["reading score"],
    y = data_all["writing score"],
    mode = 'markers',
    name = 'Reading and writing scores relationship',
)

# Fitted values of the regression line, evaluated at the observed x values
line = slope*data_all["reading score"]+intercept
# trace1 holds the data for plotting the fitted line
trace1 = Scatter(
    x = data_all["reading score"],
    y = line,
    mode = 'lines',
    name = 'Line fit'
)

# layout holds the description of the plot's style (title and axis labels)
layout = Layout(title="Reading and writing scores relationship",xaxis=dict(title="reading score"),yaxis=dict(title="writing score"),)
plotly.offline.iplot({"data": [trace0, trace1],"layout": layout})

Pearson R value is: 0.95460 with a p-value of: 0.00000e+00


It can be seen that the correlations between all exam scores are quite high, meaning that better prepared students tend to get higher grades in all exams. Moreover, there is a very strong correlation between the reading and writing scores.

In [65]:
# Summary statistics (categorical and numeric columns) for male students only
male_rows = data_all['gender'] == "male"
data_male = data_all[male_rows]

# Names used by the earlier version of this cell, kept bound for backward compatibility.
# `standard` was a misleading name here: the subset holds male students, not a lunch type.
idx = male_rows
standard = data_male

data_male.describe(include='all')

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
count,482,482,482,482,482,482.0,482.0,482.0
unique,1,5,6,2,2,,,
top,male,group C,some college,standard,none,,,
freq,482,139,108,316,308,,,
mean,,,,,,68.728216,65.473029,63.311203
std,,,,,,14.356277,13.931832,14.113832
min,,,,,,27.0,23.0,15.0
25%,,,,,,59.0,56.0,53.0
50%,,,,,,69.0,66.0,64.0
75%,,,,,,79.0,75.0,73.75


In [66]:
# Summary statistics (categorical and numeric columns) for female students only
female_rows = data_all['gender'] == "female"
data_female = data_all[female_rows]

# Names used by the earlier version of this cell, kept bound for backward compatibility.
# `free` was a misleading name here: the subset holds female students, not a lunch type.
idx = female_rows
free = data_female

data_female.describe(include='all')

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
count,518,518,518,518,518,518.0,518.0,518.0
unique,1,5,6,2,2,,,
top,female,group C,some college,standard,none,,,
freq,518,180,118,329,334,,,
mean,,,,,,63.633205,72.608108,72.467181
std,,,,,,15.491453,14.378245,14.844842
min,,,,,,0.0,17.0,10.0
25%,,,,,,54.0,63.25,64.0
50%,,,,,,65.0,73.0,74.0
75%,,,,,,74.0,83.0,82.0


In [193]:
# Subset used for the scatter-plot matrix and the PCA: the gender label plus the three exam scores.
# The columns are selected by name rather than by position, so the selection does not
# silently change if the column order of the file changes. .copy() makes the subset an
# independent DataFrame that can be modified without a SettingWithCopyWarning.
data_cluster = data_all[['gender', 'math score', 'reading score', 'writing score']].copy()
data_cluster.head()

Unnamed: 0,gender,math score,reading score,writing score
0,female,72,72,74
1,female,69,90,88
2,female,90,95,93
3,male,47,57,44
4,male,76,78,75


In [194]:
import plotly.figure_factory as ff

# Scatter-plot matrix of the exam scores with histograms on the diagonal.
# The input has to be a pandas data structure containing a categorical variable
# for grouping the observations; its column name is passed as the index parameter.
fig = ff.create_scatterplotmatrix(
    data_cluster,
    index='gender',
    diag='histogram',
    height=800,
    width=800,
)
plotly.offline.iplot(fig, filename='Histograms along Diagonal Subplots')

In [195]:
# PCA with two components on the exam scores.
# The first column (gender) contains text and is therefore not selected.
X_reduced = PCA(n_components=2).fit_transform(data_cluster.iloc[:, 1:])

# Numerical colour codes for the gender variable (one integer per category).
# They are computed without writing the categorical column back into data_cluster,
# which avoids a SettingWithCopyWarning and leaves the input frame unchanged.
cols = pd.Categorical(data_cluster['gender']).codes

trace = Scatter(
    x=X_reduced[:, 0],
    y=X_reduced[:, 1],
    mode='markers',
    marker=dict(size=6, color=cols, opacity=0.8),
)

# Axis titles of a 2D plot belong directly on the layout; `scene` only configures
# 3D plots, so titles placed there are never displayed for a Scatter trace.
layout = Layout(
    title='First two PCA directions',
    xaxis=dict(title='1st eigenvector'),
    yaxis=dict(title='2nd eigenvector'),
)
fig = Figure(data=[trace], layout=layout)
plotly.offline.iplot(fig)

In [196]:
# PCA with three components on the exam scores.
# The first column (gender) contains text and is therefore not selected.
X_reduced = PCA(n_components=3).fit_transform(data_cluster.iloc[:, 1:])

# Numerical colour codes for the gender variable (one integer per category).
# They are computed without writing the categorical column back into data_cluster,
# which avoids a SettingWithCopyWarning and leaves the input frame unchanged.
cols = pd.Categorical(data_cluster['gender']).codes

trace = Scatter3d(
    x=X_reduced[:, 0],
    y=X_reduced[:, 1],
    z=X_reduced[:, 2],
    mode='markers',
    marker=dict(
        size=6,
        color=cols,
        colorscale='Viridis',
        opacity=0.8,
    ),
)

# For a 3D plot the axis titles are configured through the layout's `scene`
layout = Layout(
    title='First three PCA directions',
    scene=dict(
        xaxis=dict(title='1st eigenvector'),
        yaxis=dict(title='2nd eigenvector'),
        zaxis=dict(title='3rd eigenvector'),
    ),
)
fig = Figure(data=[trace], layout=layout)

plotly.offline.iplot(fig)
