In [2]:
# (Re)load the nb_black IPython extension for this kernel session.
# NOTE(review): nb_black presumably auto-formats executed cells with Black — confirm it is installed wherever this notebook runs.
%reload_ext nb_black

<IPython.core.display.Javascript object>

In [14]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import config
from scipy.stats.stats import ttest_ind

%matplotlib inline
from sqlalchemy import create_engine
import warnings

warnings.filterwarnings("ignore")

<IPython.core.display.Javascript object>

In [5]:
# Connection settings come from the local `config` module so that credentials
# are not written into the notebook; only the database name is fixed here.
postgres_user = config.user
postgres_pw = config.password
postgres_host = config.host
postgres_port = config.port
postgres_db = "studentsperformance"

# NOTE(review): user and password are interpolated into the URL verbatim; a
# value containing characters such as "@" or "/" would need URL-escaping first.
engine = create_engine(
    "postgresql://{}:{}@{}:{}/{}".format(
        postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db
    )
)

# Only a single query is needed, so release the engine's connections as soon
# as the read finishes. `finally` guarantees the cleanup also happens when the
# query raises.
try:
    ed_df = pd.read_sql_query("select * from studentsperformance", con=engine)
finally:
    engine.dispose()

<IPython.core.display.Javascript object>

## 1. Are there any differences between the genders, ethnicities and parental level of education with respect to their performances in exams?

In [12]:
# Mean of each exam score per gender.
# `numeric_only=True` keeps the aggregation on the numeric score columns;
# pandas >= 2.0 raises a TypeError when asked to average the text columns.
ed_df.groupby("gender").mean(numeric_only=True)

Unnamed: 0_level_0,math score,reading score,writing score
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,63.633205,72.608108,72.467181
male,68.728216,65.473029,63.311203


<IPython.core.display.Javascript object>

In [15]:
# Two-sample t-test, female vs. male, on the three exam scores at once.
# The result carries one statistic and one p-value per score column.
ttest_ind(
    ed_df.loc[
        ed_df["gender"].eq("female"),
        ["math score", "reading score", "writing score"],
    ],
    ed_df.loc[
        ed_df["gender"].eq("male"),
        ["math score", "reading score", "writing score"],
    ],
)

Ttest_indResult(statistic=array([-5.38324587,  7.95930801,  9.97955791]), pvalue=array([9.12018555e-08, 4.68053874e-15, 2.01987771e-22]))

<IPython.core.display.Javascript object>

In [9]:
# Number of students in each race/ethnicity group, largest group first.
ed_df.loc[:, "race/ethnicity"].value_counts()

group C    319
group D    262
group B    190
group E    140
group A     89
Name: race/ethnicity, dtype: int64

<IPython.core.display.Javascript object>

In [16]:
# Mean of each exam score per race/ethnicity group.
# `numeric_only=True` keeps the aggregation on the numeric score columns;
# pandas >= 2.0 raises a TypeError when asked to average the text columns.
ed_df.groupby("race/ethnicity").mean(numeric_only=True)

Unnamed: 0_level_0,math score,reading score,writing score
race/ethnicity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
group A,61.629213,64.674157,62.674157
group B,63.452632,67.352632,65.6
group C,64.46395,69.103448,67.827586
group D,67.362595,70.030534,70.145038
group E,73.821429,73.028571,71.407143


<IPython.core.display.Javascript object>

In [22]:
# Distinct race/ethnicity labels in order of first appearance; the pairwise
# t-test cell iterates over this array.
race_eth = pd.unique(ed_df["race/ethnicity"])
print(race_eth)


['group B' 'group C' 'group A' 'group D' 'group E']


<IPython.core.display.Javascript object>

In [24]:
# Preview the first five rows to check column names and value formats.
ed_df.iloc[:5]

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


<IPython.core.display.Javascript object>

In [29]:
# Pairwise t-tests between every combination of race/ethnicity groups, one
# exam score at a time. Pairs with a p-value below 0.05 are flagged.
# NOTE(review): no multiple-comparison correction is applied across the pairs.

for var in ["math score", "reading score", "writing score"]:
    print("------------------------------------------")
    print("Comparisons for variable: {}".format(var))
    print("------------------------------------------")
    for i, group_a in enumerate(race_eth):
        # Start after position i so each unordered pair is tested exactly once.
        for group_b in race_eth[i + 1 :]:
            scores_a = ed_df.loc[ed_df["race/ethnicity"] == group_a, var]
            scores_b = ed_df.loc[ed_df["race/ethnicity"] == group_b, var]

            # Run the test once and reuse the result for printing and flagging.
            result = ttest_ind(scores_a, scores_b)
            print("t-test between groups {0} and {1}:".format(group_a, group_b))
            print(result)

            # Compare the raw p-value. Rounding to two decimals first turned
            # p-values just under 0.05 into 0.05 and silently dropped them.
            if result.pvalue < 0.05:
                print(
                    "---> Groups {0} and {1} are significantly different".format(
                        group_a, group_b
                    )
                )
            
            


------------------------------------------
Comparisons for variable: math score
------------------------------------------
t-test between groups group B and group C:
Ttest_indResult(statistic=-0.7315669893534263, pvalue=0.4647708939167453)
t-test between groups group B and group A:
Ttest_indResult(statistic=0.9355076279747488, pvalue=0.3503415961659957)
t-test between groups group B and group D:
Ttest_indResult(statistic=-2.82845539712675, pvalue=0.004886197137104194)
---> Groups group B and group D are significantly different
t-test between groups group B and group E:
Ttest_indResult(statistic=-6.007397050552227, pvalue=5.007946047497971e-09)
---> Groups group B and group E are significantly different
t-test between groups group C and group A:
Ttest_indResult(statistic=1.5997220303217299, pvalue=0.11043810745588042)
t-test between groups group C and group D:
Ttest_indResult(statistic=-2.41858624746011, pvalue=0.015888349556016285)
---> Groups group C and group D are significantly diff

<IPython.core.display.Javascript object>

* For the Math Scores:
    * group A is significantly different from:
        * D and E
    * group B is significantly different from:
        * D and E
    * group C is significantly different from:
        * D and E
    * group D is significantly different from:
        * E
* For the Reading Scores:
    * group A is significantly different from:
        * C, D and E
    * group B is significantly different from:
        * D and E
    * group C is significantly different from:
        * E
* For the Writing Scores:
    * group A is significantly different from:
        * C, D and E
    * group B is significantly different from:
        * D and E
    * group C is significantly different from:
        * E

In [30]:
# Mean of each exam score per parental education level.
# The score columns are selected before aggregating so that `.mean()` never
# sees the text columns (pandas >= 2.0 raises a TypeError on them).
ed_df.groupby("parental level of education")[
    ["math score", "reading score", "writing score"]
].mean()

Unnamed: 0_level_0,math score,reading score,writing score
parental level of education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
associate's degree,67.882883,70.927928,69.896396
bachelor's degree,69.389831,73.0,73.381356
high school,62.137755,64.704082,62.44898
master's degree,69.745763,75.372881,75.677966
some college,67.128319,69.460177,68.840708
some high school,63.497207,66.938547,64.888268


<IPython.core.display.Javascript object>

In [32]:
# Distinct parental education levels in order of first appearance; the
# pairwise t-test cell iterates over this array.
parent_ed = pd.unique(ed_df["parental level of education"])
print(parent_ed)

["bachelor's degree" 'some college' "master's degree" "associate's degree"
 'high school' 'some high school']


<IPython.core.display.Javascript object>

In [34]:
# Pairwise t-tests between every combination of parental education levels,
# one exam score at a time. Pairs with a p-value below 0.05 are flagged.
# NOTE(review): no multiple-comparison correction is applied across the pairs.

for var in ["math score", "reading score", "writing score"]:
    print("------------------------------------------")
    print("Comparisons for variable: {}".format(var))
    print("------------------------------------------")
    for i, level_a in enumerate(parent_ed):
        # Start after position i so each unordered pair is tested exactly once.
        for level_b in parent_ed[i + 1 :]:
            scores_a = ed_df.loc[ed_df["parental level of education"] == level_a, var]
            scores_b = ed_df.loc[ed_df["parental level of education"] == level_b, var]

            # Run the test once and reuse the result for printing and flagging.
            result = ttest_ind(scores_a, scores_b)
            print("t-test between groups {0} and {1}:".format(level_a, level_b))
            print(result)

            # Compare the raw p-value. Rounding to two decimals first turned
            # p-values just under 0.05 into 0.05 and silently dropped them.
            if result.pvalue < 0.05:
                print(
                    "---> Groups {0} and {1} are significantly different".format(
                        level_a, level_b
                    )
                )

------------------------------------------
Comparisons for variable: math score
------------------------------------------
t-test between groups bachelor's degree and some college:
Ttest_indResult(statistic=1.3702352829617477, pvalue=0.17151224218471559)
t-test between groups bachelor's degree and master's degree:
Ttest_indResult(statistic=-0.14868196568849312, pvalue=0.8819757824867702)
t-test between groups bachelor's degree and associate's degree:
Ttest_indResult(statistic=0.8786641889793243, pvalue=0.38020757818036177)
t-test between groups bachelor's degree and high school:
Ttest_indResult(statistic=4.236143622484822, pvalue=2.9965345853843376e-05)
---> Groups bachelor's degree and high school are significantly different
t-test between groups bachelor's degree and some high school:
Ttest_indResult(statistic=3.196718602676573, pvalue=0.0015409192922812146)
---> Groups bachelor's degree and some high school are significantly different
t-test between groups some college and master's 

---> Groups associate's degree and high school are significantly different
t-test between groups associate's degree and some high school:
Ttest_indResult(statistic=3.3317223453475715, pvalue=0.0009435990258795855)
---> Groups associate's degree and some high school are significantly different
t-test between groups high school and some high school:
Ttest_indResult(statistic=-1.5838885273076473, pvalue=0.11406653932963291)


<IPython.core.display.Javascript object>

T-tests between these groups show that there are significant differences between each group and any education level at least two steps away in either direction. For example, there is a significant difference between the scores of students whose parents have a 'Master's' level education and the scores of students whose parents have an 'Associate's' level education or anything less than that, but there is no significant difference between the scores of students whose parents have a 'Master's' level education and the scores of students whose parents have a 'Bachelor's' level education.

## 2. Are there any differences between the lunch types with respect to their performances in exams? If there are, how do you explain this?

In [35]:
# Mean of each exam score per lunch type.
# The score columns are selected before aggregating so that `.mean()` never
# sees the text columns (pandas >= 2.0 raises a TypeError on them).
ed_df.groupby("lunch")[["math score", "reading score", "writing score"]].mean()

Unnamed: 0_level_0,math score,reading score,writing score
lunch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
free/reduced,58.921127,64.653521,63.022535
standard,70.034109,71.654264,70.823256


<IPython.core.display.Javascript object>

In [38]:
# Two-sample t-test, standard vs. free/reduced lunch, on the three exam scores
# at once. Only the p-values (one per score) are kept for the next cell.
# The score columns are named explicitly rather than picked by position, so the
# test does not depend on the column order returned by `select *`.
# The p-values are read via `.pvalue` instead of unpacking into `_`, which
# would shadow IPython's last-output variable.
pvalue = ttest_ind(
    ed_df.loc[
        ed_df["lunch"] == "standard",
        ["math score", "reading score", "writing score"],
    ],
    ed_df.loc[
        ed_df["lunch"] == "free/reduced",
        ["math score", "reading score", "writing score"],
    ],
).pvalue

<IPython.core.display.Javascript object>

In [39]:
# One boolean per exam score: True where the p-value is below 0.05.
print(np.less(pvalue, 0.05))


[ True  True  True]


<IPython.core.display.Javascript object>

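There is a significant difference between the two lunch groups in all three subjects: students with a standard lunch score higher on average. A plausible explanation is that eligibility for free/reduced lunch is a proxy for lower household income, which tends to go along with fewer educational resources; note that the t-test shows an association, not that the lunch type itself causes the difference.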

## 3. Does the test preparation course seem to have an effect on the exam performances?

In [40]:
# Mean of each exam score by test preparation course status.
# The score columns are selected before aggregating so that `.mean()` never
# sees the text columns (pandas >= 2.0 raises a TypeError on them).
ed_df.groupby("test preparation course")[
    ["math score", "reading score", "writing score"]
].mean()

Unnamed: 0_level_0,math score,reading score,writing score
test preparation course,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
completed,69.695531,73.893855,74.418994
none,64.077882,66.534268,64.504673


<IPython.core.display.Javascript object>

In [41]:
# Two-sample t-test, completed vs. no test preparation course, on the three
# exam scores at once. Only the p-values (one per score) are kept for the next cell.
# The score columns are named explicitly rather than picked by position, so the
# test does not depend on the column order returned by `select *`.
# The p-values are read via `.pvalue` instead of unpacking into `_`, which
# would shadow IPython's last-output variable.
pvalue = ttest_ind(
    ed_df.loc[
        ed_df["test preparation course"] == "completed",
        ["math score", "reading score", "writing score"],
    ],
    ed_df.loc[
        ed_df["test preparation course"] == "none",
        ["math score", "reading score", "writing score"],
    ],
).pvalue

<IPython.core.display.Javascript object>

In [42]:
# One boolean per exam score: True where the p-value is below 0.05.
print(0.05 > pvalue)

[ True  True  True]


<IPython.core.display.Javascript object>

Students who completed the test preparation course score significantly higher (p < 0.05) on average in all three subjects.

## 4. Which two exam scores are correlated the most with each other?

In [43]:
# Pairwise correlation matrix of the three exam scores.
# The columns are named explicitly rather than picked by position, so the
# result does not depend on the column order returned by `select *`.
ed_df[["math score", "reading score", "writing score"]].corr()

Unnamed: 0,math score,reading score,writing score
math score,1.0,0.81758,0.802642
reading score,0.81758,1.0,0.954598
writing score,0.802642,0.954598,1.0


<IPython.core.display.Javascript object>

Reading and writing scores are the most strongly correlated pair (r ≈ 0.95).