In [1]:
import os
import warnings

import numpy as np
import pandas as pd
from scipy.stats.stats import ttest_ind
from sqlalchemy import create_engine

warnings.filterwarnings('ignore')

In [2]:
# Connection settings for the PostgreSQL instance that holds the data set.
# Each value can be overridden through an environment variable so that
# credentials do not have to be edited into (and committed with) the notebook.
# NOTE(review): the fallbacks are the values that were originally hardcoded
# here; consider removing the password fallback once STUDENTS_DB_PW is set in
# your environment.
postgres_user = os.environ.get('STUDENTS_DB_USER', 'dsbc_student')
postgres_pw = os.environ.get('STUDENTS_DB_PW', '7*.8G9QH21')
postgres_host = os.environ.get('STUDENTS_DB_HOST', '142.93.121.174')
postgres_port = os.environ.get('STUDENTS_DB_PORT', '5432')
postgres_db = os.environ.get('STUDENTS_DB_NAME', 'studentsperformance')

engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db))

# Load the whole table into memory; dispose of the engine's connection pool
# afterwards even when the query raises, so no connection is left open.
try:
    performance = pd.read_sql_query('SELECT * FROM studentsperformance', con=engine)
finally:
    engine.dispose()

### 1) Are there any differences between the genders, ethnicities, and parental level of education with respect to their performances in exams?

In [3]:
# Preview the first five rows to check the columns that were loaded.
performance.head(n=5)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [4]:
# Mean exam scores for each level of the three demographic columns.
# numeric_only=True limits the aggregation to the numeric score columns:
# the frame also holds string columns (lunch, test preparation course, ...),
# and pandas >= 2.0 raises a TypeError when asked to average those instead of
# silently dropping them as older versions did.
gender_group = performance.groupby('gender').mean(numeric_only=True)

ethnicity_group = performance.groupby('race/ethnicity').mean(numeric_only=True)

parent_group = performance.groupby('parental level of education').mean(numeric_only=True)

In [5]:
# Print the three per-category mean-score tables one after another.
for summary in (gender_group, ethnicity_group, parent_group):
    print(summary)

        math score  reading score  writing score
gender                                          
female   63.633205      72.608108      72.467181
male     68.728216      65.473029      63.311203
                math score  reading score  writing score
race/ethnicity                                          
group A          61.629213      64.674157      62.674157
group B          63.452632      67.352632      65.600000
group C          64.463950      69.103448      67.827586
group D          67.362595      70.030534      70.145038
group E          73.821429      73.028571      71.407143
                             math score  reading score  writing score
parental level of education                                          
associate's degree            67.882883      70.927928      69.896396
bachelor's degree             69.389831      73.000000      73.381356
high school                   62.137755      64.704082      62.448980
master's degree               69.745763      75.372881 

In [6]:
# For each demographic column, compare every unordered pair of its categories
# with scipy's ttest_ind, separately for each of the three exam scores.
list1 = ['gender','race/ethnicity', 'parental level of education']
for factor in list1:
    print("------------------------------------------------")
    print("{}".format(factor))
    print("------------------------------------------------")
    # Distinct categories, in order of first appearance in the data.
    list2 = performance[factor].unique()

    for score in ("math score", "reading score", "writing score"):
        print("------------------------------------------------")
        print("Comparisons for variable: {}".format(score))
        print("------------------------------------------------")
        # Pair each category only with the ones that come after it, so every
        # pair is tested exactly once.
        for position, first in enumerate(list2):
            for second in list2[position + 1:]:
                print("t-test between groups {0} and {1}:".format(first, second))
                first_scores = performance.loc[performance[factor] == first, score]
                second_scores = performance.loc[performance[factor] == second, score]
                print(ttest_ind(first_scores, second_scores))

------------------------------------------------
gender
------------------------------------------------
------------------------------------------------
Comparisons for variable: math score
------------------------------------------------
t-test between groups female and male:
Ttest_indResult(statistic=-5.383245869828983, pvalue=9.120185549328822e-08)
------------------------------------------------
Comparisons for variable: reading score
------------------------------------------------
t-test between groups female and male:
Ttest_indResult(statistic=7.959308005187657, pvalue=4.680538743933289e-15)
------------------------------------------------
Comparisons for variable: writing score
------------------------------------------------
t-test between groups female and male:
Ttest_indResult(statistic=9.979557910004507, pvalue=2.019877706867934e-22)
------------------------------------------------
race/ethnicity
------------------------------------------------
----------------------------

### 2) Are there any differences between the lunch types with respect to their performances in exams? If there are, how do you explain this?

In [7]:
# Mean exam scores per lunch type. numeric_only=True keeps the aggregation to
# the numeric score columns; without it pandas >= 2.0 raises a TypeError on
# the frame's remaining string columns.
lunch_group = performance.groupby('lunch').mean(numeric_only=True)
print(lunch_group)

              math score  reading score  writing score
lunch                                                 
free/reduced   58.921127      64.653521      63.022535
standard       70.034109      71.654264      70.823256


In [8]:
# Compare the lunch categories pairwise with scipy's ttest_ind, once per exam
# score column.
lunch = performance['lunch'].unique()
for score in ("math score", "reading score", "writing score"):
    print("------------------------------------------------")
    print("Comparisons for variable: {}".format(score))
    print("------------------------------------------------")
    # Each category is paired only with those after it: one test per pair.
    for position, first in enumerate(lunch):
        for second in lunch[position + 1:]:
            print("t-test between groups {0} and {1}:".format(first, second))
            first_scores = performance.loc[performance['lunch'] == first, score]
            second_scores = performance.loc[performance['lunch'] == second, score]
            print(ttest_ind(first_scores, second_scores))

------------------------------------------------
Comparisons for variable: math score
------------------------------------------------
t-test between groups standard and free/reduced:
Ttest_indResult(statistic=11.837180472914612, pvalue=2.4131955993137074e-30)
------------------------------------------------
Comparisons for variable: reading score
------------------------------------------------
t-test between groups standard and free/reduced:
Ttest_indResult(statistic=7.451056467473455, pvalue=2.0027966545279011e-13)
------------------------------------------------
Comparisons for variable: writing score
------------------------------------------------
t-test between groups standard and free/reduced:
Ttest_indResult(statistic=8.009784197834758, pvalue=3.186189583166477e-15)


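Students with a standard lunch score significantly higher on all three exams than students with a free/reduced lunch (mean math 70.0 vs. 58.9, reading 71.7 vs. 64.7, writing 70.8 vs. 63.0; every t-test p-value is below 1e-12). The lunch itself is unlikely to be the cause: eligibility for free/reduced lunch is typically tied to household income, so the gap most likely reflects socioeconomic differences between the two groups rather than the meal.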

### 3) Does the test preparation course seem to have an effect on the exam performances?

In [9]:
# Mean exam scores per test-preparation status. numeric_only=True keeps the
# aggregation to the numeric score columns; without it pandas >= 2.0 raises a
# TypeError on the frame's remaining string columns.
prep_group = performance.groupby('test preparation course').mean(numeric_only=True)
print(prep_group)

# Compare the test-preparation categories pairwise with scipy's ttest_ind,
# once per exam score column.
prep = performance['test preparation course'].unique()
for var in ["math score", "reading score", "writing score"]:
    print("------------------------------------------------")
    print("Comparisons for variable: {}".format(var))
    print("------------------------------------------------")
    # j starts after i so that every pair of categories is tested only once.
    for i in range(0, len(prep)):
        for j in range(i+1, len(prep)):
            print(
                "t-test between groups {0} and {1}:".format(prep[i], prep[j]))
            print(ttest_ind(
                performance.loc[performance['test preparation course'] == prep[i], var],
                performance.loc[performance['test preparation course'] == prep[j], var]
            ))

                         math score  reading score  writing score
test preparation course                                          
completed                 69.695531      73.893855      74.418994
none                      64.077882      66.534268      64.504673
------------------------------------------------
Comparisons for variable: math score
------------------------------------------------
t-test between groups none and completed:
Ttest_indResult(statistic=-5.704616417349102, pvalue=1.5359134607147415e-08)
------------------------------------------------
Comparisons for variable: reading score
------------------------------------------------
t-test between groups none and completed:
Ttest_indResult(statistic=-7.871663538941468, pvalue=9.081783336892205e-15)
------------------------------------------------
Comparisons for variable: writing score
------------------------------------------------
t-test between groups none and completed:
Ttest_indResult(statistic=-10.409173436808748,

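Students who completed the test preparation course score higher on average on all three exams (math 69.7 vs. 64.1, reading 73.9 vs. 66.5, writing 74.4 vs. 64.5), and the t-tests show these differences are statistically significant. Because the data are observational, this shows an association with better results rather than proof that the course causes them.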

### 4) Which 2 exam scores are most correlated with each other?

In [10]:
# Pairwise Pearson correlations between the exam scores. The string columns
# are dropped first: pandas >= 2.0 no longer skips them automatically and
# raises a ValueError when corr() meets non-numeric data.
performance.select_dtypes(include='number').corr()

Unnamed: 0,math score,reading score,writing score
math score,1.0,0.81758,0.802642
reading score,0.81758,1.0,0.954598
writing score,0.802642,0.954598,1.0


Reading and writing scores are the most strongly correlated pair (r ≈ 0.95), compared with about 0.82 for math and reading and 0.80 for math and writing.