# EDA on jupyter

In [1]:
import pandas as pd
import numpy as np
import altair as alt
from selenium import webdriver
from docopt import docopt

## Research Question

The analysis will focus on answering the following inferential research questions:

- Do secondary school students who are in a relationship have different grades for Maths than students who are not?
- Do secondary school students who are in a relationship have different grades for Portuguese than students who are not?

## Theme setting for altair

In [2]:
def mds_special():
    """Build the custom Altair theme used for every chart in this notebook.

    Returns
    -------
    dict
        A freshly built dictionary with a single top-level ``"config"`` key
        holding the title, background, x-axis and y-axis settings. Altair
        calls this function through its theme registry, so it takes no
        arguments.
    """
    font = "Arial"
    grid_color = "#DEDDDD"

    return {
        "config": {
            "title": {
                "fontSize": 24,
                "font": font,
                "anchor": "start",  # equivalent of left-aligned.
                # Vega-Lite's title config names this property "color";
                # the former "fontColor" key is not a title property.
                "color": "#000000",
            },
            "background": "white",
            "axisX": {
                "domain": True,
                "gridColor": grid_color,
                "domainWidth": 1,
                "grid": False,
                "labelFont": font,
                "labelFontSize": 12,
                "labelAngle": 0,
                "tickSize": 5,  # default, included to show it can be changed
                "titleFontSize": 18,
                "titlePadding": 10,  # guessing, not specified in styleguide
                # NOTE(review): placeholder text kept from the original theme;
                # confirm it is never meant to be shown on a chart.
                "title": "X Axis Title (units)",
            },
            "axisY": {
                "domain": False,
                "grid": True,
                "gridColor": grid_color,
                "gridWidth": 1,
                "labelFont": font,
                "labelFontSize": 12,
                "labelAngle": 0,
                "titleFont": font,
                "titleFontSize": 18,
                "titlePadding": 10,  # guessing, not specified in styleguide
                # NOTE(review): placeholder text kept from the original theme;
                # confirm it is never meant to be shown on a chart.
                "title": "Y Axis Title (units)",
            },
        }
    }
    
# Register the theme function with Altair's theme registry under the
# name 'mds_special' so it can be selected by that name.
alt.themes.register('mds_special', mds_special)

# Enable the newly registered theme; being the last expression in the cell,
# the value returned by enable() is what the cell displays.
alt.themes.enable('mds_special')

ThemeRegistry.enable('mds_special')

## Read data

In [3]:
# Load the math and Portuguese student tables (the *_clean.csv files),
# in that order, into df_mat and df_por.
df_mat, df_por = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/student-mat_clean.csv",
        "../data/student-por_clean.csv",
    )
)

#### Head and tail of math dataset

In [4]:
# Preview the first five rows of the math dataset.
df_mat.head(n=5)

Unnamed: 0,sex,romantic,total_grade
0,Female,no,17
1,Female,no,16
2,Female,no,25
3,Female,yes,44
4,Female,no,26


In [5]:
# Preview the last five rows of the math dataset.
df_mat.tail(n=5)

Unnamed: 0,sex,romantic,total_grade
390,Male,no,27
391,Male,no,46
392,Male,no,25
393,Male,no,33
394,Male,no,26


#### Head and tail of Portuguese dataset

In [6]:
# Preview the first five rows of the Portuguese dataset.
df_por.head(n=5)

Unnamed: 0,sex,romantic,total_grade
0,Female,no,22
1,Female,no,31
2,Female,no,37
3,Female,yes,42
4,Female,no,37


In [7]:
# Preview the last five rows of the Portuguese dataset.
df_por.tail(n=5)

Unnamed: 0,sex,romantic,total_grade
644,Female,no,31
645,Female,no,46
646,Female,no,32
647,Male,no,30
648,Male,no,32


## Statistics on the features of interest

In [8]:
# Per-column flag: True if the math dataset has at least one missing value
# in that column.
df_mat.isna().any()

sex            False
romantic       False
total_grade    False
dtype: bool

> No null values in the math dataset (`df_mat`).

In [39]:
# Count, mean and standard deviation of total_grade for the math dataset,
# grouped by relationship status ("romantic").
by_grade_rel = df_mat[["romantic", "total_grade"]].groupby("romantic").agg(['count', 'mean', 'std'])

# Display the summary rounded to four decimals, in the same form as the
# Portuguese summary cell. (No file is written: the earlier `.to_csv(ign)`
# call used an undefined name and failed on a fresh kernel.)
by_grade_rel['total_grade'].reset_index().round(4)

Unnamed: 0,romantic,count,mean,std
0,no,263,32.8441,11.0268
1,yes,132,30.4318,11.0834


In [41]:
# Count, mean and standard deviation of total_grade for the Portuguese
# dataset, grouped by relationship status ("romantic").
by_grade_por = (
    df_por.loc[:, ["romantic", "total_grade"]]
    .groupby("romantic")
    .agg(['count', 'mean', 'std'])
)

# Flatten the summary into a plain table and round to four decimals.
por_summary = by_grade_por['total_grade'].reset_index()
por_summary.round(4)

Unnamed: 0,romantic,count,mean,std
0,no,410,35.4732,8.237
1,yes,239,33.8494,8.8571


In [26]:
# Tally the 'yes' / 'no' relationship answers once per dataset, then report.
mat_romantic_counts = df_mat['romantic'].value_counts()
por_romantic_counts = df_por['romantic'].value_counts()

print("{} math students were in relationships and {} were not.".format(
    mat_romantic_counts['yes'], mat_romantic_counts['no']))

print("{} portuguese language students were in relationships and {} were not.".format(
    por_romantic_counts['yes'], por_romantic_counts['no']))

132 math students were in relationships and 263 were not.
239 portuguese language students were in relationships and 410 were not.


In [27]:
# Mean total_grade for each course, split by relationship status.
mat_mean_yes = df_mat.loc[df_mat['romantic'] == 'yes', 'total_grade'].mean()
mat_mean_no = df_mat.loc[df_mat['romantic'] == 'no', 'total_grade'].mean()
por_mean_yes = df_por.loc[df_por['romantic'] == 'yes', 'total_grade'].mean()
por_mean_no = df_por.loc[df_por['romantic'] == 'no', 'total_grade'].mean()

# Report the four means, formatted to two decimals.
print("The average total grade for math students in relationships was: {:.2f}/60".format(mat_mean_yes))
print("The average total grade for math students not in relationships was: {:.2f}/60".format(mat_mean_no))
print("The average total grade for portuguese students in relationships was: {:.2f}/60".format(por_mean_yes))
print("The average total grade for portuguese students not in relationships was: {:.2f}/60".format(por_mean_no))

The average total grade for math students in relationships was: 30.43/60
The average total grade for math students not in relationships was: 32.84/60
The average total grade for portuguese students in relationships was: 33.85/60
The average total grade for portuguese students not in relationships was: 35.47/60


## Plots

In [31]:
# Split the math students by relationship status.
df_mat_in_rel = df_mat[df_mat['romantic'] == "yes"]
df_mat_not_in_rel = df_mat[df_mat['romantic'] == "no"]

# Math students in a relationship: density of total_grade drawn as bars,
# with the x axis binned from 0 to 60 in steps of 5.
p_1_1 = (
    alt.Chart(df_mat_in_rel)
    .transform_density('total_grade', as_=['total_grade', 'density'])
    .mark_bar()
    .encode(
        x=alt.X("total_grade:Q", title="Total grade", bin=alt.Bin(extent=[0, 60], step=5)),
        y='density:Q',
    )
    .properties(width=300, height=400, title="In relationship")
)

# Math students not in a relationship: same chart, drawn in orange.
p_1_2 = (
    alt.Chart(df_mat_not_in_rel)
    .transform_density('total_grade', as_=['total_grade', 'density'])
    .mark_bar(color='orange')
    .encode(
        x=alt.X("total_grade:Q", title="Total grade", bin=alt.Bin(extent=[0, 60], step=5)),
        y='density:Q',
    )
    .properties(width=300, height=400, title="Not in relationship")
)

In [32]:
# Place the two math charts side by side (in relationship | not in relationship).
P_math = alt.hconcat(p_1_1, p_1_2)

In [33]:
# Display the combined math figure with a title font size of 14.
P_math.configure_title(fontSize=14)

In [34]:
# Split the Portuguese students by relationship status.
df_por_in_rel = df_por[df_por['romantic'] == "yes"]
df_por_not_in_rel = df_por[df_por['romantic'] == "no"]

# Portuguese students in a relationship: density of total_grade drawn as
# bars, with the x axis binned from 0 to 60 in steps of 5.
p_2_1 = (
    alt.Chart(df_por_in_rel)
    .transform_density('total_grade', as_=['total_grade', 'density'])
    .mark_bar()
    .encode(
        x=alt.X("total_grade:Q", title="Total grade", bin=alt.Bin(extent=[0, 60], step=5)),
        y='density:Q',
    )
    .properties(width=300, height=400, title="In relationship")
)

# Portuguese students not in a relationship: same chart, drawn in orange.
p_2_2 = (
    alt.Chart(df_por_not_in_rel)
    .transform_density('total_grade', as_=['total_grade', 'density'])
    .mark_bar(color='orange')
    .encode(
        x=alt.X("total_grade:Q", title="Total grade", bin=alt.Bin(extent=[0, 60], step=5)),
        y='density:Q',
    )
    .properties(width=300, height=400, title="Not in relationship")
)

In [35]:
# Place the two Portuguese charts side by side. These must be p_2_1 / p_2_2
# (built from df_por); p_1_1 / p_1_2 are the math charts.
P_por = p_2_1 | p_2_2

In [36]:
# Display the combined Portuguese figure with a title font size of 14.
P_por.configure_title(fontSize=14)