In [3]:
import pandas as pd
import chardet
import matplotlib.pyplot as plt
import seaborn as sns
import random

# Reading the dataset

In [4]:
# Load School A: sniff the text encoding from the file's raw bytes, then parse the CSV with it
with open('./School A.csv', 'rb') as raw_file:
    raw_bytes = raw_file.read()
result = chardet.detect(raw_bytes)

a = pd.read_csv('./School A.csv', encoding=result['encoding'])

# Preview the first and last rows as a quick sanity check of the load
print(a.head(), a.tail(), sep="\n")

     school sex famsize Pstatus             Momedu             Dadedu  \
0  School A   F     GT3       A  University Degree  University Degree   
1  School A   F     GT3       T     Primary School     Primary School   
2  School A   F     LE3       T     Primary School     Primary School   
3  School A   F     GT3       T  University Degree   Secondary School   
4  School A   F     GT3       T     Post-Secondary     Post-Secondary   

      Mjob      Fjob  traveltime (hrs) parentalhelp tuition higher romantic  \
0  at_home   teacher                 2           no      no    yes       no   
1  at_home     other                 1          yes      no    yes       no   
2  at_home     other                 1           no     yes    yes       no   
3   health  services                 1          yes     yes    yes      yes   
4    other     other                 1          yes     yes    yes       no   

   famrel  health  absences  mathscore  Unnamed: 17  
0       4       3         6     

# Analysis

In [5]:
# Find proportions up to 2 dp
def getProportion(df, condition):
    """Return the share of rows in ``df`` selected by ``condition``, rounded to 2 dp.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose rows are counted.
    condition
        Row selector handed to ``df[...]``. The callers in this notebook pass a
        callable that takes the frame and returns a boolean mask.

    Returns
    -------
    float
        Number of selected rows divided by the total number of rows, rounded
        to two decimal places; ``nan`` when ``df`` has no rows.
    """
    total_rows = df.shape[0]
    if total_rows == 0:
        # A proportion of an empty group is undefined; report NaN rather than
        # letting the division below raise ZeroDivisionError.
        return float("nan")
    return round(df[condition].shape[0] / total_rows, 2)

def printAnalysis(df):
    """Print, for each characteristic of interest, the proportion of rows in ``df`` that match.

    Four lines are printed, each a label followed by the value from ``getProportion``:
    family size 'GT3', tuition 'yes', mother's education "University Degree",
    and romantic 'yes'. The mother's education is read from the 'Momedu' column
    when the frame has one, otherwise from 'Medu'.
    """
    # Fall back to 'Medu' when the frame has no 'Momedu' column.
    mother_edu_col = "Momedu" if "Momedu" in df else "Medu"

    # (label, row predicate) pairs, reported in this order.
    checks = (
        ("Family size more than 3: ", lambda frame: frame['famsize'] == 'GT3'),
        ("Receiving tuition: ", lambda frame: frame['tuition'] == 'yes'),
        ("Mothers with degree: ", lambda frame: frame[mother_edu_col] == "University Degree"),
        ("In a romantic relationship: ", lambda frame: frame['romantic'] == 'yes'),
    )
    for label, matches in checks:
        print(label, getProportion(df, matches))

# Report the proportions for the full School A frame
printAnalysis(a)

Family size more than 3:  0.72
Receiving tuition:  0.46
Mothers with degree:  0.34
In a romantic relationship:  0.32


# Random Assignment

In [6]:
# Assign a new column to the dataframe, each student is randomly assigned 0 or 1
def split(df, seed=None):
    """Randomly assign every row of ``df`` to group 0 (control) or group 1 (treatment).

    The assignment is written to ``df`` in place as a "Random assignment"
    column, so the caller's frame is modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split; gains (or has overwritten) a "Random assignment" column.
    seed : optional
        When given, the assignment is drawn from a dedicated
        ``random.Random(seed)`` generator so the same split can be reproduced.
        When ``None`` (the default), the module-level ``random`` generator is
        used, exactly as before.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(control, treatment)``: the rows assigned 0 and the rows assigned 1.
    """
    # A private generator keeps a seeded split from disturbing the global RNG state.
    rng = random if seed is None else random.Random(seed)
    df["Random assignment"] = rng.choices([0, 1], k=df.shape[0])

    assignment = df["Random assignment"]
    control = df[assignment == 0]
    treatment = df[assignment == 1]
    return control, treatment

# Seed the module-level RNG so that re-running this cell (or Restart & Run All)
# reproduces the same random assignment instead of a fresh one each time.
random.seed(42)

control, treatment = split(a)
print("Students in control group: ", control.shape[0])
print("Students in treatment group: ", treatment.shape[0])

Students in control group:  171
Students in treatment group:  178


In [7]:
# Report the same proportions for the control group and then the treatment group,
# with a separator line between the two reports
group_reports = (
    ("Control group analysis", control),
    ("Treatment group analysis", treatment),
)
for position, (heading, group) in enumerate(group_reports):
    if position:
        print("======")
    print(heading)
    printAnalysis(group)

Control group analysis
Family size more than 3:  0.7
Receiving tuition:  0.49
Mothers with degree:  0.35
In a romantic relationship:  0.31
Treatment group analysis
Family size more than 3:  0.75
Receiving tuition:  0.43
Mothers with degree:  0.34
In a romantic relationship:  0.34


In [9]:
# Analyse school B: detect the file's encoding from its raw bytes, then load the CSV
with open('./School B.csv', 'rb') as raw_file:
    raw_bytes = raw_file.read()
result = chardet.detect(raw_bytes)
b = pd.read_csv('./School B.csv', encoding=result['encoding'])

print("School B analysis")
# NOTE: this rebinds `control` and `treatment` to the School B groups
control, treatment = split(b)
# Control report first, then the treatment report, separated by a divider line
for position, group in enumerate((control, treatment)):
    if position:
        print("======")
    printAnalysis(group)

print("Students in control group: ", len(control))
print("Students in treatment group: ", len(treatment))

School B analysis
Family size more than 3:  0.61
Receiving tuition:  0.57
Mothers with degree:  0.3
In a romantic relationship:  0.39
Family size more than 3:  0.65
Receiving tuition:  0.3
Mothers with degree:  0.17
In a romantic relationship:  0.43
Students in control group:  23
Students in treatment group:  23
