In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from scipy import stats
from matplotlib.gridspec import GridSpec
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import plotly.graph_objects as go

# ⬇️ Loading data

In [None]:
# Relative location of the students-performance CSV file.
path = '../input/students-performance-in-exams/StudentsPerformance.csv'

# Read the file into a DataFrame, then preview its first five rows.
data = pd.read_csv(filepath_or_buffer=path)
data.head(n=5)

# 📊 Graphs

In [None]:
# Three side-by-side histograms (math / writing / reading), coloured by gender.
fig = plt.figure(figsize=(25, 7))
grid = GridSpec(nrows=1, ncols=3, figure=fig)
fig.suptitle("Test score distribution by gender", fontsize=20)

# One panel per grid column, created left to right.
axis1, axis2, axis3 = [fig.add_subplot(grid[0, column]) for column in range(3)]

# Each histogram is drawn on its panel explicitly through `ax=`.
panels = zip((axis1, axis2, axis3), ('math score', 'writing score', 'reading score'))
for axis, score_column in panels:
    sns.histplot(data=data, x=score_column, hue='gender', ax=axis)

plt.show()

In [None]:
# Three side-by-side histograms (math / writing / reading), coloured by
# race/ethnicity group.
fig = plt.figure(figsize=(25, 7))
grid = GridSpec(nrows=1, ncols=3, figure=fig)
fig.suptitle("Test score distribution by race/ethnicity", fontsize=20)

# One panel per grid column, created left to right.
axis4, axis5, axis6 = [fig.add_subplot(grid[0, column]) for column in range(3)]

# Each histogram is drawn on its panel explicitly through `ax=`.
panels = zip((axis4, axis5, axis6), ('math score', 'writing score', 'reading score'))
for axis, score_column in panels:
    sns.histplot(data=data, x=score_column, hue='race/ethnicity', ax=axis)

plt.show()

In [None]:
# Three side-by-side histograms (math / writing / reading), coloured by the
# parental level of education.
fig = plt.figure(figsize=(25, 7))
grid = GridSpec(nrows=1, ncols=3, figure=fig)
fig.suptitle("Test score distribution by Level of education", fontsize=20)

# Panels are numbered consecutively with the earlier figures (axis7..axis9).
# The middle panel previously reused the name `axis9`, so its handle was
# overwritten by the third panel; it now has its own name, `axis8`.
axis7, axis8, axis9 = [fig.add_subplot(grid[0, column]) for column in range(3)]

# Pass `ax=` explicitly instead of relying on whichever axes is "current".
panels = zip((axis7, axis8, axis9), ('math score', 'writing score', 'reading score'))
for axis, score_column in panels:
    sns.histplot(data=data, x=score_column, hue='parental level of education', ax=axis)

plt.show()

In [None]:
# Three side-by-side histograms (math / writing / reading), coloured by
# lunch type.
fig = plt.figure(figsize=(25, 7))
grid = GridSpec(nrows=1, ncols=3, figure=fig)
fig.suptitle("Test score distribution by Lunch type", fontsize=20)

# One panel per grid column, created left to right.
axis10, axis11, axis12 = [fig.add_subplot(grid[0, column]) for column in range(3)]

# Each histogram is drawn on its panel explicitly through `ax=`.
panels = zip((axis10, axis11, axis12), ('math score', 'writing score', 'reading score'))
for axis, score_column in panels:
    sns.histplot(data=data, x=score_column, hue='lunch', ax=axis)

plt.show()

In [None]:
# Three side-by-side histograms (math / writing / reading), coloured by
# whether the test preparation course was completed.
fig = plt.figure(figsize=(25, 7))
grid = GridSpec(nrows=1, ncols=3, figure=fig)
fig.suptitle("Test score distribution by Preparation Course", fontsize=20)

# Panels are numbered consecutively with the earlier figures (axis13..axis15).
# The middle panel previously reused `axis10`, which silently overwrote the
# handle of the first panel of the lunch figure; it is now `axis14`.
axis13, axis14, axis15 = [fig.add_subplot(grid[0, column]) for column in range(3)]

# Pass `ax=` explicitly instead of relying on whichever axes is "current".
panels = zip((axis13, axis14, axis15), ('math score', 'writing score', 'reading score'))
for axis, score_column in panels:
    sns.histplot(data=data, x=score_column, hue='test preparation course', ax=axis)

plt.show()

# ✖️ Variables

## Defining variables


In [None]:
# Row subsets of `data`, one pair per categorical variable compared below.

# Test preparation course: not taken vs. completed.
none = data.loc[data['test preparation course'].eq('none')]
completed = data.loc[data['test preparation course'].eq('completed')]

# Lunch type: standard vs. free/reduced.
standard = data.loc[data['lunch'].eq('standard')]
free = data.loc[data['lunch'].eq('free/reduced')]

# Gender: male vs. female.
male = data.loc[data['gender'].eq('male')]
female = data.loc[data['gender'].eq('female')]

## Levene Test and T-Test
* Levene: The Levene test tests the null hypothesis that all input samples are from populations with equal variances. Levene’s test is an alternative to Bartlett’s test in the case where there are significant deviations from normality.
</br>
</br>
* T-Test: 
  This is a two-sided test for the null hypothesis that 2 independent samples have identical average (expected) values. This test assumes that the populations have identical variances by default.
</br>
</br>
</br>
</br>

$H_0$: The two samples have identical averages (we fail to reject $H_0$ when $p \geq 0.05$)
</br>
</br>

$H_a$: The two samples do not have identical averages (we reject $H_0$ in favour of $H_a$ when $p < 0.05$)

### Course


In [None]:
# Levene's test for equal variances between students without the preparation
# course (`none`) and students who completed it, run once per subject.
levene_result = stats.levene(none['math score'], completed['math score'])
stats_math_levene_course = levene_result.statistic
p_value_math_levene_course = levene_result.pvalue

levene_result = stats.levene(none['reading score'], completed['reading score'])
stats_reading_levene_course = levene_result.statistic
p_value_reading_levene_course = levene_result.pvalue

levene_result = stats.levene(none['writing score'], completed['writing score'])
stats_writing_levene_course = levene_result.statistic
p_value_writing_levene_course = levene_result.pvalue

# Report the three p-values, one per line, separated by blank lines.
for label, p_value in (
    ("Math Score -> P-Value:", p_value_math_levene_course),
    ("\nReading Score -> P-Value:", p_value_reading_levene_course),
    ("\nWriting Score -> P-Value:", p_value_writing_levene_course),
):
    print(label, p_value)

In [None]:
# Two-sided independent-samples t-tests: no preparation course vs. completed,
# run once per subject.
#
# `equal_var` is derived from the Levene p-value computed in the preceding
# cell instead of being typed in by hand after reading its output:
#   p >= 0.05 -> equal variances are not rejected -> standard t-test
#   p <  0.05 -> equal variances are rejected     -> Welch's t-test
# This keeps the choice of test in sync with the data if the input changes.
stats_math_course, p_value_math_course = stats.ttest_ind(
    a=none['math score'],
    b=completed['math score'],
    equal_var=bool(p_value_math_levene_course >= 0.05),
)

stats_reading_course, p_value_reading_course = stats.ttest_ind(
    a=none['reading score'],
    b=completed['reading score'],
    equal_var=bool(p_value_reading_levene_course >= 0.05),
)

stats_writing_course, p_value_writing_course = stats.ttest_ind(
    a=none['writing score'],
    b=completed['writing score'],
    equal_var=bool(p_value_writing_levene_course >= 0.05),
)

print("\nMath Score -> P-Value:", p_value_math_course)

print("\nReading Score -> P-Value:", p_value_reading_course)

print("\nWriting Score -> P-Value:", p_value_writing_course)


### Lunch

In [None]:
# Levene's test for equal variances between students with a standard lunch
# and students with a free/reduced lunch, run once per subject.
levene_result = stats.levene(standard['math score'], free['math score'])
stats_math_levene_lunch = levene_result.statistic
p_value_math_levene_lunch = levene_result.pvalue

levene_result = stats.levene(standard['reading score'], free['reading score'])
stats_reading_levene_lunch = levene_result.statistic
p_value_reading_levene_lunch = levene_result.pvalue

levene_result = stats.levene(standard['writing score'], free['writing score'])
stats_writing_levene_lunch = levene_result.statistic
p_value_writing_levene_lunch = levene_result.pvalue

# Report the three p-values, one per line, separated by blank lines.
for label, p_value in (
    ("Math Score -> P-Value:", p_value_math_levene_lunch),
    ("\nReading Score -> P-Value:", p_value_reading_levene_lunch),
    ("\nWriting Score -> P-Value:", p_value_writing_levene_lunch),
):
    print(label, p_value)

In [None]:
# Two-sided independent-samples t-tests: standard lunch vs. free/reduced
# lunch, run once per subject.
#
# `equal_var` is derived from the Levene p-value computed in the preceding
# cell instead of being hard-coded to True:
#   p >= 0.05 -> equal variances are not rejected -> standard t-test
#   p <  0.05 -> equal variances are rejected     -> Welch's t-test
stats_math_lunch, p_value_math_lunch = stats.ttest_ind(
    a=standard['math score'],
    b=free['math score'],
    equal_var=bool(p_value_math_levene_lunch >= 0.05),
)

stats_reading_lunch, p_value_reading_lunch = stats.ttest_ind(
    a=standard['reading score'],
    b=free['reading score'],
    equal_var=bool(p_value_reading_levene_lunch >= 0.05),
)

stats_writing_lunch, p_value_writing_lunch = stats.ttest_ind(
    a=standard['writing score'],
    b=free['writing score'],
    equal_var=bool(p_value_writing_levene_lunch >= 0.05),
)

print("Math Score -> P-Value:", p_value_math_lunch)

print("\nReading Score -> P-Value:", p_value_reading_lunch)

print("\nWriting Score -> P-Value:", p_value_writing_lunch)

### Gender

In [None]:
# Levene's test for equal variances between male and female students,
# run once per subject.
levene_result = stats.levene(male['math score'], female['math score'])
stats_math_levene_gender = levene_result.statistic
p_value_math_levene_gender = levene_result.pvalue

levene_result = stats.levene(male['reading score'], female['reading score'])
stats_reading_levene_gender = levene_result.statistic
p_value_reading_levene_gender = levene_result.pvalue

levene_result = stats.levene(male['writing score'], female['writing score'])
stats_writing_levene_gender = levene_result.statistic
p_value_writing_levene_gender = levene_result.pvalue

# Report the three p-values, one per line, separated by blank lines.
for label, p_value in (
    ("Math Score -> P-Value:", p_value_math_levene_gender),
    ("\nReading Score -> P-Value:", p_value_reading_levene_gender),
    ("\nWriting Score -> P-Value:", p_value_writing_levene_gender),
):
    print(label, p_value)

In [None]:
# Two-sided independent-samples t-tests: male vs. female students,
# run once per subject.
#
# `equal_var` is derived from the Levene p-value computed in the preceding
# cell instead of being hard-coded to True:
#   p >= 0.05 -> equal variances are not rejected -> standard t-test
#   p <  0.05 -> equal variances are rejected     -> Welch's t-test
stats_math_gender, p_value_math_gender = stats.ttest_ind(
    a=male['math score'],
    b=female['math score'],
    equal_var=bool(p_value_math_levene_gender >= 0.05),
)

stats_reading_gender, p_value_reading_gender = stats.ttest_ind(
    a=male['reading score'],
    b=female['reading score'],
    equal_var=bool(p_value_reading_levene_gender >= 0.05),
)

stats_writing_gender, p_value_writing_gender = stats.ttest_ind(
    a=male['writing score'],
    b=female['writing score'],
    equal_var=bool(p_value_writing_levene_gender >= 0.05),
)

print("Math Score -> P-Value:", p_value_math_gender)

print("\nReading Score -> P-Value:", p_value_reading_gender)

print("\nWriting Score -> P-Value:", p_value_writing_gender)

## Comparison

In [None]:


# Summary table of all p-values: one row per grouping variable, and a
# Levene / t-test column pair for each of the three subjects.
header_titles = [
    'Variable',
    'Levene Test Math Score', 'T-Test Math Score',
    'Levene Test Reading Score', 'T-Test Reading Score',
    'Levene Test Writing Score', 'T-Test Writing Score',
]

# Every column below lists its values in this row order.
row_labels = ['Gender', 'Lunch', 'Course']

# Levene p-values are rounded to 4 decimals; t-test p-values are shown as-is.
levene_math = [round(p, 4) for p in (p_value_math_levene_gender, p_value_math_levene_lunch, p_value_math_levene_course)]
ttest_math = [p_value_math_gender, p_value_math_lunch, p_value_math_course]

levene_reading = [round(p, 4) for p in (p_value_reading_levene_gender, p_value_reading_levene_lunch, p_value_reading_levene_course)]
ttest_reading = [p_value_reading_gender, p_value_reading_lunch, p_value_reading_course]

levene_writing = [round(p, 4) for p in (p_value_writing_levene_gender, p_value_writing_levene_lunch, p_value_writing_levene_course)]
ttest_writing = [p_value_writing_gender, p_value_writing_lunch, p_value_writing_course]

table_header = {
    'values': header_titles,
    'line_color': 'black',
    'fill_color': 'cornflowerblue',
    'align': 'center',
    'font': {'color': 'black', 'size': 14},
}

# Column order matches `header_titles`.
table_cells = {
    'values': [
        row_labels,
        levene_math, ttest_math,
        levene_reading, ttest_reading,
        levene_writing, ttest_writing,
    ],
    'line_color': 'black',
    'fill_color': 'lightskyblue',
    'align': 'center',
}

fig = go.Figure(data=[go.Table(header=table_header, cells=table_cells)])

fig.update_layout(width=1750, height=300)
fig.show()

# 💾 Data preparation

## One Hot Encoding

In [None]:
# Integer codes for the three two-valued categorical columns.
# Each mapping is keyed by column name, as expected by DataFrame.replace.
genderTypes = {'gender': {'female': 0, 'male': 1}}

courseTypes = {'test preparation course': {'none': 0, 'completed': 1}}

lunchTypes = {'lunch': {'free/reduced': 0, 'standard': 1}}

# Apply all three column-wise mappings in one pass, then pin the resulting
# dtype explicitly. Without the astype, the integer dtype depends on
# `replace` implicitly downcasting the object columns, which pandas has
# deprecated; the explicit cast also fails loudly if an unmapped label is
# left in one of these columns.
data = (
    data
    .replace({**genderTypes, **courseTypes, **lunchTypes})
    .astype({column: 'int64' for column in (*genderTypes, *courseTypes, *lunchTypes)})
)

In [None]:
# One-hot encode the two multi-category columns. The prefix mapping gives
# the generated columns names such as "race_<group>" and "education_<level>".
data = pd.get_dummies(
    data,
    columns=['race/ethnicity', 'parental level of education'],
    prefix={'race/ethnicity': 'race', 'parental level of education': 'education'},
)

In [None]:
# Drop one dummy column from each one-hot encoded variable, leaving that
# category as the implicit baseline. Reassigning (rather than inplace=True)
# keeps the cell's effect visible as a plain `data = ...` step and avoids
# mutating the existing frame object.
data = data.drop(columns=['race_group E', 'education_some high school'])

In [None]:
# Preview the first five rows of the encoded frame.
data.head(n=5)

## Test split

In [None]:
# Both feature sets predict the same target column.
y_with_all_variables = data['math score']
y_using_less_variables = data['math score']

# Feature set 1: every column except the target.
x_with_all_variables = data.drop(columns='math score')

# Feature set 2: a hand-picked subset of four columns.
x_using_less_variables = data[['gender', 'lunch', 'test preparation course', 'reading score']]

# The same random_state is used for both splits.
# NOTE(review): the later comparison table relies on both splits selecting
# the same test rows — confirm.
X_train_all, X_test_all, y_train_all, y_test_all = train_test_split(
    x_with_all_variables,
    y_with_all_variables,
    random_state=40,
)

X_train, X_test, y_train, y_test = train_test_split(
    x_using_less_variables,
    y_using_less_variables,
    random_state=40,
)

# 📁 Model Section

In [None]:
# Linear regression on the full feature set; predictions on its test split.
model_all_variables = LinearRegression().fit(X_train_all, y_train_all)
pred_all_variables = model_all_variables.predict(X_test_all)

# Linear regression on the reduced feature set; predictions on its test split.
model_less_variables = LinearRegression().fit(X_train, y_train)
pred_less_variables = model_less_variables.predict(X_test)


## Prediction

In [None]:
# Actual math scores of the test rows next to both models' predictions.
pd.DataFrame(
    data={
        "Real": y_test,
        'Predict With All Variables': pred_all_variables,
        'Predict With Less Variables': pred_less_variables,
    }
)

We can confirm that we don't need to use all 14 columns to predict the math score.