<h1>Students Performance Analysis</h1>

<h3>General imports</h3>

In [None]:
# Data manipulation
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns 

# Global plotting theme: white grid background, a custom six-colour palette,
# and fonts scaled up by 25% for every figure drawn below.
sns.set_theme(
    style='whitegrid',
    palette=[
        "#003f5c",
        "#444e86",
        "#955196",
        "#dd5182",
        "#ff6e54",
        "#ffa600",
    ],
    font_scale=1.25,
)

# Preprocessing
from sklearn.model_selection import train_test_split

# Metrics 
from sklearn.metrics import mean_squared_error

# ML Models
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, ElasticNet, Lasso

<h3>Loading data</h3>

In [None]:
# Read the raw exam-results CSV (path is relative to the notebook's working directory).
df = pd.read_csv(
    "../input/students-performance-in-exams/StudentsPerformance.csv"
)

In [None]:
# Preview the first few rows to check that the file loaded as expected.
df.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Column labels available in the dataset.
df.columns

In [None]:
# Summary of each column's dtype and non-null count.
df.info()

In [None]:
# Column groups used throughout the notebook:
# categorical columns describe the student, numerical columns are the exam scores.
categorical_features = [
    'gender',
    'race/ethnicity',
    'parental level of education',
    'lunch',
    'test preparation course',
]
numerical_features = [
    'math score',
    'reading score',
    'writing score',
]

In [None]:
# Count the missing values in each column.
df.isnull().sum()

<p>--> There are 8 columns (5 categorical and 3 numerical) and 1000 rows.<br>
   --> There are no null values. </p> 

<h3>Exploratory data analysis</h3>

In [None]:
# One pie chart per categorical feature, showing the share of students in each category.
# The grid is sized from the feature list; squeeze=False keeps `axes` two-dimensional
# so the loop also works when the list holds a single feature.
fig, axes = plt.subplots(1, len(categorical_features), figsize=(30, 8), squeeze=False)

for ax, feature in zip(axes[0], categorical_features):
    # Count each category once and reuse the result for both wedge sizes and labels.
    counts = df[feature].value_counts()
    ax.pie(x=counts, labels=counts.index, autopct='%1.1f%%')
    ax.set_title(feature.capitalize(), fontdict={'fontsize': 20, 'fontweight' : 2})

fig.suptitle("Students Analysis", fontsize=30)
plt.show()

<p>--> There is about the same number of male and female students.<br>
   --> Group C is the most common race/ethnicity and group A is the least common.<br>
   --> The most common parental levels of education are "some college" and "associate's degree", while "master's degree" is the least common.<br>
   --> The majority of students have a standard lunch.<br>
   --> The majority of students haven't completed the test preparation course.</p>

In [None]:
# Heatmap of the pairwise correlation between the three exam scores.
# Only the numeric score columns are passed to corr(): the frame still holds the
# string-valued categorical columns at this point, and since pandas 2.0
# DataFrame.corr() raises on non-numeric data instead of silently dropping it.
plt.figure(figsize=(8, 6))
sns.heatmap(df[numerical_features].corr(), annot=True)
plt.title("Correlation between scores", fontdict={'fontsize': 20, 'fontweight' : 2})
plt.show()

<p>--> The correlation between the scores is quite high, particularly between writing and reading.</p>

In [None]:
# Distribution of each exam score, one histogram panel per score.
fig, ax = plt.subplots(1, 3, figsize=(25, 6))

for col, score in enumerate(numerical_features):
    sns.histplot(df[score], ax=ax[col])

plt.suptitle("Scores distributions", fontsize=20)
plt.show()

In [None]:
# Grid of bar plots: one row per categorical feature, one column per exam score.
fig, ax = plt.subplots(5, 3, figsize=(25, 25))

for row, feature in enumerate(categorical_features):
    for col, score in enumerate(numerical_features):
        sns.barplot(data=df, x=feature, y=score, ax=ax[row][col])

fig.suptitle("Students Grades by feature", fontsize=35)
plt.show()

<p>--> Female students have better grades in reading and writing while male students have better grades in math.<br>
   --> Students of Group E score the most on each skill, while students of group A have the worst results.<br>
   --> Students with "master's degree" parents level of education have the highest scores.<br>
   --> Students with standard lunch do better than students with free/reduced lunch.<br>
   --> Students who completed the test preparation course do better than students who didn't.</p>

<h3>Processing data</h3>

In [None]:
# One-hot encode the categorical columns so the regressors receive numeric inputs.
# NOTE(review): this overwrites `df`, so the cell cannot be re-run without
# reloading the data first (the categorical columns no longer exist afterwards).
df = pd.get_dummies(df, columns=categorical_features)

In [None]:
# Targets are the three exam scores; features are every remaining column.
y = df[numerical_features]
X = df.drop(columns=numerical_features)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

<h3>ML Models</h3>

In [None]:
# Fit each candidate regressor with default hyper-parameters on the training
# split and report the mean squared error on both the training and test splits.
regressor_list = [RandomForestRegressor, LinearRegression, Ridge, ElasticNet, Lasso]

for regressor in regressor_list:
    # Seed every estimator that exposes a `random_state` parameter (notably the
    # random forest) so that re-running the notebook reproduces the same scores.
    seed_kwargs = {"random_state": 0} if "random_state" in regressor().get_params() else {}
    reg = regressor(**seed_kwargs).fit(train_X, train_y)
    predicted_train = reg.predict(train_X)
    predicted_test = reg.predict(test_X)
    mse_train = mean_squared_error(train_y, predicted_train)
    mse_test = mean_squared_error(test_y, predicted_test)
    # Print the estimator's class name rather than the raw "<class '...'>" repr.
    print(f"{regressor.__name__} \n mse on training set: {mse_train} \n mse on testing set: {mse_test}")