In [None]:
# Import all libraries to be used
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, learning_curve, cross_val_score, cross_validate, validation_curve, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score, roc_curve, plot_roc_curve


### Reading Data

In [None]:
# Alternative dataset path, kept for reference (disabled).
#data = pd.read_csv('/kaggle/input/students-performance-in-exams/StudentsPerformance.csv')
# Load the exam dataset used by every cell below.
data = pd.read_csv('/kaggle/input/exams6k/exams.csv')
# Show the loaded frame as the cell output.
data

In [None]:
# Disabled debug helper: lists every file found under the Kaggle input directory.
#import os

#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

# Summary statistics of the dataset, shown as the cell output.
data.describe()

In [None]:
# NOTE: the last call is assigned to dev_null so the cell does not echo a return value.
# dev_null is a throwaway name (like /dev/null on Linux) and is never read afterwards.
math_ax = sns.distplot(data["math score"])
math_ax.set_xlabel("Math Score")
math_ax.set_ylabel("Frequency")
dev_null = math_ax.set_title("Math Scores Distributions")

As we can see in this graph, math test scores follow a roughly normal distribution centred around ~63%.

In [None]:
# Distribution of the reading exam scores.
reading_ax = sns.distplot(data["reading score"])
reading_ax.set_xlabel("Reading Score")
reading_ax.set_ylabel("Frequency")
dev_null = reading_ax.set_title("Reading Scores Distributions")

Reading scores are more skewed towards 70%. 

In [None]:
# Distribution of the writing exam scores.
writing_scores = data["writing score"]
writing_ax = sns.distplot(writing_scores)
writing_ax.set(ylabel="Frequency", xlabel="Writing Score")
dev_null = writing_ax.set_title("Writing Scores Distributions")

This yields similar results as the reading exams, but seems to be more skewed towards the median at 72%.

Check data for empty values

In [None]:
# Number of distinct values in each object-typed (categorical) column.
data.select_dtypes('object').nunique()

Compiling number of categories in the data.

In [None]:
# Number of missing values per column.
data.isnull().sum()

Our data seems to be clean of missing values. 👍

### Math test scores for students by group. Male and Female labeled.
As we can see members of group E seem to do the best in the test. Males seem to do slightly better on these tests. 

In [None]:
# Math score per group, with one bar per gender.
math_bars = sns.barplot(data=data, x="race/ethnicity", y="math score", hue="gender")
math_bars.set_xlabel("Group")
math_bars.set_ylabel("Math Score")
dev_null = math_bars.set_title("Math Scores By Group")

### Reading test scores for students by group. Male and Female labeled.
Again, members of group E did best, followed by group D. In this case, however, females seem to do better. 

In [None]:
# Reading score per group, with one bar per gender.
reading_bars = sns.barplot(x="race/ethnicity", y="reading score", hue="gender", data=data)
# The x-axis holds the groups and the y-axis the reading score, so the labels and
# title follow the same wording as the math and writing bar plots.
reading_bars.set(xlabel="Group", ylabel="Reading Score")
dev_null = reading_bars.set_title("Reading Scores By Group")

### Writing test scores for students by group. Male and Female labeled.
Again, members of group E did best, followed by group D. In this case, however, females seem to do better. Overall, scores seem to be slightly lower than on the reading tests.

In [None]:
%matplotlib inline

plt.figure(figsize=(25,6))
plt.subplot(1, 3, 1)
sns.distplot(data['math score'])

plt.subplot(1, 3, 2)
sns.distplot(data['reading score'])

plt.subplot(1, 3, 3)
sns.distplot(data['writing score'])

plt.suptitle('Checking for Skewness', fontsize = 15)
plt.show()

Overall, it doesn't seem that failing or passing scores in any of the tests can be completely predicted from any of the categories grouped above, unless there is some variation. Let's see if we can catch that using a boxplot:

In [None]:
# Pairwise correlation between the exam scores. Only the numeric columns are passed to
# corr(): recent pandas versions raise on string-valued columns instead of dropping them.
score_correlations = data.select_dtypes(include="number").corr()
dev_null = sns.heatmap(score_correlations, annot=True, fmt=".2f")
# The cells are correlation coefficients, so the title says so.
dev_null = dev_null.set_title("Correlation Between Scores")

In [None]:
# Column names, non-null counts and dtypes of the dataset.
data.info()

In [None]:
# Number of students in each group.
group_counts_ax = sns.countplot(data=data, x="race/ethnicity")
group_counts_ax.set_xlabel("Group")
group_counts_ax.set_ylabel("Count")
dev_null = group_counts_ax.set_title("Count of Group members")

In [None]:
# Writing score per group, with one bar per gender.
writing_bars = sns.barplot(data=data, x="race/ethnicity", y="writing score", hue="gender")
writing_bars.set_xlabel("Group")
writing_bars.set_ylabel("Writing Score")
dev_null = writing_bars.set_title("Writing Scores by Group")

In [None]:
# Number of students per parental education level.
countplot = sns.countplot(data=data, x="parental level of education")
# Rotate the long category names so they do not overlap.
tick_labels = countplot.get_xticklabels()
countplot.set_xticklabels(tick_labels, rotation=40, ha="right")
countplot.set_xlabel("Parental Education Lvl")
countplot.set_ylabel("Count")
dev_null = countplot.set_title("Count of Students by Education attained by their parents")

In [None]:
# Spread of math scores per group and gender (boxes show quartiles, points show outliers).
math_boxes = sns.boxplot(data=data, x="race/ethnicity", y="math score", hue="gender")
math_boxes.set_xlabel("Group")
math_boxes.set_ylabel("Math Score")
dev_null = math_boxes.set_title("Math Scores by Group")

As we can see there is some variation among all the groups, and even some outliers. The goal then, is to find what causes these variations with respect to the other fields in the data. 

Those are the categories that we should put emphasis on for further analysis and modeling. 

What if we check the relationship between race/ethnicity and parental education level. Will this answer the question of what causes group E to do better than any other group.

In [None]:
# Count the students in every (group, parental education level) combination.
parent_edu_vs_eth_race = pd.crosstab(index=data["race/ethnicity"], columns=data["parental level of education"])
# Each cell is a raw student count, not a correlation coefficient, so the counts are
# annotated as integers and the title describes them as counts.
dev_null = sns.heatmap(parent_edu_vs_eth_race, annot=True, fmt="d")
dev_null.set(xlabel="Parent Education Lvl", ylabel="Group")
dev_null = dev_null.set_title("Student Counts by Group and Parent Educ. Lvl")

The heatmap above illustrates the relationship between the groups in the data vs parental education.

In [None]:
# Show the group vs. parental-education count table behind the heatmap.
parent_edu_vs_eth_race

In [None]:
# Math score per parental education level, with one bar per group.
bar = sns.barplot(data=data, x="parental level of education", y="math score", hue="race/ethnicity")
# Rotate the long category names so they do not overlap.
rotated_labels = bar.get_xticklabels()
bar.set_xticklabels(rotated_labels, rotation=40, ha="right")
bar.set_xlabel("Parental Educ. Lvl")
bar.set_ylabel("Math Score")
dev_null = bar.set_title("Relating Parental Educ. Lvl to Math Score")

It seems, even if only by a small degree, that parental level of education has an impact on the student's score, improving it slightly when the student's parents attained a higher level of education.

In [None]:
# Spread of math scores per parental education level, with one box per group.
bar = sns.boxplot(data=data, x="parental level of education", y="math score", hue="race/ethnicity")
# Rotate the long category names so they do not overlap.
rotated_labels = bar.get_xticklabels()
bar.set_xticklabels(rotated_labels, rotation=40, ha="right")
bar.set_xlabel("Parent Educ. Lvl")
bar.set_ylabel("Math Score")
dev_null = bar.set_title("Math Score and Parent Educ. Lvl")

A boxplot can help us see more clearly the distribution of students in the different groups.

# MLP Model To Predict P/F of Students

Preparing data

In [None]:
# Build the binary target: a student passes only when all three exam scores reach the pass mark.
SCORE_COLUMNS = ["math score", "reading score", "writing score"]
PASS_MARK = 65

# Guarded so the cell can be re-run: once the score columns have been dropped there is
# nothing left to derive "Pass" from, and recomputing would raise a KeyError.
if "Pass" not in data.columns:
    data = data.assign(
        # Vectorised comparison across the three score columns instead of a row-wise apply.
        Pass=(data[SCORE_COLUMNS] >= PASS_MARK).all(axis=1).astype(int)
    ).drop(columns=SCORE_COLUMNS)

# Features are the remaining categorical columns; the target is the pass/fail flag.
X = data.drop(columns=["Pass"])
y = data["Pass"]
X, y

### Implement MLP Classifier

In [None]:
# Add using different parameters.

In [None]:
# One-hot encode every categorical feature column (fit and transform in one step).
encoder = OneHotEncoder(handle_unknown="ignore")
X = encoder.fit_transform(X)

# Hold out 20% of the rows for testing, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=1
)

# Three hidden layers (17, 13 and 7 units) trained with stochastic gradient descent.
mlp = MLPClassifier(
    hidden_layer_sizes=[17, 13, 7],
    solver="sgd",
    max_iter=3000,
    random_state=1,
    verbose=False,
)
mlp.fit(X_train, y_train)

y_predicted = mlp.predict(X_test)

y_predicted, y_test.to_numpy() # Todo compare


OneHotEncoder is used to encode the categorical columns into numeric values that can be digested by the algorithm implementation used, in our case scikit-learn's MLPClassifier.

The MLP configured above will run for at most 3000 iterations, using three hidden layers of 17, 13, and 7 neurons and the stochastic gradient descent solver. Our data was divided into an 80/20 train/test split to train and evaluate the classifier. 

### Compute learning curve

In [None]:
def format_scores_as_dataframe(labels, train_scores, test_scores):
    """Flatten per-fold train/test scores into a long-format DataFrame for plotting.

    Parameters
    ----------
    labels : sequence
        One label per row of the score arrays (e.g. the training-set sizes).
    train_scores, test_scores : 2-D array-like
        Scores indexed as ``[label_index][fold_index]``; both must have one
        row per entry in ``labels``.

    Returns
    -------
    pandas.DataFrame
        Columns ``labels``, ``type`` ("train" or "test") and ``score``, with a
        train row followed by a test row for every (label, fold) pair.

    Raises
    ------
    ValueError
        If the number of score rows does not match the number of labels.
    """
    if not (len(labels) == len(train_scores) == len(test_scores)):
        raise ValueError("labels, train_scores and test_scores must have the same number of rows")

    learning_data = {"labels": [], "type": [], "score": []}

    # Iterate over the arguments themselves (not a notebook global), and take the
    # fold count from each row so the number of folds may differ from the number of labels.
    for label, train_row, test_row in zip(labels, train_scores, test_scores):
        for train_score, test_score in zip(train_row, test_row):
            learning_data["labels"].append(label)
            learning_data["type"].append("train")
            learning_data["score"].append(train_score)
            learning_data["labels"].append(label)
            learning_data["type"].append("test")
            learning_data["score"].append(test_score)

    return pd.DataFrame.from_dict(learning_data)

In [None]:
# Learning Curve | Complexity Curve
# Cross-validated train/test scores for growing training-set sizes.
train_sizes, train_scores, test_scores = learning_curve(mlp, X, y)

learning_curve_df = format_scores_as_dataframe(train_sizes, train_scores, test_scores)

# train and test learning scores results
# No `scoring` is passed, so the values are the classifier's default score (accuracy);
# the y-axis is therefore labelled as accuracy, not error.
ax = sns.lineplot(x="labels", y="score", hue="type", data=learning_curve_df, marker="o", ci=None)
ax.set_title("Learning Curve for MLP Algorithm")
dev_null = ax.set(xlabel="Samples", ylabel="Accuracy")

A learning curve is a measurement of how well the model learns. It is obtained by recording the accuracy of the algorithm on the training data and on the held-out test data as the number of training samples grows. The two are plotted together to see whether they converge.

### Compute cross-validation curve

In [None]:
# Cross-validated score of the model on the full dataset (default number of folds).
scores = cross_val_score(mlp, X, y)

# Print the summary explicitly: a bare expression in the middle of a cell is never displayed.
print(f"scores={scores}, mean={scores.mean():.4f}, std={scores.std():.4f}")

# Number the folds from the result itself so the plot stays valid if the fold count changes.
fold_numbers = np.arange(1, len(scores) + 1)
dev_null = sns.lineplot(x=fold_numbers, y=scores)
dev_null.set_title("Cross Score Distribution")
dev_null = dev_null.set(xlabel="# of runs", ylabel="Accuracy")

The curve above shows the cross-validation scores for the default 5 runs in the cross-validation process for the MLP model.

In [None]:
# Cross-validated train/test scores for the configured model (not read again in the cells shown here).
cross_val_result = cross_validate(mlp, X, y, return_train_score=True)

#validation_curve(mlp, X, y, param_name="alpha", param_range=[0.0001, 0.001, 0.05])
# Sweep the hidden-layer layout and collect per-fold train/test scores for each one.
layer_options = ([5], [10], [10,5], [15, 10], [25,10,5])
train_scores, test_scores = validation_curve(mlp, X, y, param_name="hidden_layer_sizes", param_range=layer_options)

# Display labels, one per layout above and in the same order.
param_ranges = ["[5]", "[10]", "[10,5]", "[15,10]", "[25,10,5]"]
val_curve_data = {"labels": [], "type": [], "scores": []}

# Long format: a train row followed by a test row for every (layout, fold) pair.
for layout_label, fold_train, fold_test in zip(param_ranges, train_scores, test_scores):
    for train_score, test_score in zip(fold_train, fold_test):
        val_curve_data["labels"] += [layout_label, layout_label]
        val_curve_data["type"] += ["train", "test"]
        val_curve_data["scores"] += [train_score, test_score]

val_curve_df = pd.DataFrame(val_curve_data)

ax = sns.lineplot(data=val_curve_df, x="labels", y="scores", hue="type", marker="o", ci=None)
ax.set_title("Validation Curve for our MLP model")
dev_null = ax.set(xlabel="Layers/Neurons", ylabel="Accuracy Score")




Cross-validation is a measure of how well our model can generalize from what it learns, i.e. how well it will perform on data it has never seen before. This is done by holding out part of the data to later predict on and measure the accuracy. The data is split into different train/test folds in turn. The default in this case is k=5 folds.

### Compute confusion matrix

In [None]:
# Evaluate the held-out predictions of the fitted model.
confusion_mtrx = confusion_matrix(y_test, y_predicted)
classification_rprt = classification_report(y_test, y_predicted)
accuracy_scr = accuracy_score(y_test, y_predicted)

# Confusion matrix layout:
# TN FP
# FN TP
for heading, metric in (
    ("Confusion Matrix", confusion_mtrx),
    ("Classification Report", classification_rprt),
    ("Accuracy", accuracy_scr),
):
    print(heading)
    print(metric)

The confusion matrix shows the frequency for True Positives, True Negatives, False Positives, and False Negative. Also a summary of the different properties can be presented here, along with the accuracy for predicted values.

### AUC curve

In [None]:
# Computing AUC score
# AUC needs a continuous score: rank the test rows by the predicted probability of the
# positive class (Pass == 1) instead of the hard 0/1 predictions, so the number agrees
# with the ROC curve plotted below.
positive_class_proba = mlp.predict_proba(X_test)[:, 1]
roc = roc_auc_score(y_test, positive_class_proba)
print(f"ROC AUC: {roc:.3f}")
dev_null = plot_roc_curve(mlp, X_test, y_test, name="AUC/ROC Curve for MLP")

The higher the area under the curve for this graph, the better the model is at predicting values for a specific domain. The curve is traced by varying the decision threshold and measuring the true-positive rate against the false-positive rate at each setting.

### GridSearchCV

In [None]:
parameters = { # parameters commented to make running time shorter
    "hidden_layer_sizes": [[8], [5], [2]],#, [8,8], [8,5], [5,8], [5,2], [2,2], [8,5,2], [8,5,5], [13,8,4], [17,13,7]],
    "activation": ["identity", "logistic"],#, "tanh", "relu"], 
    "solver": ["lbfgs", "sgd"],#, "adam"], 
    "max_iter": [200, 500],#, 1000, 2000, 3000, 5000]
}

# Brace yourself, this will take a while
# The search gets its own seeded estimator: rebinding `mlp` to an unfitted model here would
# break re-running the evaluation cells above, and an unseeded MLP makes the winning
# configuration vary from run to run.
gs = GridSearchCV(MLPClassifier(random_state=1), parameters)
gs.fit(X_train, y_train)
# Best configuration found, refitted on the whole training split.
gs.best_estimator_

Running this overnight yielded the following configuration as the best one: MLPClassifier(activation='identity', hidden_layer_sizes=[2], max_iter=3000). We will therefore run a model and analysis for this configuration as well. 

A grid search will help us determine the optimal configurations to run our models. 

## Exploring MLP with different characteristics
Let us check the MLP using logistic as activation function.

In [None]:
# Rebuild the raw (un-encoded) feature frame and the target from `data`.
y = data["Pass"]
X = data.drop(columns=["Pass"])
X, y

In [None]:
# One-hot encode every categorical feature column (fit and transform in one step).
encoder = OneHotEncoder(handle_unknown="ignore")
X = encoder.fit_transform(X)

# Hold out 20% of the rows for testing, stratified on the target.
split = train_test_split(X, y, test_size=0.20, stratify=y, random_state=1)
X_train, X_test, y_train, y_test = split

# One hidden layer of 100 units with the logistic activation function.
mlp = MLPClassifier(
    activation="logistic",
    hidden_layer_sizes=[100],
    max_iter=10000,
    random_state=1,
    verbose=False,
)
mlp.fit(X_train, y_train)

y_predicted = mlp.predict(X_test)

y_predicted, y_test.to_numpy() # Todo compare

In [None]:
# Learning Curve | Complexity Curve

# Cross-validated train/test scores for growing training-set sizes.
train_sizes, train_scores, test_scores = learning_curve(mlp, X, y)

learning_curve_df = format_scores_as_dataframe(train_sizes, train_scores, test_scores)

# train and test learning scores results
# No `scoring` is passed, so the values are the classifier's default score (accuracy);
# the y-axis is therefore labelled as accuracy, not error.
ax = sns.lineplot(x="labels", y="score", hue="type", data=learning_curve_df, marker="o", ci=None)
ax.set_title("Learning Curve for MLP Algorithm")
dev_null = ax.set(xlabel="Samples", ylabel="Accuracy")

In [None]:
# Cross-validated score of the logistic-activation model on the full dataset.
scores = cross_val_score(mlp, X, y)

# Print the summary explicitly: a bare expression in the middle of a cell is never displayed.
print(f"scores={scores}, mean={scores.mean():.4f}, std={scores.std():.4f}")

# Number the folds from the result itself so the plot stays valid if the fold count changes.
fold_numbers = np.arange(1, len(scores) + 1)
dev_null = sns.lineplot(x=fold_numbers, y=scores)
dev_null.set_title("Cross Score Distribution")
dev_null = dev_null.set(xlabel="# of runs", ylabel="Accuracy")

In [None]:
# Cross-validated train/test scores for the configured model (not read again in the cells shown here).
cross_val_result = cross_validate(mlp, X, y, return_train_score=True)

#validation_curve(mlp, X, y, param_name="alpha", param_range=[0.0001, 0.001, 0.05])
# Sweep the hidden-layer layout and collect per-fold train/test scores for each one.
train_scores, test_scores = validation_curve(
    mlp, X, y,
    param_name="hidden_layer_sizes",
    param_range=([5], [10], [10,5], [15, 10], [25,10,5]),
)

# Display labels, one per layout above and in the same order.
param_ranges = ["[5]", "[10]", "[10,5]", "[15,10]", "[25,10,5]"]
val_curve_data = {"labels": [], "type": [], "scores": []}

# Long format: a train row followed by a test row for every (layout, fold) pair.
for row_index, layout_label in enumerate(param_ranges[:len(train_scores)]):
    for fold_train, fold_test in zip(train_scores[row_index], test_scores[row_index]):
        val_curve_data["labels"].extend((layout_label, layout_label))
        val_curve_data["type"].extend(("train", "test"))
        val_curve_data["scores"].extend((fold_train, fold_test))

val_curve_df = pd.DataFrame(val_curve_data)

ax = sns.lineplot(data=val_curve_df, x="labels", y="scores", hue="type", marker="o", ci=None)
ax.set_title("Validation Curve for our MLP model")
dev_null = ax.set(xlabel="Layers/Neurons", ylabel="Accuracy Score")

In [None]:
# Evaluate the held-out predictions of the logistic-activation model.
confusion_mtrx = confusion_matrix(y_test, y_predicted)
classification_rprt = classification_report(y_test, y_predicted)
accuracy_scr = accuracy_score(y_test, y_predicted)

# Confusion matrix layout:
# TN FP
# FN TP
report_sections = [
    ("Confusion Matrix", confusion_mtrx),
    ("Classification Report", classification_rprt),
    ("Accuracy", accuracy_scr),
]
for section_title, section_value in report_sections:
    print(section_title)
    print(section_value)

In [None]:
# Computing AUC score
# AUC needs a continuous score: rank the test rows by the predicted probability of the
# positive class (Pass == 1) instead of the hard 0/1 predictions, so the number agrees
# with the ROC curve plotted below.
positive_class_proba = mlp.predict_proba(X_test)[:, 1]
roc = roc_auc_score(y_test, positive_class_proba)
print(f"ROC AUC: {roc:.3f}")
dev_null = plot_roc_curve(mlp, X_test, y_test, name="AUC/ROC Curve for MLP")

### Running Grid Search suggested model.
MLPClassifier(activation='identity', hidden_layer_sizes=[2], max_iter=3000), very simplistic as you can see.

In [None]:
# Start again from the raw frame: split it into un-encoded features and the target.
X, y = data.drop(columns=["Pass"]), data["Pass"]
X, y

In [None]:
# One-hot encode every categorical feature column (fit and transform in one step).
encoder = OneHotEncoder(handle_unknown="ignore")
X = encoder.fit_transform(X)

# Hold out 20% of the rows for testing, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, stratify=y, test_size=0.20
)

# Single hidden layer of 2 units with the identity activation.
# NOTE(review): the configuration quoted in the markdown above does not name a solver,
# while this cell uses "sgd" - confirm which one was intended.
mlp = MLPClassifier(
    activation="identity",
    hidden_layer_sizes=[2],
    solver="sgd",
    max_iter=3000,
    random_state=1,
    verbose=False,
)
mlp.fit(X_train, y_train)

y_predicted = mlp.predict(X_test)

y_predicted, y_test.to_numpy() # Todo compare

In [None]:
# Learning Curve | Complexity Curve

# Cross-validated train/test scores for growing training-set sizes.
train_sizes, train_scores, test_scores = learning_curve(mlp, X, y)

learning_curve_df = format_scores_as_dataframe(train_sizes, train_scores, test_scores)

# train and test learning scores results
# No `scoring` is passed, so the values are the classifier's default score (accuracy);
# the y-axis is therefore labelled as accuracy, not error.
ax = sns.lineplot(x="labels", y="score", hue="type", data=learning_curve_df, marker="o", ci=None)
ax.set_title("Learning Curve for MLP Algorithm")
dev_null = ax.set(xlabel="Samples", ylabel="Accuracy")

In [None]:
# Cross-validated score of the grid-search-suggested model on the full dataset.
scores = cross_val_score(mlp, X, y)

# Print the summary explicitly: a bare expression in the middle of a cell is never displayed.
print(f"scores={scores}, mean={scores.mean():.4f}, std={scores.std():.4f}")

# Number the folds from the result itself so the plot stays valid if the fold count changes.
fold_numbers = np.arange(1, len(scores) + 1)
dev_null = sns.lineplot(x=fold_numbers, y=scores)
dev_null.set_title("Cross Score Distribution")
dev_null = dev_null.set(xlabel="# of runs", ylabel="Accuracy")

In [None]:
# Cross-validated train/test scores for the configured model (not read again in the cells shown here).
cross_val_result = cross_validate(mlp, X, y, return_train_score=True)

# This sweep varies the `alpha` parameter of the model, not the layer layout.
alpha_values = [0.1, 5, 10]
train_scores, test_scores = validation_curve(mlp, X, y, param_name="alpha", param_range=alpha_values)
#train_scores, test_scores = validation_curve(mlp, X, y, param_name="hidden_layer_sizes", param_range=([2], [7], [2,2], [7, 2], [10,7,2]))

val_curve_data = {"labels": [], "type": [], "scores": []}
# Derive the labels from the swept values so each point is tagged with the alpha that
# produced it; strings keep the x-axis categorical (evenly spaced points).
param_ranges = [str(alpha) for alpha in alpha_values]

# Long format: a train row followed by a test row for every (alpha, fold) pair.
for alpha_label, fold_train, fold_test in zip(param_ranges, train_scores, test_scores):
    for train_score, test_score in zip(fold_train, fold_test):
        val_curve_data["labels"].append(alpha_label)
        val_curve_data["type"].append("train")
        val_curve_data["scores"].append(train_score)
        val_curve_data["labels"].append(alpha_label)
        val_curve_data["type"].append("test")
        val_curve_data["scores"].append(test_score)

val_curve_df = pd.DataFrame.from_dict(val_curve_data)

ax = sns.lineplot(x="labels", y="scores", hue="type", data = val_curve_df, marker="o", ci=None)
ax.set_title("Validation Curve for our MLP model")
dev_null = ax.set(xlabel="alpha", ylabel="Accuracy Score")

In [None]:
# Evaluate the held-out predictions of the grid-search-suggested model.
confusion_mtrx = confusion_matrix(y_test, y_predicted)
classification_rprt = classification_report(y_test, y_predicted)
accuracy_scr = accuracy_score(y_test, y_predicted)

# Confusion matrix layout:
# TN FP
# FN TP
evaluation = {
    "Confusion Matrix": confusion_mtrx,
    "Classification Report": classification_rprt,
    "Accuracy": accuracy_scr,
}
for section_title in evaluation:
    print(section_title)
    print(evaluation[section_title])

In [None]:
# Computing AUC score
# AUC needs a continuous score: rank the test rows by the predicted probability of the
# positive class (Pass == 1) instead of the hard 0/1 predictions, so the number agrees
# with the ROC curve plotted below.
positive_class_proba = mlp.predict_proba(X_test)[:, 1]
roc = roc_auc_score(y_test, positive_class_proba)
print(f"ROC AUC: {roc:.3f}")
dev_null = plot_roc_curve(mlp, X_test, y_test, name="AUC/ROC Curve for MLP")