# 0. Establishing the goal
<html>
    <h1 style='background: #D3D3D3;  color:black; font-size:45px; padding:10px; border:8px solid black;'><center><b>🏁 Goal</b></center></h1>
</html>

In this notebook, we will explore the [Students Performance in Exams dataset](https://www.kaggle.com/spscientist/students-performance-in-exams), which contains 1000 student records.  
The main goal is to establish a **gender prediction** with ML based on the available features and to draw conclusions about which features are the most important for this prediction.

# 1.Importing
<html>
    <h1 style='background: #D3D3D3;  color:black; font-size:45px; padding:10px; border:8px solid black;'><center><b>📚 Importing Libraries and Data</b></center></h1>
</html>

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

import matplotlib.pyplot as plt
import seaborn as sns

# turn off warnings for final notebook
import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)
%matplotlib inline
sns.set_context('notebook')
sns.set_palette('Set2')
sns.set_style('darkgrid')

In [None]:
# Read the students' performance CSV into `df` and preview the first five rows.
df = pd.read_csv(
    '../input/students-performance-in-exams/StudentsPerformance.csv'
)
df.head(5)

# 2.Exploring and Preparing
<html>
    <h1 style='background: #D3D3D3;  color:black; font-size:45px; padding:10px; border:8px solid black;'><center><b>🔎 Exploring and Preparing</b></center></h1>
</html>

In [None]:
# Column dtypes and non-null counts, followed by the number of missing
# values per column (largest first).
df.info()
missing_per_column = df.isna().sum()
missing_per_column.sort_values(ascending=False)

* No apparent missing data.
* The datatypes seem to be correct.

<html>
    <h1 style='background: #D3D3D3;  color:black; font-size:30px; padding:10px; border:8px solid black;'><center><b>Numerical Variables</b></center></h1>
</html>

In [None]:
# Keep only the numeric columns (the three exam scores) and list their names.
num = df.select_dtypes(include='number')
num.columns

In [None]:
# Box plot with one box per numeric column.
fig, ax = plt.subplots(figsize=(10, 5))

# patch_artist=True returns fillable patches so each box can be coloured.
bplot = ax.boxplot(num, labels=num.columns, patch_artist=True)

colors = ['blue', 'red', 'green']
for face, box in zip(colors, bplot['boxes']):
    box.set_facecolor(face)

At the boxplot, we see some outliers, but nothing unexpected considering a school exam.

In [None]:
# Frequency distribution of every exam score: one histogram per column with a
# vertical line at the mean and a small box showing mean and standard deviation.
fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(20, 5), sharey=True, tight_layout=True)

bin_num = 30
colors = ['blue', 'red', 'green']
props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)

for i in range(0, 3):
    scores = num[num.columns[i]]
    mu = scores.mean()
    sigma = scores.std()

    n, bins, patches = axs[i].hist(scores, bins=bin_num, color=colors[i])
    axs[i].set_title(num.columns[i], size=20, fontweight='bold')
    axs[i].axvline(x=mu)

    textstr = '\n'.join((
        r'$\mu=%.2f$' % (mu, ),
        r'$\sigma=%.2f$' % (sigma, )))
    # Position the box in THIS panel's axes coordinates (top-left corner).
    # The previous version used the transform of `ax`, i.e. the axes of the
    # box plot drawn in an earlier cell, so the box was not tied to the panel
    # it describes.
    axs[i].text(0.03, 0.95, textstr, transform=axs[i].transAxes, fontsize=14,
                verticalalignment='top', bbox=props)

axs[0].set_ylabel('Frequency', size=20, fontweight='bold');

The frequency distribution also doesn't show anything unexpected, but there seems to be some bias towards a score of 70 in all exams.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the exam scores.
num.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# Annotated heat map of the pairwise correlations between the exam scores.
fig, ax = plt.subplots(figsize=(10, 8))
score_corr = num.corr()
sns.heatmap(score_corr, ax=ax, annot=True)
ax.set_title("Correlation Matrix\n", size=20, fontweight='bold');

Here, we see the correlation matrix of the numerical variables, namely the exam scores.  
We observe a very strong correlation between "writing score" and "reading score".  
We also see a strong correlation between "writing score" and "math score".  
It can be concluded that the performance of the students tends to be rather linear: if they have a higher (or lower) score in one domain, the scores in the other domains are expected to be higher (or lower) as well.

In [None]:
# Joint density of the writing score against the reading and the math score.
# Each pair gets its own panel: with fill=True and thresh=0 a density fills its
# whole grid, so drawing both on the same axes let the second plot paint over
# the first one and replace its x-axis label.
fig, (ax_reading, ax_math) = plt.subplots(1, 2, figsize=(14, 7), sharey=True)

kde_kws = dict(levels=50, color="b", thresh=0, cmap="rocket", fill=True)
sns.kdeplot(data=num, x="reading score", y="writing score", ax=ax_reading, **kde_kws)
sns.kdeplot(data=num, x="math score", y="writing score", ax=ax_math, **kde_kws)

ax_reading.set_title("Writing vs. reading score")
ax_math.set_title("Writing vs. math score");

In [None]:
# Pair plot of the numeric columns with filled 2-D densities drawn on the
# lower triangle.
with sns.axes_style("white"):
    # `fill` replaces the deprecated `shade` keyword of kdeplot; the rest of
    # this cell already uses `fill`, so the diagonal now follows the same API.
    g = sns.pairplot(df, diag_kind="kde", height=4, corner=True,
                     diag_kws={"linewidth": 0, "fill": False})
    g.map_lower(sns.kdeplot, levels=50, color="b", thresh=0, cmap="rocket", fill=True)

In [None]:
# Pair plot of the scores coloured by gender; the lower triangle additionally
# gets a filled density that ignores the hue split (hue=None).
g = sns.pairplot(df, hue='gender', diag_kind="kde", corner=False, height=4)

lower_kde_kws = dict(levels=50, hue=None, thresh=0, cmap="rocket", fill=True)
g.map_lower(sns.kdeplot, **lower_kde_kws)

# Figure-level title placed slightly above the grid.
g.fig.text(0.33, 1.02, 'Distribution of Test Scores by Gender', fontsize=20)

Here we see the distribution of the scores, now also with respect to the gender of the students.
* There is overlap among the distributions, but we can see that the female students have a better mean performance in writing and reading; the male students, on the other hand, have a slight edge in math.
* When combined, the writing and math scores form distinct areas when it comes to gender. Those will probably be relevant predictors in the model.
* Due to the overlap, it is important to check the statistical significance of the differences, especially with regard to math scores, where the distributions seem to be somewhat similar.

In [None]:
from scipy.stats import ttest_ind

# Two-sample t-test on the math scores of female vs. male students.
# A t-test compares the group MEANS (not the whole distributions):
#   H0: both groups have the same mean math score.
#   Fail to reject H0 when p > alpha, reject H0 otherwise.
math_female = df.loc[df['gender'] == 'female', 'math score']
math_male = df.loc[df['gender'] == 'male', 'math score']

stat, p = ttest_ind(math_female, math_male)
print('Statistics=%.3f, p=%.3f \n' % (stat, p))

# interpret
alpha = 0.05
print("Comparison between the math score of male and female students: ")
if p > alpha:
    print('No significant difference between the mean scores (fail to reject H0)')
else:
    print('Significantly different mean scores (reject H0)')

<html>
    <h1 style='background: #D3D3D3;  color:black; font-size:30px; padding:10px; border:8px solid black;'><center><b>Categorical Variables</b></center></h1>
</html>

In [None]:
# Everything that is not numeric is treated as a categorical variable.
cat = df.select_dtypes(exclude='number')
print("List of categorical variables:")
cat.columns

In [None]:
def label_function(val):
    """Build the label of a pie wedge.

    `val` is the percentage handed over by matplotlib's `autopct`. The label
    shows the absolute number of rows (derived from `len(df)`) on the first
    line and the rounded percentage on the second line.
    """
    return f'{val / 100 * len(df):.0f}\n{val:.0f}%'

# One pie chart per categorical column, laid out on a 3x2 grid.
fig, axs = plt.subplots(3, 2, figsize=(15, 15))
panels = axs.ravel()

# Iterate over the columns that actually exist instead of a hard-coded count.
for pie_ax, column in zip(panels, cat.columns):
    cat.groupby(column).size().plot(kind='pie', autopct=label_function,
                                    textprops={'fontsize': 13}, cmap='tab20c', ax=pie_ax)
    pie_ax.set_title(column, size=20, fontweight='bold')
    pie_ax.set_ylabel(None)

# Explicitly hide the grid cells that received no chart, rather than relying
# on whichever axes pyplot currently considers "current".
for unused_ax in panels[len(cat.columns):]:
    unused_ax.axis('off')

plt.tight_layout()
plt.show()

Here we see the distribution of the students among the categorical variables. No anomalies perceived.

<html>
    <h1 style='background: #D3D3D3;  color:black; font-size:30px; padding:10px; border:8px solid black;'><center><b>Categorical X Numerical Variables</b></center></h1>
</html>

The following plot allows us to have a bird's-eye view of the distribution of the numerical variables along the categorical variables. The main point is to detect anomalies or behaviours of interest.

In [None]:
# Grid of violin plots: one row per categorical variable, one column per score.
fig, ax = plt.subplots(5, 3, figsize=(30, 20))
for i in range(5):
    category_values = cat[cat.columns[i]]
    for j in range(3):
        score_values = num[num.columns[j]]
        sns.violinplot(data=df, x=category_values, y=score_values, ax=ax[i, j])

After the general view, we can inspect particularly interesting plots with the following graph.
Let's take, for instance, the writing scores and the parental level of education:

In [None]:
# Writing score per parental education level: violins with the individual
# observations overlaid as a swarm, ordered from lowest to highest education.
fig, ax = plt.subplots(figsize=(15, 10))

order = [
    'some high school',
    'high school',
    'some college',
    "associate's degree",
    "bachelor's degree",
    "master's degree",
]
edu_order = order

# Arguments shared by both layers.
layer_kws = dict(x="parental level of education", y="writing score", data=df, order=edu_order)
ax = sns.violinplot(inner=None, **layer_kws)
ax = sns.swarmplot(color="white", edgecolor="gray", **layer_kws)

# Arrow-shaped annotation near the bottom of the plot (data coordinates).
arrow_box = dict(boxstyle="rarrow,pad=0.3", fc="#CBC3E3", ec="#301934", lw=1)
t = ax.text(
    0.2, 2, "Small improvement in the writing scores along parental education",
    ha="left", va="center", rotation=0, size=13, bbox=arrow_box)

Here we can see a small but clear improvement in the writing scores as the parental level of education increases.

In [None]:
# Math-score histograms: one panel per race/ethnicity group, coloured by gender.
g = sns.FacetGrid(data=df, col="race/ethnicity", hue="gender")
g.map(sns.histplot, "math score", alpha=0.5).add_legend()

# 3.Processing
<html>
    <h1 style='background: #D3D3D3;  color:black; font-size:45px; padding:10px; border:8px solid black;'><center><b>🛠 Processing</b></center></h1>
</html>

In [None]:
# Encode the raw columns so they can be fed to a model. Three groups of
# features are treated differently: ordinal categorical features, non-ordinal
# categorical features and numerical features.

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler

cat = df.select_dtypes(exclude=np.number)
num = df.drop(columns=cat.columns)  # Numerical features
print("Number of unique values per categorical feature:\n", cat.nunique())

ordinal_columns = ['parental level of education', 'gender']
cat_ord = cat[ordinal_columns]           # ordinal categorical features
cat = cat.drop(columns=ordinal_columns)  # non-ordinal categorical features


# Encoding ordinal categorical features

# define order (the position in each list becomes the encoded value)
order_1 = [
        'some high school',
        'high school',
        'some college',
        "associate's degree",
        "bachelor's degree",
        "master's degree"]
order_2 = ['male', 'female']

# define ordinal encoding
encoder = OrdinalEncoder(categories=[order_1, order_2])
# transform data
encoder.fit(cat_ord)
cat_ord_encoded = pd.DataFrame(encoder.transform(cat_ord),
                               columns=ordinal_columns, index=df.index)


# Encoding non-ordinal categorical features

# The encoder is left at its default (sparse) output and densified by hand:
# the keyword that switches to dense output was renamed between scikit-learn
# releases, whereas `.toarray()` works on all of them.
enc = OneHotEncoder().fit(cat)
cat_encoded = pd.DataFrame(enc.transform(cat).toarray(), index=df.index)
# `get_feature_names` was superseded by `get_feature_names_out`; use whichever
# the installed scikit-learn provides.
feature_names = getattr(enc, 'get_feature_names_out', None) or enc.get_feature_names
cat_encoded.columns = feature_names(cat.columns)

# Numerical features will be standardized.
# A new float DataFrame is built instead of writing the scaled floats into the
# original integer columns through `.iloc`, which depends on pandas' implicit
# dtype upcasting.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split performed later — confirm this is acceptable.
num = pd.DataFrame(StandardScaler().fit_transform(num),
                   columns=num.columns, index=num.index)

In [None]:
# Join the one-hot, ordinal and scaled numeric parts column-wise into one frame.
encoded_parts = (cat_encoded, cat_ord_encoded, num)
df2 = pd.concat(encoded_parts, axis='columns')
df2.head()

Now the dataset is ready to be inserted in a model

In [None]:
from sklearn.model_selection import train_test_split

# Target: encoded gender. Features: every other column of df2.
y = df2['gender']
X = df2.drop(columns=['gender'])

# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=23
)

Here, we split the dataset so we can test it for accuracy after modelling.

# 4.Prediction
<html>
    <h1 style='background: #D3D3D3;  color:black; font-size:45px; padding:10px; border:8px solid black;'><center><b>🔮 Prediction</b></center></h1>
</html>

## 4.1 Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit one random forest per n_estimators value in 1..99 and record its
# accuracy on the test split; score_list[k] belongs to n_estimators == k + 1.
#
# `bootstrap` must be a real boolean. The previous value was the string
# "False", which is truthy: scikit-learn versions that do not validate the
# parameter therefore bootstrapped anyway, and versions that do validate it
# reject the string. `True` keeps the behaviour the models actually had.
# NOTE(review): n_estimators is selected on the test split, so the best score
# is an optimistic estimate — consider a validation split or cross-validation.
score_list = []
for each in range(1, 100):
    rf = RandomForestClassifier(n_estimators=each, random_state=7, bootstrap=True,
                                criterion="gini",
                                min_samples_split=10, min_samples_leaf=1)
    rf.fit(x_train, y_train)
    score_list.append(rf.score(x_test, y_test))

rf_max = np.max(score_list)
print("RF Max Score : ", rf_max)

In [None]:
# Test accuracy as a function of the number of trees.
# score_list[k] was obtained with n_estimators == k + 1, so the x values are
# given explicitly; plotting the bare list would label the curve 0..98.
plt.subplots(figsize=(7, 7))
plt.plot(range(1, len(score_list) + 1), score_list)
plt.title("Accuracy x estimators\n", size=20, fontweight='bold')
plt.xlabel("n_estimators")
plt.ylabel("Accuracy on the test split");

**We can expect the random forest Classifier to have an accuracy around 85%.**

In [None]:
# Training with the best number of estimators, then drawing the raw and the
# row-normalised confusion matrix on the test split.

from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# Kept only so that later cells which still call `plot_confusion_matrix`
# keep working on scikit-learn releases that provide it.
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    pass

# score_list[k] belongs to n_estimators == k + 1
best = score_list.index(max(score_list)) + 1

# `bootstrap` must be a boolean; the former string "False" was truthy, so
# `True` is what the model effectively used.
rf = RandomForestClassifier(n_estimators=best, random_state=7, bootstrap=True,
                            criterion="gini",
                            min_samples_split=10, min_samples_leaf=1)
rf.fit(x_train, y_train)

y_pred = rf.predict(x_test)

with sns.axes_style("white"):
    titles_options = [("Confusion matrix, without normalization", None),
                      ("Normalized confusion matrix", 'true')]
    class_names = ['Male', 'Female']  # encoded as 0 and 1 respectively
    for title, normalize in titles_options:
        fig, ax = plt.subplots(figsize=(7, 7))
        # `plot_confusion_matrix` was removed from scikit-learn; computing the
        # matrix and handing it to ConfusionMatrixDisplay yields the same plot.
        matrix = confusion_matrix(y_test, y_pred, normalize=normalize)
        disp = ConfusionMatrixDisplay(confusion_matrix=matrix,
                                      display_labels=class_names)
        disp.plot(cmap='rocket', ax=ax)
        disp.ax_.set_title(title)

        print(title)
        print(disp.confusion_matrix)

plt.show()

The confusion matrix shows us that, given the data available, it is easier to predict female students (90% accuracy). Male students are more often falsely labelled as female (17%).

## 4.2 Support Vector Machine

In [None]:
from sklearn.svm import SVC

# RBF-kernel support vector classifier, scored by accuracy on the test split.
svm1 = SVC(kernel="rbf", C=500, gamma=0.01).fit(x_train, y_train)
svm1_score = svm1.score(x_test, y_test)
print("SVM Max Score = : ", svm1_score)

In [None]:
# Raw and row-normalised confusion matrix of the SVM on the test split.
# The imports live in this cell so it no longer depends on an import made in
# an earlier cell; `plot_confusion_matrix` itself was removed from
# scikit-learn, so the matrix is computed and passed to ConfusionMatrixDisplay.
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

svm_pred = svm1.predict(x_test)

with sns.axes_style("white"):
    titles_options = [("Confusion matrix, without normalization", None),
                      ("Normalized confusion matrix", 'true')]
    class_names = ['Male', 'Female']  # encoded as 0 and 1 respectively
    for title, normalize in titles_options:
        fig, ax = plt.subplots(figsize=(7, 7))
        matrix = confusion_matrix(y_test, svm_pred, normalize=normalize)
        disp = ConfusionMatrixDisplay(confusion_matrix=matrix,
                                      display_labels=class_names)
        disp.plot(cmap='rocket', ax=ax)
        disp.ax_.set_title(title)

        print(title)
        print(disp.confusion_matrix)

plt.show()

**With the support vector machine model, we can expect an accuracy of 89.5%.**

# 5.Relative Relevance of features
<html>
    <h1 style='background: #D3D3D3;  color:black; font-size:45px; padding:10px; border:8px solid black;'><center><b>🥇 Most relevant features</b></center></h1>
</html>

In [None]:
import eli5
from eli5.sklearn import PermutationImportance

# Permutation importance of the fitted random forest, measured on the test
# split; the seven highest-weighted features are displayed.
perm = PermutationImportance(rf, random_state=1)
perm.fit(x_test, y_test)
eli5.show_weights(perm, top=7, feature_names=list(x_test.columns))

To establish the relative relevance of the features in predicting the gender of the student, we are going to fit a Random Forest Regressor, so that the target variable becomes continuous. Afterwards, we are going to apply **SHAP (SHapley Additive exPlanations)** to rank the features.

In [None]:
from sklearn.ensemble import RandomForestRegressor

# score_list[k] belongs to n_estimators == k + 1
best = score_list.index(max(score_list)) + 1

# Regression forest on the encoded gender, used as input for SHAP.
# * `bootstrap` must be a boolean; the former string "False" was truthy, so
#   `True` is what the model effectively used.
# * `criterion` is left at its default (squared error): the explicit name
#   "mse" is not accepted by every scikit-learn release.
rf_reg = RandomForestRegressor(n_estimators=best, random_state=7, bootstrap=True,
                               min_samples_split=10, min_samples_leaf=1)
rf_reg.fit(x_train, y_train)

In [None]:
import shap

# Build an explainer for the regression forest and compute the SHAP values
# of every row in the test split.
ex = shap.Explainer(rf_reg)
shap_val = ex(x_test)



In [None]:
# Bar chart of the SHAP values per feature; show=False keeps the figure open
# so a title can still be added.
shap.plots.bar(shap_val, show=False)
plt.title('Mean SHAP value per feature\n Gender Analysis', fontweight='bold', size=20)

Here we see that the most important variables when predicting the target feature, namely "gender", are the writing, math and reading scores.  
All other variables are dispensable.

In [None]:
# Beeswarm of the SHAP values for the gender model, with two arrows that
# indicate which side of zero pushes towards which class.
plt.title('SHAP summary for Gender prediction', size=20)
shap.plots.beeswarm(shap_val, show=False, max_display=5)

fig = plt.gcf()
fig.set_size_inches(12, 7)

ax = plt.gca()
ax.set_xlabel(r'Average SHAP values', fontsize=16)
ax.set_ylabel('Parameters', fontsize=16)
leg = ax.legend()

# (x, label, horizontal alignment, arrow style) of each annotation
arrows = [
    (0.05, "Predict female", "left", "rarrow,pad=0.3"),
    (-0.05, "Predict male", "right", "larrow,pad=0.3"),
]
for x_pos, label, align, style in arrows:
    t = ax.text(
        x_pos, -0.6, label, ha=align, va="center", rotation=0, size=13,
        bbox=dict(boxstyle=style, fc="#CBC3E3", ec="#301934", lw=1))
plt.show()

This beeswarm graph shows the dispersion of the SHAP values along the variables. As expected, the three test scores contribute with higher absolute values, giving more certainty to the model prediction.

In [None]:
# Load the JavaScript needed by the interactive plot, then draw the force plot
# of the test row at position 10.
shap.initjs()
row_explanation = shap_val[10]
shap.plots.force(row_explanation)

In [None]:
class color:
    """ANSI escape sequences used to style text written with print()."""
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    END = '\033[0m'

# Position (within the test split) of the individual shown in the force plot.
position = 10
# The identifier is read from the index instead of being typed in by hand, so
# the heading always matches the row that is printed below it.
individual = x_test.index[position]

print(color.BOLD + f"Individual number {individual}\n" + color.END)
print(x_test.iloc[position])
# Map the encoded target back to its label (1 -> female, 0 -> male).
print(color.BOLD + y_test.map({1: 'female', 0: "male"}).iloc[position] + color.END)

Here we apply the SHAP force plot to a single individual. This allows us to follow the path the model takes and helps us understand why the top three variables are decisive for the prediction.

In [None]:
# SHAP dependence plot for the writing score, computed with a tree explainer
# on the test split and resized to 10 x 7 inches.
explainer = shap.TreeExplainer(rf_reg)
shap_values = explainer.shap_values(x_test)
shap.dependence_plot("writing score", shap_values, x_test, show=False)

fig = plt.gcf()
fig.set_size_inches(10, 7)

<html>
    <h1 style='background: #D3D3D3;  color:black; font-size:45px; padding:10px; border:8px solid black;'><center><b>😉 Upvote if you liked the content.</b></center></h1>
</html>

## BONUS: This method can also be applied to predict numerical values
<html>
    <h1 style='background: #D3D3D3;  color:black; font-size:45px; padding:10px; border:8px solid black;'><center><b>Math Score Prediction</b></center></h1>
</html>

In [None]:
# Same split as before, but the (standardised) math score is now the target
# and every other column of df2 is a feature.
y = df2['math score']
X = df2.drop(columns=['math score'])

x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=23
)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# NOTE(review): at this point `score_list` still holds the results of an
# earlier sweep, not of a sweep over the math-score target — confirm that
# reusing its best n_estimators here is intended.
best = score_list.index(max(score_list)) + 1

# * `bootstrap` must be a boolean; the former string "False" was truthy, so
#   `True` is what the model effectively used.
# * `criterion` is left at its default (squared error): the explicit name
#   "mse" is not accepted by every scikit-learn release.
rf_reg = RandomForestRegressor(n_estimators=best, random_state=7, bootstrap=True,
                               min_samples_split=10, min_samples_leaf=1)
rf_reg.fit(x_train, y_train)

In [None]:
# Fit one regression forest per n_estimators value in 1..99 and record its
# score on the test split; score_list[k] belongs to n_estimators == k + 1.
#
# * `bootstrap` must be a boolean; the former string "False" was truthy, so
#   `True` is what the models effectively used.
# * `criterion` is left at its default (squared error): the explicit name
#   "mse" is not accepted by every scikit-learn release.
score_list = []
for each in range(1, 100):
    rf = RandomForestRegressor(n_estimators=each, random_state=7, bootstrap=True,
                               min_samples_split=10, min_samples_leaf=1)
    rf.fit(x_train, y_train)
    score_list.append(rf.score(x_test, y_test))

rf_max = np.max(score_list)
print("RF Max Score : ", rf_max)

In [None]:
# Test score of the regression forest as a function of the number of trees.
# score_list[k] was obtained with n_estimators == k + 1, so the x values are
# given explicitly; plotting the bare list would label the curve 0..98.
plt.subplots(figsize=(7, 7))
plt.plot(range(1, len(score_list) + 1), score_list)
plt.title("Score x estimators\n", size=20, fontweight='bold')
plt.xlabel("n_estimators")
plt.ylabel("Score on the test split");

In [None]:
# Refit the regression forest with the best number of trees found by the sweep
# (score_list[k] belongs to n_estimators == k + 1).
best = score_list.index(max(score_list)) + 1

# `criterion` is left at its default (squared error): the explicit name "mse"
# is not accepted by every scikit-learn release.
rf = RandomForestRegressor(n_estimators=best, random_state=7,
                           min_samples_split=10, min_samples_leaf=1)
rf.fit(x_train, y_train)

In [None]:
import shap

# SHAP values of the math-score forest for every row of the test split.
ex = shap.Explainer(rf)
shap_val = ex(x_test)

# Beeswarm of those values, with two arrows that indicate which side of zero
# pushes the prediction up or down.
plt.title('SHAP summary for math score prediction', size=16)
shap.plots.beeswarm(shap_val, show=False, max_display=5)

fig = plt.gcf()
fig.set_size_inches(12, 7)

ax = plt.gca()
ax.set_xlabel(r'Average SHAP values', fontsize=16)
ax.set_ylabel('Parameters', fontsize=16)
leg = ax.legend()

# (x, label, horizontal alignment, arrow style) of each annotation
arrows = [
    (0.05, "Predict higher math score", "left", "rarrow,pad=0.3"),
    (-0.05, "Predict lower math score", "right", "larrow,pad=0.3"),
]
for x_pos, label, align, style in arrows:
    t = ax.text(
        x_pos, -0.6, label, ha=align, va="center", rotation=0, size=13,
        bbox=dict(boxstyle=style, fc="#CBC3E3", ec="#301934", lw=1))
plt.show()

In [None]:
# Decision plot for the first five rows of the test split.
select = range(5)
features = x_test.iloc[select]
features_display = x_test.loc[features.index]

base_value = ex.expected_value
row_shap_values = ex.shap_values(features)[:5]
shap.decision_plot(base_value, row_shap_values, features_display)

In [None]:
# Force plot of the test row at position 2.
row_explanation = shap_val[2]
shap.plots.force(row_explanation, figsize=(10, 5))

In [None]:
class color:
    """ANSI escape sequences used to style text written with print()."""
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    END = '\033[0m'

# One single individual is inspected. Its position in the test split is set
# once and its identifier is read from the index, so the prediction, the
# features and the true score always refer to the same row. Previously the
# prediction was taken from row 16 but un-scaled from row 10, and the
# identifiers were typed in by hand.
position = 16
individual = x_test.index[position]

# Statistics needed to undo the standardisation of the math score.
# StandardScaler divides by the population standard deviation (ddof=0), not
# by the sample standard deviation reported by describe().
math_mean = df['math score'].mean()
math_std = df['math score'].std(ddof=0)

predicted_scaled = rf.predict(x_test.iloc[position:position + 1])

print(color.BOLD + f"Individual number {individual}\n" + color.END)
print(predicted_scaled)                               # prediction, standardised
print(math_mean + predicted_scaled[0] * math_std)     # prediction, original scale


print(x_test.iloc[position])
# True math score: read from the raw data, then rebuilt from the standardised target.
print(color.BOLD + str(df.loc[individual, 'math score']) + color.END)
print(color.BOLD + str(math_mean + y_test.iloc[position] * math_std) + color.END)