# In this notebook:

## a realistic dataset: predicting academic performance
- we use Pandas dataframes to handle the training set

## - multi-class problem

## - > 2 features

## - Overfitting

## - Cross-validation

## - Ensemble methods to mitigate overfitting in decision trees: Random Forests

In [None]:
import matplotlib.pyplot as plt

import pandas as pd
import seaborn as sns

from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import (
    cross_val_score,
    GridSearchCV,
    train_test_split,
)
from sklearn.preprocessing import StandardScaler
from sklearn.svm import (
    LinearSVC,
    SVC,
)
from sklearn.tree import DecisionTreeClassifier

In [None]:
# fixed random seed, reused wherever a random_state is needed
seed = 10
# name of the target column we want to predict
pred_feat = 'Class'
# cleaned dataset; the first CSV row holds the column names
df = pd.read_csv('../data/Academic/academic_performance_clean.csv', header=0)

## the dataset has the following attributes:

In [None]:
# list the column (attribute) names of the loaded dataframe
df.columns

In [None]:
# preview the first 10 rows
df.head(10)

## Some attributes are numerical, others are categorical, as follows.

In [None]:
# attributes holding discrete labels
categorical = [
    'Gender',
    'Nationality',
    'StageID',
    'GradeID',
    'SectionID',
    'Topic',
    'Semester',
    'Relation',
    'ParentAnsweringSurvey',
    'ParentSchoolSatisfaction',
    'StudentAbsenceDays',
]

# attributes holding numeric measurements
numerical = [
    'RaisedHands',
    'VisitedResources',
    'AnnouncementsView',
    'Discussion',
]

Here are the codes for the categorical attributes:

In [None]:
# one line per categorical attribute: its name and the distinct values it takes
print(*(f"{v}: {df[v].unique()}" for v in categorical), sep="\n")

## we have one distinguished variable, `Class`, which is the category we will want to predict given all other variables

## for categorical variables, we can plot the counts of each value:

In [None]:
# for every categorical attribute, draw one count plot per Class value
for v in categorical:
    # fix the bar order so the panels of a grid are comparable
    levels = sorted(df[v].unique())
    g = sns.FacetGrid(df, col="Class")
    g.map(sns.countplot, v, order=levels)

### Some of these deserve better plots:

In [None]:
# counts per nationality, with vertical tick labels so the names stay readable
g = sns.countplot(data=df, x="Nationality")
tick_labels = g.get_xticklabels()
g.set_xticklabels(tick_labels, rotation=90)
plt.show()

In [None]:
# counts per topic, with vertical tick labels so the names stay readable
g = sns.countplot(data=df, x="Topic")
tick_labels = g.get_xticklabels()
g.set_xticklabels(tick_labels, rotation=90)
plt.show()

In [None]:
# counts per grade, bars sorted by grade identifier
grade_order = sorted(df['GradeID'].unique())
g = sns.countplot(data=df, x="GradeID", order=grade_order)
tick_labels = g.get_xticklabels()
g.set_xticklabels(tick_labels, rotation=90)
plt.show()

## summary statistics for numerical variables:

In [None]:
# summary statistics of the numerical attributes only
df[numerical].describe()

## for numerical variables, we can plot their distribution across the dataset:

In [None]:
# one histogram per numerical attribute, over the whole dataset
for v in numerical:
    g = sns.FacetGrid(df).map(sns.histplot, v)

## it is also interesting to look at the distributions of numerical variables across the three Class outcomes, possibly further aggregated, eg by Gender:

In [None]:
# bar plot of each numerical attribute per Gender, one panel per Class value
gender_order = ["Male", "Female"]
for v in numerical:
    g = sns.FacetGrid(df, col="Class")
    g.map(sns.barplot, "Gender", v, order=gender_order)

## we can also easily switch the type of plots to achieve different visualisations:

hint: try replacing the `kind` with one of `box`, `boxen`, `violin`, `point`, `bar`, `swarm`

In [None]:
# distribution of each numerical attribute per Class, split by Gender;
# change `kind` to get a different type of plot
for v in numerical:
    g = sns.catplot(
        data=df,
        x="Class",
        y=v,
        hue='Gender',
        kind="box",
    )

# one-hot encoding 
### classifiers require that categorical variables be encoded in a specific way.

we are going to create *dummy variables* for each of these variables. 
we can either:

- assign a numeric value to each categorical value in a set, eg GradeID
- generate one new column for each value of a variable, see eg Nationality

In [None]:
# attributes replaced by one indicator column per distinct value
col = ["Nationality", "SectionID", "Topic"]

for c in col:
    # append the indicator columns, then remove the original attribute
    df = pd.concat((df, pd.get_dummies(df[c])), axis=1).drop(columns=[c])



- Gender: 0 Female, 1 Male
- StageID: 0 Lower, 1 Middle, 2 High
- GradeID: 0 - 11
- Semester: 0 First, 1 Second
- Relation: 0 Mother, 1 Father
- ParentAnswering: 0 No, 1 Yes
- ParentSchool: 0 Bad, 1 Good
- Class: 0 L, 1 M, 2 H
- Topic: 
- StudentAbsence: 0 Under-7, 1 Above-7
- Nationality:  

here is the result:

In [None]:
## reload the dataset ready for processing
# read the pre-encoded CSV; this replaces the dataframe built in the cells above
df = pd.read_csv('../data/Academic/academic_onehot.csv', header=0)
# (number of rows, number of columns) of the reloaded dataframe
df.shape

In [None]:
# column names of the reloaded dataframe
df.columns

In [None]:
# preview the first rows of the reloaded dataframe
df.head()

### values for each of the variables:

In [None]:
# every column that is not one of the numerical attributes, in dataframe order
newCategorical = df.columns[~df.columns.isin(numerical)].tolist()

# one line per such column: its name and the distinct values it takes
print(*(f"{v}: {df[v].unique()}" for v in newCategorical), sep="\n")

## we now check that the class values are not *unbalanced*:

In [None]:
# number of rows for each value of the target column
df[pred_feat].value_counts()

# class codes: 0 = L, 1 = M, 2 = H

In [None]:
# bar chart of the number of rows per Class value
g = sns.countplot(x="Class", data=df)

Class = 1 (Medium) is twice the size of each of the other two... let us rebalance.

since we have few data points, we amplify the minority classes using SMOTE

In [None]:
# Features and labels before rebalancing
# CL: the target column as an array
CL = df[pred_feat].values
# X: every other column as a feature matrix
X = df.drop(columns=[pred_feat]).values

In [None]:
# rebalance through upsampling
# SMOTE generates random synthetic samples; seeding it keeps the resampled
# data, and every score computed from it, reproducible across runs, in line
# with the seeded train/test split.
# NOTE(review): resampling happens before the train/test split, so synthetic
# points interpolated from rows that land in the test set can end up in the
# training set; consider splitting first and oversampling the training part only.
X_reb, CL_reb = SMOTE(random_state=seed).fit_resample(X, CL)

# class frequencies before and after resampling
print(f"Unbalanced Counts:\n{Counter(CL)}\n")
print(f"Balanced Counts:\n{Counter(CL_reb)}")

In [None]:
# Split data in train / test: one third is held out, with a fixed seed
XTrain, XTest, CLTrain, CLTest = train_test_split(
    X_reb,
    CL_reb,
    random_state=seed,
    test_size=0.33,
)

In [None]:
# class frequencies in each part of the split
print(f"Training Set Counts:\n{Counter(CLTrain)}\n")
print(f"Test Set Counts:\n{Counter(CLTest)}")

In [None]:
## scaling
# Learn the per-feature mean and variance from the training set only and
# reuse them on the test set. Fitting again on XTest would scale the two sets
# with different statistics and let test-set information shape preprocessing.
scaler = StandardScaler()
XTrain = scaler.fit_transform(XTrain)
XTest = scaler.transform(XTest)

### first attempt at modelling: linear model using _logistic regression_

In [None]:
# L2-regularised logistic regression with regularisation parameter C=1.
# `multi_class` is no longer passed: 'auto' is already the default and the
# parameter is deprecated in recent scikit-learn releases.
clf = LogisticRegression(penalty='l2', C=1, solver='lbfgs', max_iter=10000)
clf.fit(XTrain, CLTrain)

# predictions on training set
CL_pred_Train = clf.predict(XTrain)

# predictions on test set
CL_pred_Test = clf.predict(XTest)

# per-class precision / recall / f1 on both sets; a large gap between the
# two reports is a sign of overfitting
print('Train Set Predictions Report:\n')
print(classification_report(CLTrain, CL_pred_Train))
print('Test Set Predictions Report:\n')
print(classification_report(CLTest, CL_pred_Test))

# 3-fold cross-validation on the training set: mean score ± 2 standard deviations
scores = cross_val_score(clf, XTrain, CLTrain, cv=3)
print(f"Cross-validation scores: {scores.mean():.2f} (± {scores.std() * 2:.2f})")

## Hyper-parameter tuning


note that logistic regression (logit) has one main hyper-parameter: `C`, the inverse of the regularisation strength

can we improve on performance by tuning this _hyper-parameter_?

how do we select the *optimal* values for these hyper-parameters?

*note*: the code below is from the scikit doc on [Parameter estimation using grid search with cross-validation](https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_digits.html)

In [None]:
# Set the parameters by cross-validation
tuned_parameters = [{'C': [1, 10, 100]}]

# 5-fold grid search over C. `multi_class` is no longer passed: 'auto' is
# already the default and the parameter is deprecated in recent scikit-learn.
clf = GridSearchCV(LogisticRegression(solver='lbfgs', max_iter=10000), tuned_parameters, cv=5)
clf.fit(XTrain, CLTrain)

print("Best parameters set found on development set:\n")
print(clf.best_params_)
print("\nGrid scores on development set:\n")

# mean and spread (± 2 standard deviations) of the CV score for each candidate
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print(f"{mean:.3f} (± {std * 2:.03f}) for {params}")

### Second attempt: SVM linear  (Support Vector Machines)

In [None]:
# linear support vector classifier with C=1
svm = LinearSVC(C=1, max_iter=10000)
clf = svm.fit(XTrain, CLTrain)

# predictions on both parts of the split
CL_pred_Train = clf.predict(XTrain)
CL_pred_Test = clf.predict(XTest)

# same report layout for the training set and the test set
for title, y_true, y_pred in (
    ('Train Set Predictions Report:\n', CLTrain, CL_pred_Train),
    ('Test Set Predictions Report:\n', CLTest, CL_pred_Test),
):
    print(title)
    print(classification_report(y_true, y_pred))

# 3-fold cross-validation on the training set
scores = cross_val_score(clf, XTrain, CLTrain, cv=3)
print(f"Cross-validation scores: {scores.mean():.2f} (± {scores.std() * 2:.2f})")

### Let's try a non-linear classifier: SVM with polynomial kernel

In [None]:
# support vector classifier with a polynomial kernel
clf = SVC(kernel='poly')
clf.fit(XTrain, CLTrain)

# predictions on both parts of the split
CL_pred_Train = clf.predict(XTrain)
CL_pred_Test = clf.predict(XTest)

# same report layout for the training set and the test set
for title, y_true, y_pred in (
    ('Train Set Predictions Report:\n', CLTrain, CL_pred_Train),
    ('Test Set Predictions Report:\n', CLTest, CL_pred_Test),
):
    print(title)
    print(classification_report(y_true, y_pred))

# 3-fold cross-validation on the training set
scores = cross_val_score(clf, XTrain, CLTrain, cv=3)
print(f"Cross-validation scores: {scores.mean():.2f} (± {scores.std() * 2:.2f})")

## SVM with RBF kernel

In [None]:
# support vector classifier with an RBF kernel
clf = SVC(kernel='rbf')
clf.fit(XTrain, CLTrain)

# predictions on both parts of the split
CL_pred_Train = clf.predict(XTrain)
CL_pred_Test = clf.predict(XTest)

# same report layout for the training set and the test set
for title, y_true, y_pred in (
    ('Train Set Predictions Report:\n', CLTrain, CL_pred_Train),
    ('Test Set Predictions Report:\n', CLTest, CL_pred_Test),
):
    print(title)
    print(classification_report(y_true, y_pred))

# 3-fold cross-validation on the training set
scores = cross_val_score(clf, XTrain, CLTrain, cv=3)
print(f"Cross-validation scores: {scores.mean():.2f} (± {scores.std() * 2:.2f})")

## Tune the hyper-parameters by cross-validation

In [None]:
# one sub-grid per kernel, each trying the same three values of C
tuned_parameters = [
    {'kernel': [kernel], 'C': [1, 10, 100]}
    for kernel in ('rbf', 'poly', 'linear')
]

# 5-fold grid search over all kernel / C combinations
clf = GridSearchCV(SVC(), tuned_parameters, cv=5)
clf.fit(XTrain, CLTrain)

print("Best parameters set found on development set:\n")
print(clf.best_params_)
print("\nGrid scores on development set:\n")

# mean and spread (± 2 standard deviations) of the CV score for each candidate
cv_results = clf.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
for mean, std, params in zip(means, stds, cv_results['params']):
    print(f"{mean:.3f} (± {std * 2:.03f}) for {params}")

## decision trees are attractive but may overfit the data

In [None]:
# single decision tree limited to depth 10; seeded so that the reports below
# are reproducible across runs, as for the random forest cell
clf = DecisionTreeClassifier(max_depth=10, random_state=seed)
clf = clf.fit(XTrain, CLTrain)

# predictions
CL_pred_Train = clf.predict(XTrain)
CL_pred_Test = clf.predict(XTest)

# a training report much better than the test report indicates overfitting
print('Train Set Predictions Report\n')
print(classification_report(CLTrain, CL_pred_Train))
print('Test Set Predictions Report\n')
print(classification_report(CLTest, CL_pred_Test))

# 3-fold cross-validation on the training set: mean score ± 2 standard deviations
scores = cross_val_score(clf, XTrain, CLTrain, cv=3)
print(f"Cross-validation scores: {scores.mean():.2f} (± {scores.std() * 2:.2f})")

## to mitigate overfitting, we introduce _ensemble models_: Random Forest

In [None]:
# ensemble of 10 trees of depth at most 50, seeded for reproducibility
clf = RandomForestClassifier(
    n_estimators=10,
    max_depth=50,
    random_state=seed,
)
clf.fit(XTrain, CLTrain)

# predictions on both parts of the split
CL_pred_Train = clf.predict(XTrain)
CL_pred_Test = clf.predict(XTest)

# same report layout for the training set and the test set
for title, y_true, y_pred in (
    ('Train Set Predictions Report:\n', CLTrain, CL_pred_Train),
    ('Test Set Predictions Report:\n', CLTest, CL_pred_Test),
):
    print(title)
    print(classification_report(y_true, y_pred))

# 3-fold cross-validation on the training set
scores = cross_val_score(clf, XTrain, CLTrain, cv=3)
print(f"Cross-validation scores: {scores.mean():.2f} (± {scores.std() * 2:.2f})")

## let us tune the hyper-parameters `max_depth` and  `n_estimators`

ref: https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_digits.html

A search consists of:

- an estimator (regressor or classifier such as sklearn.svm.SVC());
- a parameter space;
- a method for searching or sampling candidates;
- a cross-validation scheme; and
- a score function.

By default, parameter search uses the score function of the estimator to evaluate a parameter setting.

In [None]:
# scores = ['precision', 'recall']

# 4 x 4 grid over the number of trees and the maximum tree depth
tuned_parameters = [{'n_estimators': [10, 50, 100, 200],
                     'max_depth': [5, 10, 20, 50]}]
# The forest is seeded so that score differences between candidates come from
# the hyper-parameters and the selected best_params_ is reproducible, rather
# than depending on the random draw of each fit.
clf = GridSearchCV(RandomForestClassifier(random_state=seed), tuned_parameters, cv=5)
clf.fit(XTrain, CLTrain)

print("Best parameters set found on development set:\n")
print(clf.best_params_)
print("\nGrid scores on development set:\n")

# mean and spread (± 2 standard deviations) of the CV score for each candidate
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']

for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print(f"{mean:.3f} (± {std * 2:.03f}) for {params}\n")