In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import make_column_transformer
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso,LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import SVC,SVR
from tqdm.notebook import tqdm as tqdm
from sklearn import decomposition
import seaborn as sns
from sklearn.manifold import TSNE, LocallyLinearEmbedding, MDS
from pandas.plotting import scatter_matrix
from sklearn import datasets
plt.style.use("bmh")

# Students Performance Analysis

### Q.1 

Read the attached "students.csv" file to a DataFrame.

### Q.2

Print the name and data type of each feature.

### Q.3

#### 3.1

Perform feature encoding on the categorical features. <br/>
For each categorical feature, create a LabelEncoder and use it to transform the feature's values.

In [None]:
# Columns held with the generic ``object`` dtype are the ones treated as categorical here.
categorical_features = data.select_dtypes(include="object").columns

#### 3.2

Print the mapping values of the "NationalITy" feature.

### Q.4

#### 4.1
Use the function `evaluate_RF` that we implemented in the ensemble learning practice and evaluate it on the students dataset.

In [None]:
def evaluate_RF(X, y, n_estimators, max_depth=(2, 8, 16, 32), cv=3, random_state=1):
    """Cross-validate a random forest once per candidate ``max_depth``.

    Parameters
    ----------
    X : array-like
        Feature matrix handed to ``cross_val_score``.
    y : array-like
        Target labels.
    n_estimators : int
        Number of trees used by every forest that is evaluated.
    max_depth : iterable of int, default (2, 8, 16, 32)
        Tree depths to try; one forest is scored for each value.
    cv : int or cross-validation splitter, default 3
        Forwarded to ``cross_val_score``.
    random_state : int or None, default 1
        Seed given to every forest so repeated runs are comparable.

    Returns
    -------
    dict
        Maps ``"rf_<depth>"`` to the mean cross-validated accuracy for that depth.
    """
    res = {}
    for md in max_depth:
        rf = RandomForestClassifier(
            n_estimators=n_estimators, max_depth=md, random_state=random_state
        )
        fold_scores = cross_val_score(rf, X, y, cv=cv, scoring="accuracy")
        res["rf_" + str(md)] = np.mean(fold_scores)
    return res

#### 4.2

Use the hyperparameters discovered in 4.1 and display the importance of each feature.

In [None]:
# Fit a single forest and rank the features by their importance, expressed in percent.
RF = RandomForestClassifier(n_estimators=200, max_depth=8, random_state=1).fit(X, y)
importances = pd.DataFrame(
    dict(Feature=X.columns, Importance=RF.feature_importances_ * 100)
).sort_values(by="Importance", ascending=False)
importances

#### 4.3

Drop the least important feature.

#### 4.4 

Use <strong>Automated Feature Selection (AFS)</strong> based on GradientBoostingClassifier and find the most important features using the <strong>median</strong> threshold.

In [None]:
# Keep the features whose gradient-boosting importance reaches the median importance.
select = SelectFromModel(
    GradientBoostingClassifier(n_estimators=200, random_state=42),
    threshold="median",
).fit(X, y)
mask = select.get_support()  # boolean array, one entry per column of X
result = pd.DataFrame(dict(Features=X.columns, Selected=mask))
result

#### 4.5

Create the "selected_features" list with features that satisfy "Selected = True".

### Q.5

#### 5.1

Create a new DataFrame using the selected features, and split it into (X, y).

#### 5.2

Implement the function  `search_for_hyperparameters(X, y, model_name, model, param_grid)`

It takes as input:
* X - dataset
* y - target variable
* model_name - string
* model - Object
* param_grid

and performs a hyperparameter search using GridSearchCV.


In [None]:
def search_for_hyperparameters(X, y, model_name, model, param_grid):
    """Grid-search ``model`` over ``param_grid`` with 5-fold cross-validation.

    Parameters
    ----------
    X : array-like
        Feature matrix used for the search.
    y : array-like
        Target labels.
    model_name : str
        Name of the single pipeline step wrapping ``model``. Because the
        estimator lives inside a ``Pipeline``, keys of ``param_grid`` must be
        written as ``"<model_name>__<parameter>"``.
    model : estimator
        The estimator to tune.
    param_grid : dict or list of dict
        Grid forwarded to ``GridSearchCV``.

    Returns
    -------
    GridSearchCV
        The fitted search object, so callers can inspect ``best_estimator_``,
        ``cv_results_`` and so on.
    """
    pipe = Pipeline([(model_name, model)])
    grid = GridSearchCV(pipe, param_grid=param_grid, cv=5)
    grid.fit(X, y)
    # Score the data that was actually passed in rather than notebook-level
    # globals. NOTE: this is the refit model scored on the same data it was
    # fitted on, so it is optimistic; the cross-validated score below is the
    # fairer estimate.
    print("Test set score: {:.2f}".format(grid.score(X, y)))
    print("Best cross-validation score: {:.2f}".format(grid.best_score_))
    print("Best parameters: {}".format(grid.best_params_))
    return grid
#### 5.3

Test your method on: AdaBoostClassifier, RandomForestClassifier, SVC

* AdaBoostClassifier: learning_rate
* RandomForestClassifier: n_estimators, max_depth
* SVC: C, gamma

### Q.6

Use the new dataset and display the plots of PCA, tSNE, and MDS.

In [None]:
def plot_PCA(data, labels):
    """Scatter-plot the first two principal components of ``data``, coloured by label.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Numeric feature matrix to project.
    labels : array-like of length n_samples
        One label per sample; used only for colouring and the legend.
    """
    pca = decomposition.PCA(n_components=2)
    embedding = np.asarray(pca.fit_transform(data))
    # Build the frame column-wise: stacking the coordinates and the labels into
    # one array would force a common dtype (string labels would turn the
    # coordinates into strings, integer labels would become floats).
    df = pd.DataFrame(embedding, columns=["dim1", "dim2"])
    df["Label"] = np.asarray(labels).ravel()
    sns.FacetGrid(df, hue="Label", height=6).map(plt.scatter, "dim1", "dim2").add_legend()
    plt.title("PCA")
    plt.show()
    
def plot_tSNE(data, labels, perplexity=(5, 20, 30, 50)):
    """Plot one 2-D t-SNE embedding of ``data`` per perplexity value, side by side.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Numeric feature matrix to embed.
    labels : array-like of length n_samples
        One label per sample; used only for colouring.
    perplexity : iterable of numbers, default (5, 20, 30, 50)
        Perplexities to try. One panel is drawn for each value.

    Returns
    -------
    list of float
        The final KL divergence of each embedding, in the order of ``perplexity``.
    """
    perplexities = list(perplexity)
    if not perplexities:
        return []

    label_values = np.asarray(labels).ravel()
    kl_divergences = []
    # Size the figure from the number of requested perplexities instead of
    # assuming four panels; squeeze=False keeps `axes` 2-D even for one panel.
    n_panels = len(perplexities)
    fig, axes = plt.subplots(1, n_panels, figsize=(10 * n_panels, 8), squeeze=False)
    for ax, p in zip(axes[0], perplexities):
        tsne = TSNE(n_components=2, random_state=0, perplexity=p)
        embedding = np.asarray(tsne.fit_transform(data))
        kl_divergences.append(tsne.kl_divergence_)
        # Column-wise construction keeps the coordinates numeric and the labels
        # in their original dtype.
        df = pd.DataFrame(embedding, columns=["dim1", "dim2"])
        df["Label"] = label_values
        sns.scatterplot(
            x=df["dim1"], y=df["dim2"], hue=df["Label"], ax=ax, s=100, palette="colorblind"
        ).set_title("t-SNE with perplexity=" + str(p))
    plt.show()
    return kl_divergences
    
def plot_MDS(data, labels, random_state=0):
    """Scatter-plot a 2-D MDS embedding of ``data``, coloured by label.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Numeric feature matrix to embed.
    labels : array-like of length n_samples
        One label per sample; used only for colouring and the legend.
    random_state : int or None, default 0
        Seed for the MDS initialisation so the figure is reproducible across
        runs. Pass ``None`` for an unseeded embedding.
    """
    mds = MDS(n_components=2, random_state=random_state)
    embedding = np.asarray(mds.fit_transform(data))
    # Column-wise construction keeps the coordinates numeric and the labels in
    # their original dtype (stacking both into one array would coerce them).
    df = pd.DataFrame(embedding, columns=["dim1", "dim2"])
    df["Label"] = np.asarray(labels).ravel()
    sns.FacetGrid(df, hue="Label", height=6).map(plt.scatter, "dim1", "dim2").add_legend()
    plt.title("MDS with final stress=" + str(mds.stress_))
    plt.show()

### Q.7

Choose one model (based on the results of this practice) and evaluate it on the students dataset. <br/>

Compare your result based on cross-validation-score using cv=5 and cv=10.

In [None]:
# Evaluate the chosen model with both fold counts so the two settings can be
# compared side by side (mean and spread of the per-fold scores).
cv_scores = {
    "cv={}".format(k): cross_val_score(rf, new_df, target, cv=k) for k in (5, 10)
}
pd.DataFrame(
    {name: {"mean": scores.mean(), "std": scores.std()} for name, scores in cv_scores.items()}
).T

### Q.8

Build a procedure that allows you to predict future data based on your model.

In [None]:
def predict_student_performance(le_map, model, topic, relation, hands, resources, anno, discussion, parents, absence):
    """Predict the class of one student from raw, un-encoded inputs.

    Parameters
    ----------
    le_map : mapping
        Maps a column name to an encoder exposing ``transform`` and
        ``inverse_transform``. The keys "Topic", "Relation",
        "ParentAnsweringSurvey", "StudentAbsenceDays" and "Class" are read.
    model : object
        Anything with a ``predict`` method that accepts a list of feature rows.
    topic, relation, parents, absence
        Raw categorical values; each is encoded with its matching encoder.
    hands, resources, anno, discussion
        Values passed to the model unchanged.

    Returns
    -------
    The result of decoding the predicted class with ``le_map["Class"]``
    (a one-element sequence, since a one-element list is decoded).

    Notes
    -----
    The feature row is assembled in a fixed order; the model is assumed to
    have been trained on columns in that same order -- confirm against the
    training cell.
    """
    def encode(column, raw_value):
        # Encoders operate on sequences: wrap the scalar, then unwrap the result.
        return le_map[column].transform([raw_value])[0]

    feature_row = [
        encode("Topic", topic),
        encode("Relation", relation),
        hands,
        resources,
        anno,
        discussion,
        encode("ParentAnsweringSurvey", parents),
        encode("StudentAbsenceDays", absence),
    ]
    class_code = model.predict([feature_row])[0]
    return le_map["Class"].inverse_transform([class_code])

In [None]:
# Example 1: low participation counts with more than seven absence days.
predict_student_performance(
    le_map,
    rf,
    topic="Geology",
    relation="Father",
    hands=17,
    resources=22,
    anno=3,
    discussion=45,
    parents="Yes",
    absence="Above-7",
)

In [None]:
# Example 2: moderate participation counts with more than seven absence days.
predict_student_performance(
    le_map,
    rf,
    topic="Chemistry",
    relation="Mum",
    hands=50,
    resources=40,
    anno=1,
    discussion=30,
    parents="Yes",
    absence="Above-7",
)

In [None]:
# Example 3: high participation counts with fewer than seven absence days.
predict_student_performance(
    le_map,
    rf,
    topic="Math",
    relation="Mum",
    hands=120,
    resources=100,
    anno=5,
    discussion=100,
    parents="Yes",
    absence="Under-7",
)