# Prediction
Here we try to train a model on (the first 14 days of) a course of year N and then predict the success/failure for the same course of year N+1 (after 14 days passed)

In [53]:
import numpy as np
import pandas as pd
from sklearn.ensemble import (AdaBoostClassifier, BaggingClassifier, 
                              GradientBoostingClassifier, RandomForestClassifier)
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import (accuracy_score, confusion_matrix, precision_score,
                            recall_score, f1_score)
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

In [8]:
from tools.load_oulad import dataset_dict
from tools.validation_oulad import customClusteringScore
import tools.filter_oulad as filter_oulad

### Selecting the data

In [9]:
code_module = 'AAA'
scaler = MinMaxScaler()

#### Training data

In [10]:
code_presentation = '2013J'

oneTrainingCourse = filter_oulad.getOneCourse(dataset_dict, code_module, code_presentation)
training_final_df = filter_oulad.restructure(oneTrainingCourse, 14)
training_encoders = filter_oulad.cleanAndMap(training_final_df)

trainX = training_final_df.drop(['final_result_first'], axis=1)
trainX = scaler.fit_transform(trainX)
trainY = training_final_df['final_result_first']

#### Validation data

In [11]:
code_presentation = '2014J'

oneTestCourse = filter_oulad.getOneCourse(dataset_dict, code_module, code_presentation)
testing_final_df = filter_oulad.restructure(oneTestCourse, 14)
testing_encoders = filter_oulad.cleanAndMap(testing_final_df)

testX = testing_final_df.drop(['final_result_first'], axis=1)
testX = scaler.fit_transform(testX)
testY = testing_final_df['final_result_first']

#### Confusion Matrix row/column names

In [22]:
arrays = [['Prediction']*4, \
          ['Withdrawn', 'Fail', 'Pass', 'Distinction']]
tuples = list(zip(*arrays))
index1 = pd.MultiIndex.from_tuples(tuples, names=['',''])
arrays = [['Truth']*4, \
          ['Withdrawn', 'Fail', 'Pass', 'Distinction']]
tuples = list(zip(*arrays))
index2 = pd.MultiIndex.from_tuples(tuples, names=['',''])

#### Training and result displaying helper function

In [69]:
def train_and_show_results(model):
    model.fit(X=trainX, y=trainY)
    predictionY = model.predict(testX)
    predictionY = pd.Series(predictionY, index=testY.index, name="model")
    print(' Accuracy Score = {0:.2f}'.format(accuracy_score(testY, predictionY)))
    print('Precision Score = {0:.2f}'.format(precision_score(testY, predictionY, 
                                                             average='macro')))
    print('   Recall Score = {0:.2f}'.format(recall_score(testY, predictionY, 
                                                          average='macro')))
    print('       F1 Score = {0:.2f}'.format(f1_score(testY, predictionY, average='macro')))
    print('    CustomScore = {0:.2f}'.format(customClusteringScore(predictionY, testY)))
    display(pd.DataFrame(confusion_matrix(testY, predictionY, labels=[0,1,2,3]), \
                 columns=index1, index=index2))

#### Linear Regression
Yes, probably this makes no sense at all because we are trying to predict a categorical variable ('Distinction', 'Pass', 'Fail', 'Withdrawn') ... 
but let's try it in any case :)


In [70]:
model = LinearRegression()
model.fit(trainX, trainY)
predictionY = model.predict(testX)
predictionY[predictionY>=2.5] = 3
predictionY[predictionY<0.5] = 0
predictionY[(predictionY>=0.5) & (predictionY<=1.5)] = 1
predictionY[(predictionY>1.5) & (predictionY<2.5)] = 2
predictionY = pd.Series(predictionY, index=testY.index, name="LinR")
print(f'Mean accuracy {round(100*sum(predictionY == testY)/len(testY), 2)}')
r2 = model.score(testX, testY)
print(f'R2 score = {r2:0.4f}')
print('CustomScore = {}'.format(customClusteringScore(predictionY, testY)))
pd.DataFrame(confusion_matrix(testY, predictionY, labels=[0,1,2,3]), \
             columns=index1, index=index2)

Mean accuracy 66.86
R2 score = 0.4673
CustomScore = 0.6


Unnamed: 0_level_0,Unnamed: 1_level_0,Prediction,Prediction,Prediction,Prediction
Unnamed: 0_level_1,Unnamed: 1_level_1,Withdrawn,Fail,Pass,Distinction
,,,,,
Truth,Withdrawn,27.0,25.0,0.0,0.0
Truth,Fail,0.0,15.0,30.0,0.0
Truth,Pass,0.0,37.0,192.0,0.0
Truth,Distinction,0.0,4.0,20.0,0.0


#### Logistic Regression

In [71]:
model = LogisticRegression()
train_and_show_results(model)

 Accuracy Score = 0.76
Precision Score = 0.52
   Recall Score = 0.45
       F1 Score = 0.45
    CustomScore = 0.68


Unnamed: 0_level_0,Unnamed: 1_level_0,Prediction,Prediction,Prediction,Prediction
Unnamed: 0_level_1,Unnamed: 1_level_1,Withdrawn,Fail,Pass,Distinction
,,,,,
Truth,Withdrawn,41.0,0.0,11.0,0.0
Truth,Fail,0.0,2.0,43.0,0.0
Truth,Pass,0.0,4.0,223.0,2.0
Truth,Distinction,0.0,0.0,24.0,0.0


#### KNeighbors Classifier

In [72]:
model = KNN(n_neighbors=11)
train_and_show_results(model)

 Accuracy Score = 0.69
Precision Score = 0.42
   Recall Score = 0.34
       F1 Score = 0.34
    CustomScore = 0.55


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0_level_0,Unnamed: 1_level_0,Prediction,Prediction,Prediction,Prediction
Unnamed: 0_level_1,Unnamed: 1_level_1,Withdrawn,Fail,Pass,Distinction
,,,,,
Truth,Withdrawn,19.0,1.0,32.0,0.0
Truth,Fail,0.0,0.0,45.0,0.0
Truth,Pass,0.0,5.0,224.0,0.0
Truth,Distinction,0.0,0.0,24.0,0.0


#### Decision Tree

In [73]:
model = DecisionTreeClassifier(max_depth=14)
train_and_show_results(model)

 Accuracy Score = 0.60
Precision Score = 0.51
   Recall Score = 0.55
       F1 Score = 0.50
    CustomScore = 0.74


Unnamed: 0_level_0,Unnamed: 1_level_0,Prediction,Prediction,Prediction,Prediction
Unnamed: 0_level_1,Unnamed: 1_level_1,Withdrawn,Fail,Pass,Distinction
,,,,,
Truth,Withdrawn,52.0,0.0,0.0,0.0
Truth,Fail,0.0,28.0,16.0,1.0
Truth,Pass,0.0,89.0,129.0,11.0
Truth,Distinction,0.0,9.0,15.0,0.0


#### Bagging Classifier

In [74]:
dt_params = {
    'criterion': 'entropy',
    'random_state': 11
}
dt = DecisionTreeClassifier(**dt_params)

bc_params = {
    'base_estimator': dt,    
    'n_estimators': 50,    
    'max_samples': 0.5,    
    'random_state': 11,
    'n_jobs': -1
}
model = BaggingClassifier(**bc_params)
train_and_show_results(model)

 Accuracy Score = 0.71
Precision Score = 0.52
   Recall Score = 0.57
       F1 Score = 0.54
    CustomScore = 0.74


Unnamed: 0_level_0,Unnamed: 1_level_0,Prediction,Prediction,Prediction,Prediction
Unnamed: 0_level_1,Unnamed: 1_level_1,Withdrawn,Fail,Pass,Distinction
,,,,,
Truth,Withdrawn,52.0,0.0,0.0,0.0
Truth,Fail,0.0,23.0,22.0,0.0
Truth,Pass,0.0,55.0,172.0,2.0
Truth,Distinction,0.0,2.0,22.0,0.0


#### Random Forest

In [75]:
rf_params = {
    'n_estimators': 100,
    'criterion': 'entropy',
    'max_features': 0.5,
    'min_samples_leaf': 10,
    'random_state': 11,
    'n_jobs': -1
}
model = RandomForestClassifier(**rf_params)
train_and_show_results(model)

 Accuracy Score = 0.80
Precision Score = 0.56
   Recall Score = 0.53
       F1 Score = 0.53
    CustomScore = 0.74


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0_level_0,Unnamed: 1_level_0,Prediction,Prediction,Prediction,Prediction
Unnamed: 0_level_1,Unnamed: 1_level_1,Withdrawn,Fail,Pass,Distinction
,,,,,
Truth,Withdrawn,52.0,0.0,0.0,0.0
Truth,Fail,0.0,8.0,37.0,0.0
Truth,Pass,0.0,9.0,220.0,0.0
Truth,Distinction,0.0,0.0,24.0,0.0


#### Ada Boost

In [76]:
dt_params = {
    'max_depth': 1,
    'random_state': 11
}
dt = DecisionTreeClassifier(**dt_params)
ab_params = {
    'n_estimators': 21,
    'base_estimator': dt,
    'random_state': 11
}
model = AdaBoostClassifier(**ab_params)
train_and_show_results(model)

 Accuracy Score = 0.80
Precision Score = 0.67
   Recall Score = 0.52
       F1 Score = 0.51
    CustomScore = 0.75


Unnamed: 0_level_0,Unnamed: 1_level_0,Prediction,Prediction,Prediction,Prediction
Unnamed: 0_level_1,Unnamed: 1_level_1,Withdrawn,Fail,Pass,Distinction
,,,,,
Truth,Withdrawn,52.0,0.0,0.0,0.0
Truth,Fail,0.0,3.0,40.0,2.0
Truth,Pass,0.0,1.0,225.0,3.0
Truth,Distinction,0.0,0.0,23.0,1.0


#### GradientBoosting

In [77]:
gbc_params = {
    'n_estimators': 100,
    'max_depth': 3,
    'min_samples_leaf': 5,
    'random_state': 11
}
model = GradientBoostingClassifier(**gbc_params)
train_and_show_results(model)

 Accuracy Score = 0.64
Precision Score = 0.50
   Recall Score = 0.53
       F1 Score = 0.50
    CustomScore = 0.74


Unnamed: 0_level_0,Unnamed: 1_level_0,Prediction,Prediction,Prediction,Prediction
Unnamed: 0_level_1,Unnamed: 1_level_1,Withdrawn,Fail,Pass,Distinction
,,,,,
Truth,Withdrawn,52.0,0.0,0.0,0.0
Truth,Fail,0.0,20.0,24.0,1.0
Truth,Pass,0.0,63.0,152.0,14.0
Truth,Distinction,0.0,5.0,19.0,0.0


#### SVC

In [None]:
model = LinearSVC()
train_and_show_results(model)

#### Stacked Classifier

In [None]:
# Stacked Classifier
x_train_with_metapreds = np.zeros((trainX.shape[0], trainX.shape[1]+2))
x_train_with_metapreds[:, :-2] = trainX
x_train_with_metapreds[:, -2:] = -1
x_train_with_metapreds
kf = KFold(n_splits=5, random_state=11, shuffle=True)

for train_indices, val_indices in kf.split(trainX):
    kfold_x_train, kfold_x_val = trainX[train_indices], trainX[val_indices]
    kfold_y_train, kfold_y_val = trainY.values[train_indices], trainY.values[val_indices]
    
    svm = LinearSVC(random_state=11, max_iter=1000)
    svm.fit(kfold_x_train, kfold_y_train)
    svm_pred = svm.predict(kfold_x_val)
    
    knn = KNN(n_neighbors=4)
    knn.fit(kfold_x_train, kfold_y_train)
    knn_pred = knn.predict(kfold_x_val)
    
    x_train_with_metapreds[val_indices, -2] = svm_pred
    x_train_with_metapreds[val_indices, -1] = knn_pred
    
    
x_val_with_metapreds = np.zeros((testX.shape[0], testX.shape[1]+2))
x_val_with_metapreds[:, :-2] = testX
x_val_with_metapreds[:, -2:] = -1

svm = LinearSVC(random_state=11, max_iter=1000)
svm.fit(trainX, trainY)

knn = KNN(n_neighbors=4)
knn.fit(trainX, trainY)

svm_pred = svm.predict(testX)
knn_pred = knn.predict(testX)

x_val_with_metapreds[:, -2] = svm_pred
x_val_with_metapreds[:, -1] = knn_pred

lr = LogisticRegression(random_state=11, max_iter=150)
lr.fit(x_train_with_metapreds, trainY)
lr_preds_train = lr.predict(x_train_with_metapreds)
lr_preds_val = lr.predict(x_val_with_metapreds)
print('Stacked Classifier:\n\
> Accuracy on training data = {:.4f}\n\
> Accuracy on validation data = {:.4f}'\
      .format(accuracy_score(y_true=trainY, y_pred=lr_preds_train),\
              accuracy_score(y_true=testY, y_pred=lr_preds_val)))

print('SVM:\n> Accuracy on training data = {:.4f}\n> Accuracy on\
validation data = {:.4f}'
      .format(
             accuracy_score(y_true=trainY, y_pred=svm.predict(trainX)),
             accuracy_score(y_true=testY, y_pred=svm_pred)
      ))

print('kNN:\n> Accuracy on training data = {:.4f}\n> Accuracy on \
validation data = {:.4f}'
      .format(accuracy_score(y_true=trainY, y_pred=knn.predict(trainX)),
              accuracy_score(y_true=testY, y_pred=knn_pred)))

pd.DataFrame(confusion_matrix(testY, lr_preds_val, labels=[0,1,2,3]), \
             columns=index1, index=index2)

#### adding classification result to the test data

In [None]:
testing_encoders['final_result_first'].inverse_transform([0,1,2,3])
sc = SpectralClustering(n_clusters=2, affinity='laplacian')
sc.fit(trainX)

newTrainX = np.empty([len(trainX), len(trainX[0]) + 1])
for i, arr in enumerate(trainX):
    newTrainX[i] = np.append(arr, sc.labels_[i])
    
clustLabel = sc.fit_predict(testX)

newTestX = np.empty([len(testX), len(testX[0]) + 1])
for i, arr in enumerate(testX):
    newTestX[i] = np.append(arr, clustLabel[i])