## Natural Language and Image Processing: Homework 1
*Author: Maxim Solonin*

You should build full ML pipeline and get best AUC.

Important stages in pipeline:

1. Feature selection
2. Cross validation
3. Hyper parameters tuning
4. Compare minimum 2 models

## 1. Intro

In [4]:
# importing libraries
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import jaccard_similarity_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA as skPCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb

In [5]:
# reading the data
data = pd.read_csv('C:\\Users\\Maxim\\Desktop\\Education\\GSOM\\Natural Language and Image Processing\\HW\\HW1\dataset_simple.csv',sep=',')

In [6]:
# dividing into train and test sample
data_tr, data_t, y_tr, y_t = train_test_split(data.drop('label',axis=1), data['label'], test_size=0.2, random_state=777)

## 2. Feature selection

Two different approaches to feature selection are tried. One is based on random forest classifier with a threshold of 2 mean values. The other one is applied directly during model fitting and is based on support vector machine. 

In [7]:
sfm = SelectFromModel(RandomForestClassifier(n_estimators=100), threshold='2*mean')
sfm.fit(data_tr, y_tr)
n_features = sfm.transform(data_tr).shape[1]

In [8]:
print('Features selected:', n_features)

Features selected: 58


In [9]:
data_tr_feat = sfm.transform(data_tr)
data_t_feat = sfm.transform(data_t)

## 3. Model training

### LinearSVC feature selection

In [10]:
# xgboost classifier
clf1 = Pipeline([
  ('feature_selection', SelectFromModel(LinearSVC(dual = False, penalty="l2"))),
  ('classification', xgb.XGBClassifier())
])
clf1.fit(data_tr, y_tr)

Pipeline(memory=None,
     steps=[('feature_selection', SelectFromModel(estimator=LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
        max_features=None, n...
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1))])

In [11]:
print('Accuracy: ', jaccard_similarity_score(y_t, clf1.predict(data_t)))
print('AUC: ', roc_auc_score(y_t,clf1.predict(data_t)))

Accuracy:  0.842
AUC:  0.8418683072590613


In [12]:
# K-Neighbours
clf = Pipeline([
  ('feature_selection', SelectFromModel(LinearSVC(dual = False, penalty="l2"))),
  ('classification', KNeighborsClassifier())
])
clf.fit(data_tr, y_tr)

Pipeline(memory=None,
     steps=[('feature_selection', SelectFromModel(estimator=LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
        max_features=None, n...ki',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'))])

In [13]:
print('Accuracy: ', jaccard_similarity_score(y_t, clf.predict(data_t)))
print('AUC: ', roc_auc_score(y_t,clf.predict(data_t)))

Accuracy:  0.906
AUC:  0.9059786152301482


In [14]:
#Multi-layer perceptron
clf2 = Pipeline([
  ('feature_selection', SelectFromModel(LinearSVC(dual = False, penalty="l2"))),
  ('classification', MLPClassifier(solver='sgd',learning_rate = 'adaptive', alpha=1e-7, random_state=1, max_iter=500))
])
clf2.fit(data_tr, y_tr)

Pipeline(memory=None,
     steps=[('feature_selection', SelectFromModel(estimator=LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
        max_features=None, n...e=True, solver='sgd', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False))])

In [15]:
print('Accuracy: ', jaccard_similarity_score(y_t, clf2.predict(data_t)))
print('AUC: ', roc_auc_score(y_t,clf2.predict(data_t)))

Accuracy:  0.922
AUC:  0.9219731910348774


### Random forest feature selection

In [16]:
clfx = xgb.XGBClassifier()
clfx.fit(data_tr_feat,y_tr)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [17]:
print('Accuracy: ', jaccard_similarity_score(y_t, clfx.predict(data_t_feat)))
print('AUC: ', roc_auc_score(y_t,clfx.predict(data_t_feat)))

Accuracy:  0.8625
AUC:  0.8623000428015409


In [18]:
# Multi-layer perceptron
clff = MLPClassifier(solver='sgd',learning_rate = 'adaptive', alpha=1e-7, random_state=1,max_iter=500)
clff.fit(data_tr_feat,y_tr)

MLPClassifier(activation='relu', alpha=1e-07, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='adaptive',
       learning_rate_init=0.001, max_iter=500, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=1, shuffle=True, solver='sgd', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [19]:
print('Accuracy: ', jaccard_similarity_score(y_t, clff.predict(data_t_feat)))
print('AUC: ', roc_auc_score(y_t,clff.predict(data_t_feat)))

Accuracy:  0.94
AUC:  0.9399798392742138


In [20]:
# K-Neighbours
clfk = KNeighborsClassifier()
clfk.fit(data_tr_feat,y_tr)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [21]:
print('Accuracy: ', jaccard_similarity_score(y_t, clfk.predict(data_t_feat)))
print('AUC: ', roc_auc_score(y_t,clfk.predict(data_t_feat)))

Accuracy:  0.784
AUC:  0.7838242176718362


### Cross-validation

Apart from checking results on ordinary test sample, we apply cross-validation process.

#### Xgboost

In [22]:
train_scores = cross_val_score(clf1, data_tr, y_tr, scoring='roc_auc', cv=5)
test_scores = cross_val_score(clf1, data_t, y_t, scoring='roc_auc', cv=5)
print("Train Fold AUC Scores SVC: ", train_scores)
print("Train CV AUC Score SVC: ", train_scores.mean())
print("\nTest Fold AUC Scores SVC: ", test_scores)
print("Test CV AUC Score SVC: ", test_scores.mean())

Train Fold AUC Scores SVC:  [0.91236162 0.89837882 0.9124004  0.88822835 0.90234162]
Train CV AUC Score SVC:  0.902742163385342

Test Fold AUC Scores SVC:  [0.81974228 0.8239706  0.8361459  0.83562089 0.85567114]
Test CV AUC Score SVC:  0.834230161684635


In [23]:
train_scoresx = cross_val_score(clfx, data_tr_feat, y_tr, scoring='roc_auc', cv=5)
test_scoresx = cross_val_score(clfx, data_t_feat, y_t, scoring='roc_auc', cv=5)
print("Train Fold AUC Scores RF: ", train_scoresx)
print("Train CV AUC Score RF: ", train_scoresx.mean())
print("\nTest Fold AUC Scores RF: ", test_scoresx)
print("Test CV AUC Score RF: ", test_scoresx.mean())

Train Fold AUC Scores RF:  [0.92769619 0.92173657 0.93255394 0.93036943 0.93042107]
Train CV AUC Score RF:  0.9285554383068273

Test Fold AUC Scores RF:  [0.92024479 0.89942249 0.92669817 0.90609765 0.90419117]
Test CV AUC Score RF:  0.9113308518281513


#### MLP

In [24]:
train_scores_mlp = cross_val_score(clf2, data_tr, y_tr, scoring='roc_auc', cv=5)
test_scores_mlp = cross_val_score(clf2, data_t, y_t, scoring='roc_auc', cv=5)
print("Train Fold AUC Scores SVC: ", train_scores_mlp)
print("Train CV AUC Score SVC: ", train_scores_mlp.mean())
print("\nTest Fold AUC Scores SVC: ", test_scores_mlp)
print("Test CV AUC Score SVC: ", test_scores_mlp.mean())

Train Fold AUC Scores SVC:  [0.9610673  0.9564891  0.97100043 0.9505574  0.95051201]
Train CV AUC Score SVC:  0.9579252476404749

Test Fold AUC Scores SVC:  [0.84795263 0.86952174 0.86907173 0.89809745 0.89072315]
Test CV AUC Score SVC:  0.8750733407299081


In [25]:
train_scores_mlp2 = cross_val_score(clff, data_tr_feat, y_tr, scoring='roc_auc', cv=5)
test_scores_mlp2 = cross_val_score(clff, data_t_feat, y_t, scoring='roc_auc', cv=5)
print("Train Fold AUC Scores RF: ", train_scores_mlp2)
print("Train CV AUC Score RF: ", train_scores_mlp2.mean())
print("\nTest Fold AUC Scores RF: ", test_scores_mlp2)
print("Test CV AUC Score RF: ", test_scores_mlp2.mean())

Train Fold AUC Scores RF:  [0.97845449 0.97551058 0.98026896 0.97612527 0.97383592]
Train CV AUC Score RF:  0.9768390457883236

Test Fold AUC Scores RF:  [0.94512165 0.93339833 0.94237356 0.94922373 0.93662998]
Test CV AUC Score RF:  0.94134945082798


#### K-neigbours

In [26]:
train_scores_knn = cross_val_score(clf, data_tr, y_tr, scoring='roc_auc', cv=5)
test_scores_knn = cross_val_score(clf, data_t, y_t, scoring='roc_auc', cv=5)
print("Train Fold AUC Scores SVC: ", train_scores_knn)
print("Train CV AUC Score SVC: ", train_scores_knn.mean())
print("\nTest Fold AUC Scores SVC: ", test_scores_knn)
print("Test CV AUC Score SVC: ", test_scores_knn.mean())

Train Fold AUC Scores SVC:  [0.93152281 0.91918991 0.9431831  0.92011113 0.91008992]
Train CV AUC Score SVC:  0.9248193738294953

Test Fold AUC Scores SVC:  [0.82377233 0.85410885 0.84122103 0.8279957  0.85089703]
Test CV AUC Score SVC:  0.8395989880245056


In [27]:
train_scores_knn2 = cross_val_score(clfk, data_tr_feat, y_tr, scoring='roc_auc', cv=5)
test_scores_knn2 = cross_val_score(clfk, data_t_feat, y_t, scoring='roc_auc', cv=5)
print("Train Fold AUC Scores SVC: ", train_scores_knn2)
print("Train CV AUC Score SVC: ", train_scores_knn2.mean())
print("\nTest Fold AUC Scores SVC: ", test_scores_knn2)
print("Test CV AUC Score SVC: ", test_scores_knn2.mean())

Train Fold AUC Scores SVC:  [0.82942711 0.83618203 0.82315693 0.82923711 0.82924337]
Train CV AUC Score SVC:  0.8294493103387908

Test Fold AUC Scores SVC:  [0.74440271 0.7679817  0.74438111 0.75028126 0.7272476 ]
Test CV AUC Score SVC:  0.7468588746171799


For all the tested models except for knn, performance result is better for feature selection using random forest.

### Tuning of parameters using GridSearch

Now, parameters tuning is applied using GridSearch with default cross-validation.

In [32]:
def hyper_tun(model,params,x_tr,y_tr,x_t,y_t):
    md = GridSearchCV(model, params, scoring='roc_auc', n_jobs=-1, cv = 3)
    md.fit(x_tr,y_tr)
    print(md.best_estimator_)
    print('ROC-AUC: ', roc_auc_score(y_t, md.best_estimator_.predict(x_t)))

#### KNN

In [29]:
knn_params = {
 'n_neighbors':range(1,10,2),
 'p': [1, 2]
}

In [33]:
hyper_tun(KNeighborsClassifier(),knn_params,data_tr,y_tr,data_t,y_t)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=9, p=1,
           weights='uniform')
ROC-AUC:  0.9058646111260005


#### MLP

In [37]:
mlp_params = {
'solver': ['sgd'], 'max_iter': [200,500,700,900, 1000],'hidden_layer_sizes':np.arange(100, 200, 20)
}

In [38]:
hyper_tun(MLPClassifier(learning_rate='adaptive',alpha=1e-7),mlp_params,data_tr_feat,y_tr,data_t_feat,y_t)

MLPClassifier(activation='relu', alpha=1e-07, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=180, learning_rate='adaptive',
       learning_rate_init=0.001, max_iter=700, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='sgd', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)
ROC-AUC:  0.9394108187894764


#### XGBoost

In [45]:
params_xgb={
    'max_depth': [3,4,5,6,7],
    'n_estimators': [100,300,500,700,1000]
}


In [46]:
hyper_tun(xgb.XGBClassifier(n_jobs = -1),params_xgb,data_tr_feat,y_tr,data_t_feat,y_t)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=6, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1)
ROC-AUC:  0.9334226032137156


## 4. Conclusion. Model comparison

All the models tested demonstrated good results regarding ROC-AUC metric. The best results using cross-validation and GridSearch were demonstrated by multi-layer perceptron with ROC-AUC 0.94. At the same time xgboost model is very close with 0.933.