In [108]:
import matplotlib.pyplot as plt
from sklearn.svm import LinearSVC
import numpy as np
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import ADASYN, SMOTE
from sklearn.metrics import f1_score #f1 score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import numpy as np

In [8]:
# Synthetic 4-class data set: 1000 samples, 10 features, one cluster per class,
# no redundant or repeated features, and class proportions of
# 5% / 10% / 15% / 70% so that the classes are imbalanced.
X, y = make_classification(
    n_samples=1000,
    n_features=10,
    n_redundant=0,
    n_repeated=0,
    n_classes=4,
    n_clusters_per_class=1,
    weights=[0.05, 0.10, 0.15, 0.7],
    random_state=100,
)
# Show how many samples ended up in each class.
print('Original Data %s' % Counter(y))

Original Data Counter({3: 692, 2: 154, 1: 101, 0: 53})


In [14]:
# Random under-sampling of the original data with a fixed seed.
rus = RandomUnderSampler(random_state=100)
# fit_resample() fits the sampler and resamples in one step, so a separate
# rus.fit(X, y) call beforehand is redundant and has been dropped.
X_usampled, y_usampled = rus.fit_resample(X, y)
# Report the per-class counts and the shapes of the resampled arrays.
print('Resampled Data from UnderSampling %s' % Counter(y_usampled))
print('X shape:',X_usampled.shape,", y shape:",y_usampled.shape)

Resampled Data from UnderSampling Counter({0: 53, 1: 53, 2: 53, 3: 53})
X shape: (212, 10) , y shape: (212,)


In [24]:
# SMOTE over-sampling of the full data set (5 neighbours, fixed seed).
sm = SMOTE(k_neighbors=5, random_state=100)
X_sm, y_sm = sm.fit_resample(X, y)

# ADASYN over-sampling of the same original data (5 neighbours, fixed seed).
ada = ADASYN(n_neighbors=5, random_state=100)
X_ada, y_ada = ada.fit_resample(X, y)

# Report per-class counts and array shapes: SMOTE first, then ADASYN.
print('Resampled Data from SMOTE OverSampling %s' % Counter(y_sm))
print('X shape:', X_sm.shape, ", y shape:", y_sm.shape)
print('Resampled Data from ADASYN OverSampling %s' % Counter(y_ada))
print('X shape:', X_ada.shape, ", y shape:", y_ada.shape)

Resampled Data from SMOTE OverSampling Counter({2: 692, 3: 692, 1: 692, 0: 692})
X shape: (2768, 10) , y shape: (2768,)
Resampled Data from ADASYN OverSampling Counter({1: 694, 3: 692, 2: 689, 0: 683})
X shape: (2758, 10) , y shape: (2758,)


In [76]:
def logistic_cv(X,y):
    """Tune and evaluate an L2-penalised logistic regression; print the results.

    Steps:
      1. Hold out 30% of ``X``/``y`` as a test split (``random_state=100``).
      2. Grid-search ``C`` over seven values from 1e-3 to 1e3 with ``cv=5``
         on the training split and print the best parameters.
      3. Print accuracy, macro-averaged recall and macro-averaged F1 (each
         rounded to 2 decimals) for the training and the test split.
      4. Run ``cross_val_score`` with ``cv=5`` on the full ``X``/``y`` using
         the grid-search object as the estimator, and print the mean and
         standard deviation of the fold scores.

    Parameters
    ----------
    X : feature matrix passed to ``train_test_split`` / ``cross_val_score``.
    y : class labels aligned with ``X``.

    Returns
    -------
    None. All results are printed.

    NOTE(review): the split happens inside this function, so if ``X``/``y``
    were resampled beforehand the held-out rows are resampled rows too.
    """
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=100)

    # Candidate regularisation strengths (the literal ``1`` is kept as an int
    # so that best_params_ prints exactly as before).
    c_grid = {'C': [1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3]}
    search = GridSearchCV(LogisticRegression(penalty='l2'), c_grid, cv=5)
    search.fit(X_tr, y_tr)
    print(search.best_params_)

    train_pred = search.predict(X_tr)
    test_pred = search.predict(X_te)

    # Same three metrics for both splits, printed in the same layout.
    for header, truth, predicted in (("Train:\n", y_tr, train_pred),
                                     ("Test:\n", y_te, test_pred)):
        acc = round(accuracy_score(truth, predicted), 2)
        rec = round(recall_score(truth, predicted, average='macro'), 2)
        f1 = round(f1_score(truth, predicted, average='macro'), 2)
        print(header, "Accuracy / Recall / F1 score  \n", acc, '   /', rec, '   /', f1)

    # The grid-search object itself is cross-validated on the full data.
    fold_scores = cross_val_score(search, X, y, cv=5)
    print('CV accuracy: %.3f +/- %.3f' % (np.mean(fold_scores), np.std(fold_scores)))

In [77]:
# Logistic regression on the original (imbalanced, un-resampled) data.
logistic_cv(X,y)

{'C': 10.0}
Train:
 Accuracy / Recall / F1 score  
 0.88    / 0.75    / 0.8
Test:
 Accuracy / Recall / F1 score  
 0.85    / 0.73    / 0.76
CV accuracy: 0.860 +/- 0.030


In [78]:
# Logistic regression on the randomly under-sampled data.
logistic_cv(X_usampled,y_usampled)

{'C': 0.1}
Train:
 Accuracy / Recall / F1 score  
 0.8    / 0.79    / 0.79
Test:
 Accuracy / Recall / F1 score  
 0.78    / 0.8    / 0.79
CV accuracy: 0.751 +/- 0.072


In [79]:
# Logistic regression on the SMOTE over-sampled data.
# NOTE(review): X_sm/y_sm were over-sampled before logistic_cv makes its
# train/test split, so the test rows can include synthetic samples and the
# scores may be optimistic - confirm this is intended.
logistic_cv(X_sm, y_sm)

{'C': 0.1}
Train:
 Accuracy / Recall / F1 score  
 0.85    / 0.85    / 0.85
Test:
 Accuracy / Recall / F1 score  
 0.82    / 0.83    / 0.83
CV accuracy: 0.839 +/- 0.022


In [80]:
# Logistic regression on the ADASYN over-sampled data.
# NOTE(review): X_ada/y_ada were over-sampled before logistic_cv makes its
# train/test split, so the test rows can include synthetic samples and the
# scores may be optimistic - confirm this is intended.
logistic_cv(X_ada, y_ada)

{'C': 1}
Train:
 Accuracy / Recall / F1 score  
 0.8    / 0.8    / 0.8
Test:
 Accuracy / Recall / F1 score  
 0.79    / 0.8    / 0.8
CV accuracy: 0.787 +/- 0.019


In [103]:
def lda(X,y):
    """Fit and evaluate a LinearDiscriminantAnalysis classifier; print the results.

    Steps:
      1. Hold out 30% of ``X``/``y`` as a test split (``random_state=100``).
      2. Fit ``LinearDiscriminantAnalysis(store_covariance=False)`` on the
         training split (no hyper-parameter search is done here).
      3. Print accuracy, macro-averaged recall and macro-averaged F1 (each
         rounded to 2 decimals) for the training and the test split.
      4. Run ``cross_val_score`` with ``cv=5`` on the full ``X``/``y`` and
         print the mean and standard deviation of the fold scores.

    Parameters
    ----------
    X : feature matrix passed to ``train_test_split`` / ``cross_val_score``.
    y : class labels aligned with ``X``.

    Returns
    -------
    None. All results are printed.

    NOTE(review): the split happens inside this function, so if ``X``/``y``
    were resampled beforehand the held-out rows are resampled rows too.
    """
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=100)

    model = LinearDiscriminantAnalysis(store_covariance=False)
    model.fit(X_tr, y_tr)

    train_pred = model.predict(X_tr)
    test_pred = model.predict(X_te)

    # Same three metrics for both splits, printed in the same layout.
    for header, truth, predicted in (("Train:\n", y_tr, train_pred),
                                     ("Test:\n", y_te, test_pred)):
        acc = round(accuracy_score(truth, predicted), 2)
        rec = round(recall_score(truth, predicted, average='macro'), 2)
        f1 = round(f1_score(truth, predicted, average='macro'), 2)
        print(header, "Accuracy / Recall / F1 score  \n", acc, '   /', rec, '   /', f1)

    # Cross-validate the classifier on the full data passed in.
    fold_scores = cross_val_score(model, X, y, cv=5)
    print('CV accuracy: %.3f +/- %.3f' % (np.mean(fold_scores), np.std(fold_scores)))

In [104]:
# Linear discriminant analysis on the original (imbalanced, un-resampled) data.
lda(X,y)

Train:
 Accuracy / Recall / F1 score  
 0.84    / 0.66    / 0.68
Test:
 Accuracy / Recall / F1 score  
 0.82    / 0.63    / 0.65
CV accuracy: 0.832 +/- 0.025


In [105]:
# Linear discriminant analysis on the randomly under-sampled data.
lda(X_usampled, y_usampled)

Train:
 Accuracy / Recall / F1 score  
 0.78    / 0.76    / 0.76
Test:
 Accuracy / Recall / F1 score  
 0.75    / 0.77    / 0.76
CV accuracy: 0.727 +/- 0.050


In [106]:
# Linear discriminant analysis on the SMOTE over-sampled data.
# NOTE(review): X_sm/y_sm were over-sampled before lda makes its train/test
# split, so the test rows can include synthetic samples and the scores may be
# optimistic - confirm this is intended.
lda(X_sm, y_sm)

Train:
 Accuracy / Recall / F1 score  
 0.8    / 0.8    / 0.79
Test:
 Accuracy / Recall / F1 score  
 0.8    / 0.8    / 0.8
CV accuracy: 0.798 +/- 0.019


In [107]:
# Linear discriminant analysis on the ADASYN over-sampled data.
# NOTE(review): X_ada/y_ada were over-sampled before lda makes its train/test
# split, so the test rows can include synthetic samples and the scores may be
# optimistic - confirm this is intended.
lda(X_ada, y_ada)

Train:
 Accuracy / Recall / F1 score  
 0.77    / 0.77    / 0.77
Test:
 Accuracy / Recall / F1 score  
 0.76    / 0.77    / 0.76
CV accuracy: 0.757 +/- 0.030


In [90]:
def svm_cv(X,y):
    """Tune and evaluate an SVC classifier; print the results.

    Steps:
      1. Hold out 30% of ``X``/``y`` as a test split (``random_state=100``).
      2. Grid-search ``C`` and ``gamma`` over the powers of ten 10**-2 ..
         10**2 and ``kernel`` over ``'rbf'`` / ``'sigmoid'`` with ``cv=5`` on
         the training split, and print the best parameters.
      3. Print accuracy, macro-averaged recall and macro-averaged F1 (each
         rounded to 2 decimals) for the training and the test split.
      4. Run ``cross_val_score`` with ``cv=5`` on the full ``X``/``y`` using
         the grid-search object as the estimator, and print the mean and
         standard deviation of the fold scores.

    Parameters
    ----------
    X : feature matrix passed to ``train_test_split`` / ``cross_val_score``.
    y : class labels aligned with ``X``.

    Returns
    -------
    None. All results are printed.

    NOTE(review): the split happens inside this function, so if ``X``/``y``
    were resampled beforehand the held-out rows are resampled rows too.
    """
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=100)

    # 10**-2 .. 10**2; negative exponents give floats, the rest stay ints,
    # which keeps the printed best_params_ unchanged.
    svm_grid = {
        'C': [10 ** exponent for exponent in range(-2, 3)],
        'gamma': [10 ** exponent for exponent in range(-2, 3)],
        'kernel': ['rbf', 'sigmoid'],
    }
    search = GridSearchCV(SVC(random_state=1206), svm_grid, cv=5)
    search.fit(X_tr, y_tr)
    print(search.best_params_)

    train_pred = search.predict(X_tr)
    test_pred = search.predict(X_te)

    # Same three metrics for both splits, printed in the same layout.
    for header, truth, predicted in (("Train:\n", y_tr, train_pred),
                                     ("Test:\n", y_te, test_pred)):
        acc = round(accuracy_score(truth, predicted), 2)
        rec = round(recall_score(truth, predicted, average='macro'), 2)
        f1 = round(f1_score(truth, predicted, average='macro'), 2)
        print(header, "Accuracy / Recall / F1 score  \n", acc, '   /', rec, '   /', f1)

    # The grid-search object itself is cross-validated on the full data.
    fold_scores = cross_val_score(search, X, y, cv=5)
    print('CV accuracy: %.3f +/- %.3f' % (np.mean(fold_scores), np.std(fold_scores)))

In [91]:
# SVC grid search on the original (imbalanced, un-resampled) data.
svm_cv(X,y)

{'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}
Train:
 Accuracy / Recall / F1 score  
 0.9    / 0.79    / 0.85
Test:
 Accuracy / Recall / F1 score  
 0.9    / 0.78    / 0.84
CV accuracy: 0.887 +/- 0.028


In [92]:
# SVC grid search on the randomly under-sampled data.
svm_cv(X_usampled, y_usampled)

{'C': 10, 'gamma': 0.01, 'kernel': 'sigmoid'}
Train:
 Accuracy / Recall / F1 score  
 0.8    / 0.8    / 0.8
Test:
 Accuracy / Recall / F1 score  
 0.84    / 0.85    / 0.85
CV accuracy: 0.745 +/- 0.065


In [93]:
# SVC grid search on the SMOTE over-sampled data.
# NOTE(review): X_sm/y_sm were over-sampled before svm_cv makes its train/test
# split, so the test rows can include synthetic samples and the scores may be
# optimistic - confirm this is intended.
svm_cv(X_sm, y_sm)

{'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
Train:
 Accuracy / Recall / F1 score  
 1.0    / 1.0    / 1.0
Test:
 Accuracy / Recall / F1 score  
 0.95    / 0.95    / 0.95
CV accuracy: 0.978 +/- 0.008


In [94]:
# SVC grid search on the ADASYN over-sampled data.
# NOTE(review): X_ada/y_ada were over-sampled before svm_cv makes its
# train/test split, so the test rows can include synthetic samples and the
# scores may be optimistic - confirm this is intended.
svm_cv(X_ada, y_ada)

{'C': 100, 'gamma': 0.1, 'kernel': 'rbf'}
Train:
 Accuracy / Recall / F1 score  
 1.0    / 1.0    / 1.0
Test:
 Accuracy / Recall / F1 score  
 0.96    / 0.96    / 0.96
CV accuracy: 0.960 +/- 0.010


In [98]:
def rf_cv(X,y):
    """Tune and evaluate a RandomForestClassifier; print the results.

    Steps:
      1. Hold out 30% of ``X``/``y`` as a test split (``random_state=100``).
      2. Grid-search ``criterion`` (``'gini'`` / ``'entropy'``) and
         ``max_features`` (``'sqrt'``, ``'log2'`` or the number of columns in
         the training split) with ``cv=5`` on the training split, and print
         the best parameters.
      3. Print accuracy, macro-averaged recall and macro-averaged F1 (each
         rounded to 2 decimals) for the training and the test split.
      4. Run ``cross_val_score`` with ``cv=5`` on the full ``X``/``y`` using
         the grid-search object as the estimator, and print the mean and
         standard deviation of the fold scores.

    Parameters
    ----------
    X : feature matrix; must expose ``.shape`` after splitting, since the
        column count is read from the training split.
    y : class labels aligned with ``X``.

    Returns
    -------
    None. All results are printed.

    NOTE(review): the split happens inside this function, so if ``X``/``y``
    were resampled beforehand the held-out rows are resampled rows too.
    """
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=100)

    # The third max_features option is "use every column".
    n_columns = X_tr.shape[1]
    forest_grid = {
        'criterion': ["gini", "entropy"],
        'max_features': ['sqrt', 'log2', n_columns],
    }
    search = GridSearchCV(RandomForestClassifier(random_state=100), forest_grid, cv=5)
    search.fit(X_tr, y_tr)
    print(search.best_params_)

    train_pred = search.predict(X_tr)
    test_pred = search.predict(X_te)

    # Same three metrics for both splits, printed in the same layout.
    for header, truth, predicted in (("Train:\n", y_tr, train_pred),
                                     ("Test:\n", y_te, test_pred)):
        acc = round(accuracy_score(truth, predicted), 2)
        rec = round(recall_score(truth, predicted, average='macro'), 2)
        f1 = round(f1_score(truth, predicted, average='macro'), 2)
        print(header, "Accuracy / Recall / F1 score  \n", acc, '   /', rec, '   /', f1)

    # The grid-search object itself is cross-validated on the full data.
    fold_scores = cross_val_score(search, X, y, cv=5)
    print('CV accuracy: %.3f +/- %.3f' % (np.mean(fold_scores), np.std(fold_scores)))

In [99]:
# Random forest grid search on the original (imbalanced, un-resampled) data.
rf_cv(X,y)

{'criterion': 'gini', 'max_features': 'sqrt'}
Train:
 Accuracy / Recall / F1 score  
 1.0    / 1.0    / 1.0
Test:
 Accuracy / Recall / F1 score  
 0.92    / 0.82    / 0.87
CV accuracy: 0.919 +/- 0.024


In [100]:
# Random forest grid search on the randomly under-sampled data.
rf_cv(X_usampled,y_usampled)

{'criterion': 'entropy', 'max_features': 10}
Train:
 Accuracy / Recall / F1 score  
 1.0    / 1.0    / 1.0
Test:
 Accuracy / Recall / F1 score  
 0.81    / 0.82    / 0.83
CV accuracy: 0.783 +/- 0.039


In [101]:
# Random forest grid search on the SMOTE over-sampled data.
# NOTE(review): X_sm/y_sm were over-sampled before rf_cv makes its train/test
# split, so the test rows can include synthetic samples and the scores may be
# optimistic - confirm this is intended.
rf_cv(X_sm,y_sm)

{'criterion': 'entropy', 'max_features': 10}
Train:
 Accuracy / Recall / F1 score  
 1.0    / 1.0    / 1.0
Test:
 Accuracy / Recall / F1 score  
 0.95    / 0.95    / 0.95
CV accuracy: 0.962 +/- 0.011


In [102]:
# Random forest grid search on the ADASYN over-sampled data.
# NOTE(review): X_ada/y_ada were over-sampled before rf_cv makes its
# train/test split, so the test rows can include synthetic samples and the
# scores may be optimistic - confirm this is intended.
rf_cv(X_ada,y_ada)

{'criterion': 'gini', 'max_features': 'sqrt'}
Train:
 Accuracy / Recall / F1 score  
 1.0    / 1.0    / 1.0
Test:
 Accuracy / Recall / F1 score  
 0.97    / 0.97    / 0.97
CV accuracy: 0.938 +/- 0.032


In [109]:
def xgb_cv(X,y):
    """Tune and evaluate an XGBClassifier; print the results.

    Steps:
      1. Hold out 30% of ``X``/``y`` as a test split (``random_state=100``).
      2. Grid-search ``max_depth``, ``n_estimators`` and ``learning_rate``
         with ``cv=5`` on the training split and print the best parameters.
      3. Print accuracy, macro-averaged recall and macro-averaged F1 (each
         rounded to 2 decimals) for the training and the test split.
      4. Run ``cross_val_score`` with ``cv=5`` on the full ``X``/``y`` using
         the grid-search object as the estimator, and print the mean and
         standard deviation of the fold scores.

    Parameters
    ----------
    X : feature matrix passed to ``train_test_split`` / ``cross_val_score``.
    y : class labels aligned with ``X``.

    Returns
    -------
    None. All results are printed.

    NOTE(review): the split happens inside this function, so if ``X``/``y``
    were resampled beforehand the held-out rows are resampled rows too.
    """
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=100)

    booster_grid = {
        'max_depth': [3, 5, 7, 9],
        'n_estimators': [5, 10, 15, 20, 25, 50, 100],
        'learning_rate': [0.01, 0.05, 0.1],
    }
    search = GridSearchCV(xgb.XGBClassifier(random_state=100), booster_grid, cv=5)
    search.fit(X_tr, y_tr)
    print(search.best_params_)

    train_pred = search.predict(X_tr)
    test_pred = search.predict(X_te)

    # Same three metrics for both splits, printed in the same layout.
    for header, truth, predicted in (("Train:\n", y_tr, train_pred),
                                     ("Test:\n", y_te, test_pred)):
        acc = round(accuracy_score(truth, predicted), 2)
        rec = round(recall_score(truth, predicted, average='macro'), 2)
        f1 = round(f1_score(truth, predicted, average='macro'), 2)
        print(header, "Accuracy / Recall / F1 score  \n", acc, '   /', rec, '   /', f1)

    # The grid-search object itself is cross-validated on the full data.
    fold_scores = cross_val_score(search, X, y, cv=5)
    print('CV accuracy: %.3f +/- %.3f' % (np.mean(fold_scores), np.std(fold_scores)))

In [110]:
# XGBoost grid search on the original (imbalanced, un-resampled) data.
xgb_cv(X,y)

{'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100}
Train:
 Accuracy / Recall / F1 score  
 0.97    / 0.93    / 0.95
Test:
 Accuracy / Recall / F1 score  
 0.92    / 0.85    / 0.89
CV accuracy: 0.918 +/- 0.026


In [111]:
# XGBoost grid search on the randomly under-sampled data.
xgb_cv(X_usampled,y_usampled)

{'learning_rate': 0.05, 'max_depth': 7, 'n_estimators': 25}
Train:
 Accuracy / Recall / F1 score  
 1.0    / 1.0    / 1.0
Test:
 Accuracy / Recall / F1 score  
 0.78    / 0.8    / 0.79
CV accuracy: 0.774 +/- 0.042


In [112]:
# XGBoost grid search on the SMOTE over-sampled data.
# NOTE(review): X_sm/y_sm were over-sampled before xgb_cv makes its train/test
# split, so the test rows can include synthetic samples and the scores may be
# optimistic - confirm this is intended.
xgb_cv(X_sm,y_sm)

{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}
Train:
 Accuracy / Recall / F1 score  
 1.0    / 1.0    / 1.0
Test:
 Accuracy / Recall / F1 score  
 0.96    / 0.96    / 0.96
CV accuracy: 0.953 +/- 0.016


In [113]:
# XGBoost grid search on the ADASYN over-sampled data.
# NOTE(review): X_ada/y_ada were over-sampled before xgb_cv makes its
# train/test split, so the test rows can include synthetic samples and the
# scores may be optimistic - confirm this is intended.
xgb_cv(X_ada,y_ada)

{'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 100}
Train:
 Accuracy / Recall / F1 score  
 1.0    / 1.0    / 1.0
Test:
 Accuracy / Recall / F1 score  
 0.96    / 0.96    / 0.96
CV accuracy: 0.930 +/- 0.023


In [None]:
# `hi` was an undefined name (no visible cell assigns it), so the original
# raised NameError on a fresh kernel; print the literal text instead.
# NOTE(review): this looks like a leftover scratch cell - consider deleting it.
print("hi")
