<a href="https://colab.research.google.com/github/Saheiyanda/Harmoye-Project/blob/main/UCL_StageC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
from google.colab import drive

In [5]:
from google.colab import drive
# Mount Google Drive at /content/drive so the dataset CSV under MyDrive can be read.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
# Importing useful libraries
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, \
                    LeaveOneOut, KFold, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.metrics import recall_score, accuracy_score, precision_score, f1_score, confusion_matrix, roc_curve, roc_auc_score
import sklearn.utils

from imblearn.over_sampling import SMOTE

import xgboost
import lightgbm

import warnings
warnings.filterwarnings('ignore')


In [7]:
# Loading the dataset
# NOTE(review): hard-coded Colab Drive path; this only works after the
# drive.mount(...) cell has run and the CSV sits directly under MyDrive.
df = pd.read_csv('/content/drive/MyDrive/Data_for_UCI_Stage C project.csv')
df.head()


Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


In [8]:
# Checking the shape of the dataset as (rows, columns)
df.shape

(10000, 14)

In [9]:
# Checking for missing values: number of nulls per column
df.isnull().sum()

tau1     0
tau2     0
tau3     0
tau4     0
p1       0
p2       0
p3       0
p4       0
g1       0
g2       0
g3       0
g4       0
stab     0
stabf    0
dtype: int64

In [10]:
# 'stabf' is a categorical (object) column, so encode it as integers for the models.
# LabelEncoder numbers the classes in sorted order: 'stable' -> 0, 'unstable' -> 1.
le = LabelEncoder()
df['stabf'] = le.fit_transform(df['stabf'])
df['stabf']


0       1
1       0
2       1
3       1
4       1
       ..
9995    1
9996    0
9997    0
9998    1
9999    1
Name: stabf, Length: 10000, dtype: int64

In [11]:
# Dropping 'stab' column as instructed
# errors='ignore' keeps this cell re-runnable once 'stab' has already been removed.
df = df.drop(columns='stab', errors='ignore')
df.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,1
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,0
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,1
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,1
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,1


In [12]:
# NOTE: despite the names, X_train / y_train hold the FULL dataset at this point;
# the train/test split is only performed in a later cell.
y_train = df.pop('stabf') # Pops out the stabf column as the label (this also removes it from df in place)
X_train = df # Uses the remaining columns as the features (same object as df, not a copy)
X_train.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923


In [13]:
# Inspect the encoded labels (0 = 'stable', 1 = 'unstable')
y_train

0       1
1       0
2       1
3       1
4       1
       ..
9995    1
9996    0
9997    0
9998    1
9999    1
Name: stabf, Length: 10000, dtype: int64

In [14]:
# df no longer contains 'stabf': the earlier df.pop('stabf') removed it in place
df.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923


In [15]:
# The classes are not balanced (after resampling each class has 6380 rows, so the
# original 10000 rows were split 6380 / 3620); let us balance them with SMOTE.
# NOTE(review): SMOTE is fitted on the full dataset here and the train/test split
# happens afterwards, so synthetic training rows can be interpolated from rows that
# end up in the test set (data leakage), and the test set itself contains synthetic
# rows. Consider splitting first and resampling only the training portion.
smote = SMOTE(random_state=1)
x_train_balanced, y_balanced = smote.fit_resample(X_train, y_train)

In [16]:
# Confirm that both classes have the same number of rows after resampling
y_balanced.value_counts()

1    6380
0    6380
Name: stabf, dtype: int64

In [17]:
# Splitting the data into 80:20 training and testing test with a random_state of 1
# NOTE: x_train_balanced / y_balanced are overwritten with their own training portion,
# so re-running this cell splits the already-split training data again.
x_train_balanced, x_test, y_balanced, y_test = train_test_split(x_train_balanced,y_balanced, test_size=0.2, random_state=1) 

In [18]:
# Standardise the features using statistics learned from the training split only
scaler = StandardScaler()
scaler.fit(x_train_balanced) # learns the scaling statistics from the training set
scaled_x_train = scaler.transform(x_train_balanced) # scales the training set
scaled_x_test = scaler.transform(x_test) # scales the testing set with the training statistics

**Let's create a function that returns the metric score of a test set. The metric can be any of accuracy_score, precision_score, recall_score, f1_score and confusion matrix.**

In [19]:
# Registry of the evaluation metrics, keyed by the metric function's own name
metrics = dict(
    accuracy_score=accuracy_score,
    precision_score=precision_score,
    recall_score=recall_score,
    f1_score=f1_score,
    confusion_matrix=confusion_matrix,
)

In [20]:
# Metric dispatcher used by the evaluation helper
def get_metric_score(metric, ytrue, ypred, neg_pos_label):
    ''' Compute a single classification metric for a set of predictions.

        Args:   metric (string): name of the metric; one of 'accuracy_score', 'precision_score',
                                 'recall_score', 'f1_score' or 'confusion_matrix'.
                ytrue (array): the true labels
                ypred (array): the predicted labels
                neg_pos_label (list): the classes as [negative_label, positive_label]. Only the
                                      positive label (index 1) is used, and only by precision,
                                      recall and f1; accuracy and the confusion matrix ignore it.

        Return: the value returned by the selected metric function
    '''
    # accuracy and the confusion matrix do not take a pos_label argument
    if metric in ('accuracy_score', 'confusion_matrix'):
        label_free_metric = accuracy_score if metric == 'accuracy_score' else confusion_matrix
        return label_free_metric(ytrue, ypred)

    # precision, recall and f1 share one signature, so they are looked up in the metrics dictionary
    positive_label = neg_pos_label[1]
    return metrics[metric](ytrue, ypred, pos_label=positive_label)

**Let's create a function that fits a classifier on a training set and prints out the accuracy_score, precision_score, recall_score, f1_score and confusion matrix of the testing set.**

In [21]:
# Defining the function
def fit_and_score(classifier, xtrain, ytrain, xtest, ytest, neg_pos_label):
    ''' This function fits a classifier on a training set and prints out the accuracy_score, precision_score, recall_score, 
    f1_score and confusion matrix of the testing set.
    
    Args: classifier (classifier object): the classifier you want to use; it is fitted in place
          xtrain (ndarray): the training features
          ytrain (array): the training labels
          xtest (ndarray): the testing features
          ytest (array): the testing labels
          neg_pos_label (list): a list of the classes you want as the negative and positive label in order 
                                      of [negative_label, positive_label]

    Return: None, the scores are printed
    '''
    classifier.fit(xtrain, ytrain) # fits the classifier
    ypred = classifier.predict(xtest) # predicts
    
    # for each metric in metrics (dictionary earlier defined), print out the metric score.
    for metric in metrics:
        # score against the `ytest` argument rather than the notebook-level `y_test`,
        # so the function evaluates whichever test labels the caller passes in
        score = get_metric_score(metric, ytest, ypred, neg_pos_label)
        
        # this 'if' block is to ensure that the confusion matrix is properly printed out to improve readability
        if metric == 'confusion_matrix':
            print()
            print('confusion_matrix is:')
            print(score)
            
        else:
            print('{} is {}'.format(metric, score))


In [22]:
# [negative_label, positive_label]; with the LabelEncoder mapping, 0 = 'stable' and 1 = 'unstable',
# so precision / recall / f1 are reported for the 'unstable' class
label_list = [0, 1] # this is to be used as the neg_pos_label needed in fit_and_score function

**Let's evaluate our model on different classifiers.**

**Training and testing on RandomForestClassifier**

In [23]:
# Question 5: random forest with default hyperparameters (random_state fixed for reproducibility)

rnd_forest = RandomForestClassifier(random_state=1)
fit_and_score(rnd_forest, scaled_x_train, y_balanced, scaled_x_test, y_test, label_list)

accuracy_score is 0.9416144200626959
precision_score is 0.9591018444266239
recall_score is 0.9242658423493045
f1_score is 0.9413616686343961

confusion_matrix is:
[[1207   51]
 [  98 1196]]


In [24]:
# Extra trees with default hyperparameters, evaluated on the same scaled split
extra_trees = ExtraTreesClassifier(random_state=1)
fit_and_score(extra_trees, scaled_x_train, y_balanced, scaled_x_test, y_test, label_list)


accuracy_score is 0.9463166144200627
precision_score is 0.9573122529644269
recall_score is 0.9358578052550232
f1_score is 0.9464634622899569

confusion_matrix is:
[[1204   54]
 [  83 1211]]


In [25]:
# LightGBM with default hyperparameters, evaluated on the same scaled split
lgbm = lightgbm.LGBMClassifier(random_state=1)
fit_and_score(lgbm, scaled_x_train, y_balanced, scaled_x_test, y_test, label_list)

accuracy_score is 0.9529780564263323
precision_score is 0.96513470681458
recall_score is 0.9412673879443586
f1_score is 0.9530516431924883

confusion_matrix is:
[[1214   44]
 [  76 1218]]


In [26]:
# Question 5: XGBoost with default hyperparameters, evaluated on the same scaled split
xgb = xgboost.XGBClassifier(random_state=1)
fit_and_score(xgb, scaled_x_train, y_balanced, scaled_x_test, y_test, label_list)

accuracy_score is 0.9255485893416928
precision_score is 0.9360189573459715
recall_score is 0.9157650695517774
f1_score is 0.92578125

confusion_matrix is:
[[1177   81]
 [ 109 1185]]


In [30]:
# Question 12: test-set accuracy of a LightGBM classifier with default hyperparameters
light_gbm2 = lightgbm.LGBMClassifier(random_state=1) # initializes a light gbm classifier object
light_gbm2.fit(scaled_x_train, y_balanced) # fits the model
ypred_light_gbm2 = light_gbm2.predict(scaled_x_test) # predicts on the testing set
accuracy_score(y_test, ypred_light_gbm2) # outputs the accuracy

0.9529780564263323

In [33]:
# finding the best hyperparameters
# NOTE(review): `tuned_extra_trees` is not defined in any cell shown above (the execution
# counts jump from 30 to 33); the cell that created it appears to be missing, so this
# fails on a fresh kernel. Restore that cell before relying on this output.
tuned_extra_trees.best_params_

{'max_features': 'auto',
 'min_samples_leaf': 2,
 'min_samples_split': 9,
 'n_estimators': 300}

In [46]:
# Settings of the randomized search itself. These are arguments of RandomizedSearchCV,
# NOT estimator hyperparameters, so they must stay out of the grid: a 'n_iter' or 'cv'
# key would be forwarded to the estimator's set_params and rejected as an invalid parameter.
n_iter = [10]
cv = [5]

# Candidate values for the tree-ensemble hyperparameters.
# NOTE(review): n_estimators / min_samples_split were absent from this cell; the lists
# below were chosen to contain the values in the recorded best_params_ outputs
# (300 and 1000; 2 and 9) - confirm them against the assignment brief.
n_estimators = [50, 100, 300, 500, 1000]
min_samples_split = [2, 3, 5, 7, 9]
min_samples_leaf = [1, 2, 4, 6, 8]
# NOTE(review): recent scikit-learn releases no longer accept 'auto' - confirm the installed version
max_features = ['auto', 'sqrt', 'log2', None]


hyperparameter_grid = {'n_estimators': n_estimators,
                       'min_samples_leaf': min_samples_leaf,
                       'min_samples_split': min_samples_split,
                       'max_features': max_features}
                      


In [None]:
#Question 11
# Baseline: an ordinary (untuned) extra trees classifier. It is fitted here because the
# following cell predicts with `extra_trees3` directly.
extra_trees3 = ExtraTreesClassifier(random_state=1) # initializes an extra trees classifier object
extra_trees3.fit(scaled_x_train, y_balanced) # fits the extra trees

# Tuned variant: randomized search over hyperparameter_grid. The search must wrap the
# extra trees estimator, not `random_forest2` (a random forest that is only created in a
# later cell). RandomizedSearchCV fits clones, so the baseline fitted above is untouched.
tuned_extra_trees3 = RandomizedSearchCV(extra_trees3, hyperparameter_grid, random_state=1, verbose=1, n_jobs=-1)
fit_and_score(tuned_extra_trees3, scaled_x_train, y_balanced, scaled_x_test, y_test, label_list)

In [35]:
# Requires `extra_trees3` to have been fitted by an earlier cell (predict fails on an unfitted estimator)
ypred_ordinary = extra_trees3.predict(scaled_x_test) # predicts on the testing set

print('accuracy of ordinary extra trees is {}'.format(accuracy_score(y_test, ypred_ordinary)))

accuracy of ordinary extra trees is 0.9463166144200627


In [36]:
# now let's find the accuracy of the tuned extra trees
# NOTE(review): this uses `tuned_extra_trees`, which no visible cell defines (the search
# object created above is named `tuned_extra_trees3`) - confirm which object is intended.
ypred_tuned = tuned_extra_trees.best_estimator_.predict(scaled_x_test) # predicts with the best estimator of the tuned extra 
                                                                        # trees
print('accuracy of tuned extra trees is {}'.format(accuracy_score(y_test, ypred_tuned)))

accuracy of tuned extra trees is 0.9482758620689655


In [37]:
# Randomized hyperparameter search over a random forest; the number of candidates and the
# number of folds are not passed, so RandomizedSearchCV's own defaults apply
random_forest2 = RandomForestClassifier(random_state=1) # initializes a random forest classifier object
tuned_random_forest2 = RandomizedSearchCV(random_forest2, hyperparameter_grid, random_state=1, verbose=1, n_jobs=3)
fit_and_score(tuned_random_forest2, scaled_x_train, y_balanced, scaled_x_test, y_test, label_list)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
accuracy_score is 0.9427899686520376
precision_score is 0.9599358974358975
recall_score is 0.9258114374034003
f1_score is 0.9425649095200629

confusion_matrix is:
[[1208   50]
 [  96 1198]]


In [38]:
# Best hyperparameter combination found by the random forest search
tuned_random_forest2.best_params_

{'max_features': 'log2',
 'min_samples_leaf': 4,
 'min_samples_split': 2,
 'n_estimators': 1000}

In [39]:
# Question 18: feature importances of the best tuned extra trees model
# NOTE(review): depends on `tuned_extra_trees`, which no visible cell defines - confirm it exists on a fresh run.
features_importance = tuned_extra_trees.best_estimator_.feature_importances_ # finds the feature importance

In [40]:
# These two hold importance VALUES, not feature names; the names are looked up in the next cells
most_important_feature = features_importance.max() # highest importance value
least_important_feature = features_importance.min() # lowest importance value

cols = X_train.columns # feature names, used to map importance positions back to columns

In [41]:
# Select the column(s) whose importance equals the maximum; [0] takes the first match
print('most important feature is {}'.format(cols[features_importance == most_important_feature][0]))

most important feature is tau1


In [42]:
# Select the column(s) whose importance equals the minimum; [0] takes the first match
print('least important feature is {}'.format(cols[features_importance == least_important_feature][0]))

least important feature is p1


**Correlated Feature**