In [2]:
#importing the neccessary libraries 
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier 
from lightgbm import LGBMClassifier 
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold, LeaveOneOut
from sklearn.metrics import recall_score, accuracy_score, precision_score, f1_score, confusion_matrix 

In [3]:
#Import Dataset
grid_stability =  pd.read_csv('Data_for_UCI_named.csv')

In [4]:
#print the dataframe 
grid_stability

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.959060,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.781760,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.277210,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.669600,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.797110,0.455450,0.656947,0.820923,0.049860,unstable
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,2.930406,9.487627,2.376523,6.187797,3.343416,-0.658054,-1.449106,-1.236256,0.601709,0.779642,0.813512,0.608385,0.023892,unstable
9996,3.392299,1.274827,2.954947,6.894759,4.349512,-1.663661,-0.952437,-1.733414,0.502079,0.567242,0.285880,0.366120,-0.025803,stable
9997,2.364034,2.842030,8.776391,1.008906,4.299976,-1.380719,-0.943884,-1.975373,0.487838,0.986505,0.149286,0.145984,-0.031810,stable
9998,9.631511,3.994398,2.757071,7.821347,2.514755,-0.966330,-0.649915,-0.898510,0.365246,0.587558,0.889118,0.818391,0.037789,unstable


In [5]:
#Getting the dataset information;columns and datatypes.
grid_stability.dtypes

tau1     float64
tau2     float64
tau3     float64
tau4     float64
p1       float64
p2       float64
p3       float64
p4       float64
g1       float64
g2       float64
g3       float64
g4       float64
stab     float64
stabf     object
dtype: object

In [6]:
#Checking for missing data 
grid_stability.isnull().sum()

tau1     0
tau2     0
tau3     0
tau4     0
p1       0
p2       0
p3       0
p4       0
g1       0
g2       0
g3       0
g4       0
stab     0
stabf    0
dtype: int64

In [7]:
# Because of the direct relationship between stab and stabf, stab is drop and stabf will remains the sole target variabe 
grid_stability.drop('stab', axis =1, inplace = True)
grid_stability.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
tau1,2.95906,9.304097,8.971707,0.716415,3.134112,6.999209,6.710166,6.953512,4.689852,9.841496,...,5.783299,0.998988,3.114442,5.754191,2.042954,2.930406,3.392299,2.364034,9.631511,6.530527
tau2,3.079885,4.902524,8.848428,7.6696,7.608772,9.109247,3.765204,1.379125,4.007747,1.413822,...,4.726614,9.924916,4.781072,3.032743,8.514335,9.487627,1.274827,2.84203,3.994398,6.78179
tau3,8.381025,3.047541,3.046479,4.486641,4.943759,3.784066,6.929314,5.7194,1.478573,9.769856,...,1.340273,8.926563,2.427918,5.084803,8.173809,2.376523,2.954947,8.776391,2.757071,4.349695
tau4,9.780754,1.369357,1.214518,2.340563,9.857573,4.267788,8.818562,7.870307,3.733787,7.641616,...,8.617933,2.885941,7.989509,4.633624,5.466635,6.187797,6.894759,1.008906,7.821347,8.673138
p1,3.763085,5.067812,3.405158,3.963791,3.525811,4.429669,2.397419,3.224495,4.0413,4.727595,...,4.587533,3.660232,2.673156,5.19925,3.783797,3.343416,4.349512,4.299976,2.514755,3.492807
p2,-0.782604,-1.940058,-1.207456,-1.027473,-1.125531,-1.857139,-0.61459,-0.748998,-1.410344,-1.991363,...,-1.950574,-1.103521,-0.918191,-1.71703,-1.639912,-0.658054,-1.663661,-1.380719,-0.96633,-1.390285
p3,-1.257395,-1.872742,-1.27721,-1.938944,-1.845975,-0.670397,-1.208826,-1.186517,-1.238204,-0.857637,...,-1.594137,-1.105641,-0.652736,-1.713212,-0.662469,-1.449106,-0.952437,-0.943884,-0.649915,-1.532193
p4,-1.723086,-1.255012,-0.920492,-0.997374,-0.554305,-1.902133,-0.574004,-1.28898,-1.392751,-1.878594,...,-1.042822,-1.45107,-1.102228,-1.769009,-1.481417,-1.236256,-1.733414,-1.975373,-0.89851,-0.570329
g1,0.650456,0.413441,0.163041,0.446209,0.79711,0.261793,0.17789,0.371385,0.269708,0.376356,...,0.445853,0.71766,0.86795,0.157284,0.154129,0.601709,0.502079,0.487838,0.365246,0.073056
g2,0.859578,0.862414,0.766689,0.976744,0.45545,0.07793,0.397977,0.633204,0.250364,0.544415,...,0.64568,0.954919,0.888858,0.975921,0.944486,0.779642,0.567242,0.986505,0.587558,0.505441


In [8]:
#Get the count of items in the target variable column 
grid_stability.stabf.value_counts()

unstable    6380
stable      3620
Name: stabf, dtype: int64

In [9]:
#Converting the target variable datatype 
grid_stability['stabf'] = (grid_stability.stabf == "unstable").astype(int)

In [116]:
grid_stability.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
tau1,2.95906,9.304097,8.971707,0.716415,3.134112,6.999209,6.710166,6.953512,4.689852,9.841496,...,5.783299,0.998988,3.114442,5.754191,2.042954,2.930406,3.392299,2.364034,9.631511,6.530527
tau2,3.079885,4.902524,8.848428,7.6696,7.608772,9.109247,3.765204,1.379125,4.007747,1.413822,...,4.726614,9.924916,4.781072,3.032743,8.514335,9.487627,1.274827,2.84203,3.994398,6.78179
tau3,8.381025,3.047541,3.046479,4.486641,4.943759,3.784066,6.929314,5.7194,1.478573,9.769856,...,1.340273,8.926563,2.427918,5.084803,8.173809,2.376523,2.954947,8.776391,2.757071,4.349695
tau4,9.780754,1.369357,1.214518,2.340563,9.857573,4.267788,8.818562,7.870307,3.733787,7.641616,...,8.617933,2.885941,7.989509,4.633624,5.466635,6.187797,6.894759,1.008906,7.821347,8.673138
p1,3.763085,5.067812,3.405158,3.963791,3.525811,4.429669,2.397419,3.224495,4.0413,4.727595,...,4.587533,3.660232,2.673156,5.19925,3.783797,3.343416,4.349512,4.299976,2.514755,3.492807
p2,-0.782604,-1.940058,-1.207456,-1.027473,-1.125531,-1.857139,-0.61459,-0.748998,-1.410344,-1.991363,...,-1.950574,-1.103521,-0.918191,-1.71703,-1.639912,-0.658054,-1.663661,-1.380719,-0.96633,-1.390285
p3,-1.257395,-1.872742,-1.27721,-1.938944,-1.845975,-0.670397,-1.208826,-1.186517,-1.238204,-0.857637,...,-1.594137,-1.105641,-0.652736,-1.713212,-0.662469,-1.449106,-0.952437,-0.943884,-0.649915,-1.532193
p4,-1.723086,-1.255012,-0.920492,-0.997374,-0.554305,-1.902133,-0.574004,-1.28898,-1.392751,-1.878594,...,-1.042822,-1.45107,-1.102228,-1.769009,-1.481417,-1.236256,-1.733414,-1.975373,-0.89851,-0.570329
g1,0.650456,0.413441,0.163041,0.446209,0.79711,0.261793,0.17789,0.371385,0.269708,0.376356,...,0.445853,0.71766,0.86795,0.157284,0.154129,0.601709,0.502079,0.487838,0.365246,0.073056
g2,0.859578,0.862414,0.766689,0.976744,0.45545,0.07793,0.397977,0.633204,0.250364,0.544415,...,0.64568,0.954919,0.888858,0.975921,0.944486,0.779642,0.567242,0.986505,0.587558,0.505441


In [10]:
# Encode the categorical data(traget variable) into numerical
encoder = OneHotEncoder(sparse = False)

stabf_encode = encoder.fit_transform(grid_stability[['stabf']])

In [11]:
grid_stability.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
tau1,2.95906,9.304097,8.971707,0.716415,3.134112,6.999209,6.710166,6.953512,4.689852,9.841496,...,5.783299,0.998988,3.114442,5.754191,2.042954,2.930406,3.392299,2.364034,9.631511,6.530527
tau2,3.079885,4.902524,8.848428,7.6696,7.608772,9.109247,3.765204,1.379125,4.007747,1.413822,...,4.726614,9.924916,4.781072,3.032743,8.514335,9.487627,1.274827,2.84203,3.994398,6.78179
tau3,8.381025,3.047541,3.046479,4.486641,4.943759,3.784066,6.929314,5.7194,1.478573,9.769856,...,1.340273,8.926563,2.427918,5.084803,8.173809,2.376523,2.954947,8.776391,2.757071,4.349695
tau4,9.780754,1.369357,1.214518,2.340563,9.857573,4.267788,8.818562,7.870307,3.733787,7.641616,...,8.617933,2.885941,7.989509,4.633624,5.466635,6.187797,6.894759,1.008906,7.821347,8.673138
p1,3.763085,5.067812,3.405158,3.963791,3.525811,4.429669,2.397419,3.224495,4.0413,4.727595,...,4.587533,3.660232,2.673156,5.19925,3.783797,3.343416,4.349512,4.299976,2.514755,3.492807
p2,-0.782604,-1.940058,-1.207456,-1.027473,-1.125531,-1.857139,-0.61459,-0.748998,-1.410344,-1.991363,...,-1.950574,-1.103521,-0.918191,-1.71703,-1.639912,-0.658054,-1.663661,-1.380719,-0.96633,-1.390285
p3,-1.257395,-1.872742,-1.27721,-1.938944,-1.845975,-0.670397,-1.208826,-1.186517,-1.238204,-0.857637,...,-1.594137,-1.105641,-0.652736,-1.713212,-0.662469,-1.449106,-0.952437,-0.943884,-0.649915,-1.532193
p4,-1.723086,-1.255012,-0.920492,-0.997374,-0.554305,-1.902133,-0.574004,-1.28898,-1.392751,-1.878594,...,-1.042822,-1.45107,-1.102228,-1.769009,-1.481417,-1.236256,-1.733414,-1.975373,-0.89851,-0.570329
g1,0.650456,0.413441,0.163041,0.446209,0.79711,0.261793,0.17789,0.371385,0.269708,0.376356,...,0.445853,0.71766,0.86795,0.157284,0.154129,0.601709,0.502079,0.487838,0.365246,0.073056
g2,0.859578,0.862414,0.766689,0.976744,0.45545,0.07793,0.397977,0.633204,0.250364,0.544415,...,0.64568,0.954919,0.888858,0.975921,0.944486,0.779642,0.567242,0.986505,0.587558,0.505441


In [12]:

x = grid_stability.drop(columns='stabf')
y = grid_stability['stabf']

# split the data into training and testing sets 
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 1)

In [13]:
y_train.value_counts()

1    5092
0    2908
Name: stabf, dtype: int64

In [142]:
pip install imbalanced-learn 

Note: you may need to restart the kernel to use updated packages.


In [14]:
#There is imbalance in the observe data set

from imblearn.over_sampling import SMOTE 
smote = SMOTE(random_state = 1)
x_train_bal, y_train_bal = smote.fit_resample(x_train, y_train)

In [15]:
x_train_bal.value_counts()

tau1      tau2      tau3      tau4      p1        p2         p3         p4         g1        g2        g3        g4      
0.500793  6.734360  5.284357  5.797771  3.387591  -0.752656  -0.972783  -1.662152  0.802707  0.204237  0.642260  0.513210    1
6.630654  9.745850  2.419101  2.736891  3.686849  -1.787698  -1.214773  -0.684378  0.118274  0.229779  0.604040  0.452518    1
6.623862  6.841372  4.216249  1.816077  4.581995  -1.812553  -1.388942  -1.380500  0.374776  0.127695  0.832619  0.391101    1
6.625460  8.851015  9.413108  9.330661  3.886869  -1.036351  -1.104742  -1.745777  0.378542  0.510910  0.843751  0.336192    1
6.627050  3.537235  5.915826  1.702248  2.702847  -0.612258  -0.634343  -1.456247  0.199006  0.643758  0.431489  0.214037    1
                                                                                                                            ..
3.300592  5.745264  3.595685  3.178341  3.104557  -0.581319  -0.732578  -1.790660  0.295935  0.505183  0.536511  0.5

In [16]:
y_train_bal.value_counts()

0    5092
1    5092
Name: stabf, dtype: int64

In [17]:
#normalize the datasets using standard scaler 
scaler = StandardScaler()
normalised_train_df = scaler.fit_transform(x_train_bal)
normalised_train_df = pd.DataFrame(normalised_train_df, columns= x_train_bal.columns)

x_test = x_test.reset_index(drop=True)
normalised_test_df = scaler.transform(x_test)
normalised_test_df = pd.DataFrame(normalised_test_df, columns = x_test.columns)

## Random Forest Classification Model 

In [18]:
#Classification With RandomForest
rmdf = RandomForestClassifier()
rmdf.fit(normalised_train_df, y_train_bal)

y_preds = rmdf.predict(normalised_test_df)

In [19]:
rmdf = RandomForestClassifier()
scores = cross_val_score(rmdf, normalised_train_df, y_train_bal, cv=5, scoring = 'f1_macro')
print(scores)
print(scores.mean()) 

[0.92095847 0.925379   0.92390587 0.95772676 0.95424616]
0.9364432532504982


In [None]:
loo = LeaveOneOut()
scores = cross_val_score(RandomForestClassifier(), normalised_train_df, y_train_bal, cv=loo, scoring = 'f1_macro')
scores.mean()

In [74]:
def metric_evaluation (model_name, y_true, y_pred):
    f1_val = f1_score(y_true, y_pred)
    print(f'F1 score for {model_name} on cross_validation: {f1_val}')
    
    Accuracy = accuracy_score(y_true, y_pred)
    print(f'Accuracy score for {model_name} on cross_validation: {Accuracy}')
    
    Precision = precision_score(y_true, y_pred)
    print(f'Precision score for {model_name} on cross_validation: {Precision}')
    
    Recall = recall_score(y_true, y_pred)
    print(f'Recall score for {model_name} on cross_validation: {Recall}')

In [75]:
metric_evaluation(model_name = "RandomForestClassifier", y_true = y_test, y_pred = y_preds)

F1 score for RandomForestClassifier on cross_validation: 0.9330708661417323
Accuracy score for RandomForestClassifier on cross_validation: 0.915
Precision score for RandomForestClassifier on cross_validation: 0.9464856230031949
Recall score for RandomForestClassifier on cross_validation: 0.9200310559006211


## Extra Trees Classification 

In [76]:
etc = ExtraTreesClassifier()
scores = cross_val_score(etc, normalised_train_df, y_train_bal, cv=5, scoring = 'f1_macro')
print(scores)
print(scores.mean()) 

[0.92586788 0.94746564 0.94206552 0.95870388 0.96557817]
0.947936217897319


In [79]:
etc = ExtraTreesClassifier()
etc.fit(normalised_train_df, y_train_bal)

y_preds = etc.predict(normalised_test_df)

In [80]:
def metric_evaluation (model_name, y_true, y_pred):
    f1_val = f1_score(y_true, y_pred)
    print(f'F1 score for {model_name} on cross_validation: {f1_val}')
    
    Accuracy = accuracy_score(y_true, y_pred)
    print(f'Accuracy score for {model_name} on cross_validation: {Accuracy}')
    
    Precision = precision_score(y_true, y_pred)
    print(f'Precision score for {model_name} on cross_validation: {Precision}')
    
    Recall = recall_score(y_true, y_pred)
    print(f'Recall score for {model_name} on cross_validation: {Recall}')

In [81]:
metric_evaluation(model_name = "Extra Trees Classifier", y_true = y_test, y_pred = y_preds)

F1 score for Extra Trees Classifier on cross_validation: 0.9382044306257287
Accuracy score for Extra Trees Classifier on cross_validation: 0.9205
Precision score for Extra Trees Classifier on cross_validation: 0.9392996108949416
Recall score for Extra Trees Classifier on cross_validation: 0.937111801242236


## XGboost Classifier 

In [82]:
xgboost = XGBClassifier()
scores = cross_val_score(xgboost, normalised_train_df, y_train_bal, cv=5, scoring = 'f1_macro')
print(scores)
print(scores.mean())

[0.94354356 0.95532491 0.95139743 0.97494809 0.97296929]
0.9596366542500181


In [84]:
xgboost = XGBClassifier()
xgboost.fit(normalised_train_df, y_train_bal)

y_preds = xgboost.predict(normalised_test_df)

In [85]:
def metric_evaluation (model_name, y_true, y_pred):
    f1_val = f1_score(y_true, y_pred)
    print(f'F1 score for {model_name} on cross_validation: {f1_val}')
    
    Accuracy = accuracy_score(y_true, y_pred)
    print(f'Accuracy score for {model_name} on cross_validation: {Accuracy}')
    
    Precision = precision_score(y_true, y_pred)
    print(f'Precision score for {model_name} on cross_validation: {Precision}')
    
    Recall = recall_score(y_true, y_pred)
    print(f'Recall score for {model_name} on cross_validation: {Recall}')

In [86]:
metric_evaluation(model_name = "XGBClassifier", y_true = y_test, y_pred = y_preds)

F1 score for XGBClassifier on cross_validation: 0.954099646920361
Accuracy score for XGBClassifier on cross_validation: 0.9415
Precision score for XGBClassifier on cross_validation: 0.964314036478985
Recall score for XGBClassifier on cross_validation: 0.9440993788819876


## Light GBM

In [87]:
lgbtm = LGBMClassifier()
scores = cross_val_score(lgbtm, normalised_train_df, y_train_bal, cv=5, scoring = 'f1_macro')
print(scores)
print(scores.mean())

[0.94010787 0.94550174 0.94205457 0.97151159 0.96854685]
0.9535445234288987


In [88]:
lgbtm = LGBMClassifier()
lgbtm.fit(normalised_train_df, y_train_bal)

y_preds = lgbtm.predict(normalised_test_df)

In [None]:
def metric_evaluation (model_name, y_true, y_pred):
    f1_val = f1_score(y_true, y_pred)
    print(f'F1 score for {model_name} on cross_validation: {f1_val}')
    
    Accuracy = accuracy_score(y_true, y_pred)
    print(f'Accuracy score for {model_name} on cross_validation: {Accuracy}')
    
    Precision = precision_score(y_true, y_pred)
    print(f'Precision score for {model_name} on cross_validation: {Precision}')
    
    Recall = recall_score(y_true, y_pred)
    print(f'Recall score for {model_name} on cross_validation: {Recall}')

In [89]:
metric_evaluation(model_name = "LGBMClassifier", y_true = y_test, y_pred = y_preds)

F1 score for LGBMClassifier on cross_validation: 0.9488993710691823
Accuracy score for LGBMClassifier on cross_validation: 0.935
Precision score for LGBMClassifier on cross_validation: 0.9609872611464968
Recall score for LGBMClassifier on cross_validation: 0.937111801242236
