In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('Data_for_UCI_named.csv')

In [3]:
df['stabf'].unique()

array(['unstable', 'stable'], dtype=object)

In [4]:
df

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.959060,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.781760,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.277210,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.669600,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.797110,0.455450,0.656947,0.820923,0.049860,unstable
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,2.930406,9.487627,2.376523,6.187797,3.343416,-0.658054,-1.449106,-1.236256,0.601709,0.779642,0.813512,0.608385,0.023892,unstable
9996,3.392299,1.274827,2.954947,6.894759,4.349512,-1.663661,-0.952437,-1.733414,0.502079,0.567242,0.285880,0.366120,-0.025803,stable
9997,2.364034,2.842030,8.776391,1.008906,4.299976,-1.380719,-0.943884,-1.975373,0.487838,0.986505,0.149286,0.145984,-0.031810,stable
9998,9.631511,3.994398,2.757071,7.821347,2.514755,-0.966330,-0.649915,-0.898510,0.365246,0.587558,0.889118,0.818391,0.037789,unstable


In [5]:
df_first = df.drop(columns='stab')

In [6]:
from sklearn.model_selection import train_test_split

X = df_first.drop(columns = 'stabf')
y= df_first['stabf']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=1)

In [7]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [8]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier




In [9]:
# create instances for all four algorithms

r_classifier = RandomForestClassifier(random_state=1)
extratrees= ExtraTreesClassifier(random_state=1)
xgb = XGBClassifier(random_state=1)
lgbm = LGBMClassifier(random_state=1)

In [12]:
from sklearn.metrics import accuracy_score, classification_report

In [13]:
# train each model with the X_train_scaled features and the y_train labels

r_classifier.fit(X_train_scaled, y_train)
extratrees.fit(X_train_scaled, y_train)
xgb.fit(X_train_scaled, y_train)
lgbm.fit(X_train_scaled, y_train)

LGBMClassifier(random_state=1)

In [36]:
rforest_score = accuracy_score(y_test, r_classifier.predict(X_test_scaled))
print('accuracy score for RandomForest is {}'.format(rforest_score))

accuracy score for RandomForest is 0.929


In [37]:
xgb_score = accuracy_score(y_test, xgb.predict(X_test_scaled))
print('accuracy score for XGBoost is {}'.format(xgb_score))

accuracy score for XGBoost is 0.9455


In [39]:
lgbm_score = accuracy_score(y_test, lgbm.predict(X_test_scaled))
print('accuracy score for LGBMclassifier is {}'.format(lgbm_score))

accuracy score for LGBMclassifier is 0.9375


In [17]:
from sklearn.model_selection import RandomizedSearchCV

n_estimators =[50, 100, 300, 500, 1000]
min_samples_split = [2,3,5,7,9]
min_samples_leaf =[1,2,4,6,8]
max_features = ['auto', 'sqrt', 'log2', None]

grid= {'n_estimators':n_estimators, 
       'min_samples_split':min_samples_split, 
       'min_samples_leaf':min_samples_leaf, 
       'max_features':max_features}

randomsearch_1 = RandomizedSearchCV(ExtraTreesClassifier(), grid, 
                                  cv=5, n_iter = 10, scoring= 'accuracy', 
                                  n_jobs = -1, verbose = 1, random_state=1)

In [18]:
randomsearch_1.fit(X_train_scaled, y_train).best_params_

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  1.2min finished


{'n_estimators': 1000,
 'min_samples_split': 2,
 'min_samples_leaf': 8,
 'max_features': None}

#### The best parameters are listed above

In [19]:
# do a random search with just the grid and random state is equal to 1

from sklearn.model_selection import RandomizedSearchCV

n_estimators =[50, 100, 300, 500, 1000]
min_samples_split = [2,3,5,7,9]
min_samples_leaf =[1,2,4,6,8]
max_features = ['auto', 'sqrt', 'log2', None]

grid= {'n_estimators':n_estimators, 
       'min_samples_split':min_samples_split, 
       'min_samples_leaf':min_samples_leaf, 
       'max_features':max_features}

randomsearch_2 = RandomizedSearchCV(ExtraTreesClassifier(), grid, random_state=1)

In [20]:
optimised_model = randomsearch_2.fit(X_train_scaled,y_train)

In [21]:
accuracy_score(y_test, optimised_model.predict(X_test_scaled))

0.927

In [22]:
accuracy_score(y_test, extratrees.predict(X_test_scaled))

0.928

**The accuracy_score for the optimized model is lower than the ordinary extratrees model**  

$ 0.927 < 0.928 $

In [23]:
optimised_model.best_estimator_.feature_importances_

array([0.13781783, 0.13995627, 0.13387734, 0.13527902, 0.00372314,
       0.00534333, 0.00533867, 0.00508831, 0.10304473, 0.10846463,
       0.11214794, 0.10991877])

In [29]:
Feat_importance = pd.DataFrame()
Feat_importance['features'] = X_test.columns
Feat_importance['importance'] = optimised_model.best_estimator_.feature_importances_
Feat_importance

Unnamed: 0,features,importance
0,tau1,0.137818
1,tau2,0.139956
2,tau3,0.133877
3,tau4,0.135279
4,p1,0.003723
5,p2,0.005343
6,p3,0.005339
7,p4,0.005088
8,g1,0.103045
9,g2,0.108465


In [32]:
Feat_importance.sort_values('importance').reset_index().iloc[[0,-1], :]

Unnamed: 0,index,features,importance
0,4,p1,0.003723
11,1,tau2,0.139956


**The feature with the highest and lowest feature importance are the 'tau2' and the 'p1' respectively**