### **Importing Libraries**

In [44]:
import  numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#Classification Models
from xgboost import  XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
#Model Evaluation
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

from sklearn.model_selection import RandomizedSearchCV
#Preprocessing Libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

### **Importing Dataset**

In [5]:
data = pd.read_csv('StageC_project/Data_for_UCI_named.csv')

In [6]:
data.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


### **Exploratory Data Analysis**

In [7]:
#Checking the shape of the dataset
data.shape

(10000, 14)

In [8]:
#Checking for missing values
data.isnull().sum().sum()

0

In [10]:
#General information about the dataset
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   tau1    10000 non-null  float64
 1   tau2    10000 non-null  float64
 2   tau3    10000 non-null  float64
 3   tau4    10000 non-null  float64
 4   p1      10000 non-null  float64
 5   p2      10000 non-null  float64
 6   p3      10000 non-null  float64
 7   p4      10000 non-null  float64
 8   g1      10000 non-null  float64
 9   g2      10000 non-null  float64
 10  g3      10000 non-null  float64
 11  g4      10000 non-null  float64
 12  stab    10000 non-null  float64
 13  stabf   10000 non-null  object 
dtypes: float64(13), object(1)
memory usage: 1.1+ MB


In [11]:
## Columns of the dataset
data.columns

Index(['tau1', 'tau2', 'tau3', 'tau4', 'p1', 'p2', 'p3', 'p4', 'g1', 'g2',
       'g3', 'g4', 'stab', 'stabf'],
      dtype='object')

In [12]:
## Target variable distribution
data['stabf'].value_counts(normalize = True)

unstable    0.638
stable      0.362
Name: stabf, dtype: float64

In [9]:
#Descriptive Analysis of the dataset
data.describe()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5.25,5.250001,5.250004,5.249997,3.75,-1.25,-1.25,-1.25,0.525,0.525,0.525,0.525,0.015731
std,2.742548,2.742549,2.742549,2.742556,0.75216,0.433035,0.433035,0.433035,0.274256,0.274255,0.274255,0.274255,0.036919
min,0.500793,0.500141,0.500788,0.500473,1.58259,-1.999891,-1.999945,-1.999926,0.050009,0.050053,0.050054,0.050028,-0.08076
25%,2.874892,2.87514,2.875522,2.87495,3.2183,-1.624901,-1.625025,-1.62496,0.287521,0.287552,0.287514,0.287494,-0.015557
50%,5.250004,5.249981,5.249979,5.249734,3.751025,-1.249966,-1.249974,-1.250007,0.525009,0.525003,0.525015,0.525002,0.017142
75%,7.62469,7.624893,7.624948,7.624838,4.28242,-0.874977,-0.875043,-0.875065,0.762435,0.76249,0.76244,0.762433,0.044878
max,9.999469,9.999837,9.99945,9.999443,5.864418,-0.500108,-0.500072,-0.500025,0.999937,0.999944,0.999982,0.99993,0.109403


### **Data Cleaning**

In [13]:
#Encoding the target variable Unstable = 0, Stable = 1
target_dict = {'unstable': 0, 'stable': 1}
data['stabf'] = data['stabf'].map(target_dict)

In [15]:
data.head(3)

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,0
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,1
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,0


In [19]:
X = data.drop(['stab', 'stabf'], axis = 1)
y = data['stabf']

### **Dataset Splitting**

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [24]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((8000, 12), (2000, 12), (8000,), (2000,))

### **Data Scaling**

In [26]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [27]:
X_train_scaled

array([[ 0.36732671, -0.98604156,  0.65044706, ...,  0.33985949,
         0.58556788,  0.49223946],
       [-0.06465869,  0.08943734,  1.03507899, ..., -1.5584875 ,
         1.42964862, -1.44352101],
       [-1.46785   ,  1.29841758, -0.50253617, ...,  1.45153362,
        -1.04574277,  0.49248925],
       ...,
       [ 0.65760851, -0.72275633, -1.4058879 , ...,  0.29310048,
        -1.55058661,  0.81034412],
       [-0.05931596, -1.26053241, -1.01047147, ..., -0.38825455,
        -0.72678059,  1.66791568],
       [-1.47321368,  0.63843757,  0.25012249, ..., -1.17410957,
         1.179282  ,  0.78362657]])

In [28]:
X_test_scaled

array([[ 0.59395058, -0.41273345,  1.50392381, ...,  1.1672034 ,
        -1.50732963,  1.08472557],
       [ 0.2021896 ,  0.37441634, -0.18880047, ..., -0.39566024,
         1.41465051,  1.22601069],
       [-1.079044  , -0.31374544, -0.88463426, ..., -1.43849538,
         0.65182081, -1.6821675 ],
       ...,
       [ 0.94782488, -1.66372653, -1.65391963, ...,  0.12639128,
         0.57344494,  1.31934985],
       [-1.1202346 ,  0.19397855, -0.2378051 , ...,  0.79408717,
        -1.36232268, -0.80197116],
       [-1.37764025,  1.51186671,  0.28265058, ..., -0.91749729,
         0.00295027,  1.18902334]])

# **Model Fitting**

## **RandomForest**

In [30]:
model_rf = RandomForestClassifier(random_state = 1)
model_rf.fit(X_train_scaled, y_train)
pred_rf = model_rf.predict(X_test_scaled)

In [33]:
print(f'Train Accuracy : {accuracy_score(y_train, model_rf.predict(X_train_scaled))}')
print(f'Test Accuracy : {accuracy_score(y_test, pred_rf)}')

Train Accuracy : 1.0
Test Accuracy : 0.9295


## **Extra Tree Classifier**

In [34]:
model_et = ExtraTreesClassifier(random_state = 1)
model_et.fit(X_train_scaled, y_train)
pred_et = model_et.predict(X_test_scaled)

In [35]:
print(f'Train Accuracy : {accuracy_score(y_train, model_et.predict(X_train_scaled))}')
print(f'Test Accuracy : {accuracy_score(y_test, pred_et)}')

Train Accuracy : 1.0
Test Accuracy : 0.9285


## **XGBoost Classifier**

In [36]:
model_xgb = XGBClassifier(random_state = 1)
model_xgb.fit(X_train_scaled, y_train)
pred_xgb = model_xgb.predict(X_test_scaled)

In [37]:
print(f'Train Accuracy : {accuracy_score(y_train, model_xgb.predict(X_train_scaled))}')
print(f'Test Accuracy : {accuracy_score(y_test, pred_xgb)}')

Train Accuracy : 0.951875
Test Accuracy : 0.9195


## **LGBMClassifier**

In [38]:
model_lgb = LGBMClassifier(random_state = 1)
model_lgb.fit(X_train_scaled, y_train)
pred_lgb = model_lgb.predict(X_test_scaled)

In [39]:
print(f'Train Accuracy : {accuracy_score(y_train, model_lgb.predict(X_train_scaled))}')
print(f'Test Accuracy : {accuracy_score(y_test, pred_lgb)}')

Train Accuracy : 0.99825
Test Accuracy : 0.9375


# **Hyperparameter tuning**

In [48]:
# Number of trees in random forest
n_estimators = range(100, 1000, 100)
# Number of features to consider at every split
max_features = ['auto', 'log2', None]
# Minimum number of samples required to split a node
min_samples_split = [2,3, 5, 7]
# Minimum number of samples required at each leaf node
min_samples_leaf = [4,6,8,10]


random_grid = {'n_estimators': n_estimators, 'max_features': max_features, 'min_samples_split': min_samples_split, 'min_samples_leaf': min_samples_leaf}

In [49]:
model = ExtraTreesClassifier(random_state = 1)

clf = RandomizedSearchCV(model, random_grid, cv=5, n_iter=10, scoring = 'accuracy', n_jobs = -1, verbose = 1 , random_state = 1)
search = clf.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [50]:
clf.best_params_

{'max_features': None,
 'min_samples_leaf': 4,
 'min_samples_split': 2,
 'n_estimators': 500}

## **New ExtraTreeClassifier Model (Tuned)**

In [51]:
model_etn = ExtraTreesClassifier(max_features = None, min_samples_leaf =  4, min_samples_split =  2, n_estimators=  500, random_state = 1)
model_etn.fit(X_train_scaled, y_train)
pred_etn = model_et.predict(X_test_scaled)

In [52]:
print(f'Train Accuracy : {accuracy_score(y_train, model_etn.predict(X_train_scaled))}')
print(f'Test Accuracy : {accuracy_score(y_test, pred_etn)}')

Train Accuracy : 0.99925
Test Accuracy : 0.9285


## **Feature Importance**

### **Extra Tree Classifier**

In [43]:
pd.DataFrame(model_et.feature_importances_, index  = X.columns, columns = ['feature_importance']).sort_values('feature_importance', ascending = False)

Unnamed: 0,feature_importance
tau2,0.118445
tau1,0.117397
tau4,0.115466
tau3,0.113169
g3,0.096883
g4,0.094019
g2,0.093676
g1,0.089783
p3,0.040706
p4,0.040579
