In [1]:
#import required libraries
import pandas as pd

In [2]:
# Read Data_for_UCI_named.csv into `df` and display its first five rows
df = pd.read_csv('Data_for_UCI_named.csv')
df.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


In [3]:
# Drop the numeric 'stab' column; the categorical 'stabf' label is kept and is
# used as the prediction target further down.
# NOTE: this rebinds `df`, so re-running the cell without reloading the CSV
# raises a KeyError because 'stab' has already been removed.
df = df.drop(columns=['stab'])

In [4]:
# Count the missing values in each remaining column
df.isna().sum()

tau1     0
tau2     0
tau3     0
tau4     0
p1       0
p2       0
p3       0
p4       0
g1       0
g2       0
g3       0
g4       0
stabf    0
dtype: int64

In [5]:
# Split the frame into the label series and the predictor columns:
# 'stabf' is the target, every other column is a feature.
target = df['stabf']
features = df.drop('stabf', axis=1)

In [6]:
#import module for splitting data
from sklearn.model_selection import train_test_split

In [7]:
# Hold out 20% of the rows as the test set; random_state pins the shuffle so
# the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=1,
)

In [8]:
# Normalize the features to a common scale using the min-max scaler
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training features only, then reuse the fitted scaler
# for the test features so the test set never influences the scaling.
scaler = MinMaxScaler()

# Keep the original row index on both scaled frames so they stay aligned with
# y_train / y_test, and leave X_test itself untouched instead of overwriting it
# (re-binding X_test here made the cell mutate the output of the split cell).
normalised_train_df = pd.DataFrame(
    scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
)
normalised_test_df = pd.DataFrame(
    scaler.transform(X_test), columns=X_test.columns, index=X_test.index
)

# NOTE(review): the model cells in this notebook fit and predict on
# X_train / X_test, not on the normalised_* frames built here — confirm whether
# the scaled data was meant to be used.

# RandomForest Classifier

In [9]:
# Train a random forest classifier on the training split
from sklearn.ensemble import RandomForestClassifier

# Seeded so the fitted forest is reproducible. The variable is named
# `regressor`, but it holds a classifier.
regressor = RandomForestClassifier(random_state=1)

# Fit on the training features and labels
regressor.fit(X_train, y_train)

RandomForestClassifier(random_state=1)

In [10]:
# Predict the class label of every test row with the fitted random forest
predictions1 = regressor.predict(X_test)

In [11]:
#import metric modules
from sklearn.metrics import recall_score, accuracy_score, precision_score, f1_score, confusion_matrix

## Question 1

What is the accuracy on the test set using the random forest classifier, to 4 decimal places?

In [12]:
# Accuracy: share of test rows whose predicted label equals the true label
accuracy = accuracy_score(y_test, predictions1)
print(f'Accuracy: {accuracy:.4f}')

Accuracy: 0.9290


In [13]:
# Precision with 'stable' as the positive class: of the rows predicted
# 'stable', the share that are truly 'stable'
precision = precision_score(y_test, predictions1, pos_label='stable')
print(f'Precision: {precision:.4f}')

Precision: 0.9191


In [14]:
# Recall with 'stable' as the positive class: of the truly 'stable' rows,
# the share the model predicted as 'stable'
recall = recall_score(y_test, predictions1, pos_label='stable')
print(f'Recall: {recall:.4f}')

Recall: 0.8778


In [15]:
# F1 score for the 'stable' class (combines the precision and recall above)
f1 = f1_score(y_test, predictions1, pos_label='stable')
print(f'F1: {f1:.4f}')

F1: 0.8980


### Cross-Validation

In [16]:
# 5-fold cross-validation of the random forest on the training split, scored
# with macro-averaged F1; the last line displays the five per-fold scores.
from sklearn.model_selection import cross_val_score
scores = cross_val_score(regressor, X_train, y_train, cv=5, scoring='f1_macro')
scores

array([0.91143756, 0.91136454, 0.91564855, 0.90214725, 0.91555674])

### Confusion Matrix

In [17]:
# Confusion matrix of the random forest predictions on the test set.
# `labels` fixes the class order to ['stable', 'unstable'] on both axes.
cnf_mat = confusion_matrix(y_true=y_test, y_pred=predictions1, labels=['stable', 'unstable'])
cnf_mat

array([[ 625,   87],
       [  55, 1233]], dtype=int64)

### Classification Report for Random Forest Classifier

In [18]:
# classification_report prints per-class precision, recall, F1 and support
# together with the overall accuracy and averaged scores in a single table.
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions1))

              precision    recall  f1-score   support

      stable       0.92      0.88      0.90       712
    unstable       0.93      0.96      0.95      1288

    accuracy                           0.93      2000
   macro avg       0.93      0.92      0.92      2000
weighted avg       0.93      0.93      0.93      2000



# ExtraTrees Classifier

In [19]:
from sklearn.ensemble import ExtraTreesClassifier

# Baseline extra-trees classifier: only the seed is set, every other
# hyperparameter is left at its default
Ext_clf = ExtraTreesClassifier(random_state=1)

# Fit the classifier on the training features and labels
Ext_clf.fit(X_train, y_train)

ExtraTreesClassifier(random_state=1)

In [20]:
# Predict the class label of every test row with the fitted extra-trees model
predictions2 = Ext_clf.predict(X_test)

In [21]:
# Test-set accuracy of the baseline extra-trees model
accuracy = accuracy_score(y_test, predictions2)
print(f'Accuracy: {accuracy:.4f}')

Accuracy: 0.9280


### Cross-Validation 

In [22]:
# 5-fold cross-validation of the extra-trees model on the training split,
# scored with macro-averaged F1; the last line displays the per-fold scores.
from sklearn.model_selection import cross_val_score
scores = cross_val_score(Ext_clf, X_train, y_train, cv=5, scoring='f1_macro')
scores

array([0.91104788, 0.90890596, 0.91695487, 0.91218763, 0.92433757])

### Confusion Matrix

In [23]:
# Confusion matrix of the extra-trees predictions on the test set, with the
# class order fixed to ['stable', 'unstable'] on both axes.
cnf_mat = confusion_matrix(y_true=y_test, y_pred=predictions2, labels=['stable', 'unstable'])
cnf_mat

array([[ 606,  106],
       [  38, 1250]], dtype=int64)

### Classification Report for Extra Trees Classifier

In [24]:
# Per-class precision / recall / F1 summary for the extra-trees predictions
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions2))

              precision    recall  f1-score   support

      stable       0.94      0.85      0.89       712
    unstable       0.92      0.97      0.95      1288

    accuracy                           0.93      2000
   macro avg       0.93      0.91      0.92      2000
weighted avg       0.93      0.93      0.93      2000



# XGB Classifier

In [25]:
from xgboost import XGBClassifier

In [26]:
# Fit an XGBoost classifier (fixed seed) to the training data.
# NOTE(review): y_train holds the string labels 'stable' / 'unstable'; newer
# xgboost releases may require integer-encoded class labels — verify against
# the installed version before re-running.
model = XGBClassifier(random_state=1)
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=1,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [27]:
# Predict the class label of every test row with the fitted XGBoost model
predictions3 = model.predict(X_test)

## Question 2

What is the accuracy on the test set using the XGBoost classifier, to 4 decimal places?

In [28]:
# Test-set accuracy of the XGBoost model
accuracy = accuracy_score(y_test, predictions3)
print(f'Accuracy: {accuracy:.4f}')

Accuracy: 0.9455


### Cross-Validation 

In [29]:
# 5-fold cross-validation of the XGBoost model on the training split, scored
# with macro-averaged F1; the last line displays the per-fold scores.
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, X_train, y_train, cv=5, scoring='f1_macro')
scores

array([0.9366625 , 0.94470459, 0.93293748, 0.93646215, 0.94718274])

### Confusion Matrix

In [30]:
# Confusion matrix of the XGBoost predictions on the test set, with the class
# order fixed to ['stable', 'unstable'] on both axes.
cnf_mat = confusion_matrix(y_true=y_test, y_pred=predictions3, labels=['stable', 'unstable'])
cnf_mat

array([[ 648,   64],
       [  45, 1243]], dtype=int64)

### Classification Report for XGB Classifier

In [31]:
# Per-class precision / recall / F1 summary for the XGBoost predictions
print(classification_report(y_test, predictions3))

              precision    recall  f1-score   support

      stable       0.94      0.91      0.92       712
    unstable       0.95      0.97      0.96      1288

    accuracy                           0.95      2000
   macro avg       0.94      0.94      0.94      2000
weighted avg       0.95      0.95      0.95      2000



# LGBM Classifier

In [32]:
from lightgbm import LGBMClassifier

In [33]:
# Build a LightGBM classifier with a fixed seed and train it on the
# training features and labels
lgb = LGBMClassifier(random_state=1)
lgb.fit(X_train, y_train)

LGBMClassifier(random_state=1)

In [34]:
# Predict the class label of every test row with the fitted LightGBM model
predictions4 = lgb.predict(X_test)

## Question 3

What is the accuracy on the test set using the LGBM classifier, to 4 decimal places?

In [35]:
# Test-set accuracy of the LightGBM model
accuracy = accuracy_score(y_test, predictions4)
print(f'Accuracy: {accuracy:.4f}')

Accuracy: 0.9375


### Cross-Validation

In [36]:
# 5-fold cross-validation of the LightGBM model on the training split, scored
# with macro-averaged F1; the last line displays the per-fold scores.
from sklearn.model_selection import cross_val_score
scores = cross_val_score(lgb, X_train, y_train, cv=5, scoring='f1_macro')
scores

array([0.92843386, 0.94817937, 0.92658942, 0.92809053, 0.94694022])

### Confusion Matrix

In [37]:
# Confusion matrix of the LightGBM predictions on the test set, with the class
# order fixed to ['stable', 'unstable'] on both axes.
cnf_mat = confusion_matrix(y_true=y_test, y_pred=predictions4, labels=['stable', 'unstable'])
cnf_mat

array([[ 636,   76],
       [  49, 1239]], dtype=int64)

### Classification Report for LGBM Classifier

In [38]:
# Per-class precision / recall / F1 summary for the LightGBM predictions
print(classification_report(y_test, predictions4))

              precision    recall  f1-score   support

      stable       0.93      0.89      0.91       712
    unstable       0.94      0.96      0.95      1288

    accuracy                           0.94      2000
   macro avg       0.94      0.93      0.93      2000
weighted avg       0.94      0.94      0.94      2000



# Randomized Search CV

In [39]:
from sklearn.model_selection import RandomizedSearchCV

## Question 4

To improve the ExtraTreesClassifier, you will use the following parameters (number of estimators, minimum number of samples required to split an internal node, minimum number of samples required at a leaf node, and the number of features to consider when looking for the best split) for the hyperparameter grid needed to run a Randomized Cross Validation Search (RandomizedSearchCV).

n_estimators = [50, 100, 300, 500, 1000]

min_samples_split = [2, 3, 5, 7, 9]

min_samples_leaf = [1, 2, 4, 6, 8]

max_features = ['auto', 'sqrt', 'log2', None]

hyperparameter_grid = {'n_estimators': n_estimators,
                    'min_samples_split': min_samples_split,
                    'min_samples_leaf': min_samples_leaf,
                    'max_features': max_features}
                
Using the ExtraTreesClassifier as your estimator with cv = 5, n_iter = 10, scoring = 'accuracy', n_jobs = -1, verbose = 1 and random_state = 1, what are the best hyperparameters from the RandomizedSearchCV?

In [40]:
# Candidate values for each hyperparameter explored by the randomized search
n_estimators = [50, 100, 300, 500, 1000]
min_samples_split = [2, 3, 5, 7, 9]
min_samples_leaf = [1, 2, 4, 6, 8]
# NOTE(review): 'auto' is kept because the exercise specifies it; confirm the
# installed scikit-learn still accepts it as a max_features value.
max_features = ['auto', 'sqrt', 'log2', None]

# Map each ExtraTreesClassifier parameter name to its list of candidates
hyperparameter_grid = {
    'n_estimators': n_estimators,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'max_features': max_features,
}

In [41]:
# Randomized search over the grid: 10 sampled candidates, each scored by
# 5-fold cross-validated accuracy; random_state pins which candidates are drawn.
rand = RandomizedSearchCV(
    estimator=Ext_clf,
    param_distributions=hyperparameter_grid,
    n_iter=10,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
    random_state=1,
)

In [42]:
# Run the randomized search on the training split (10 candidates x 5 folds,
# i.e. 50 fits)
rand.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  1.1min finished


RandomizedSearchCV(cv=5, estimator=ExtraTreesClassifier(random_state=1),
                   n_jobs=-1,
                   param_distributions={'max_features': ['auto', 'sqrt', 'log2',
                                                         None],
                                        'min_samples_leaf': [1, 2, 4, 6, 8],
                                        'min_samples_split': [2, 3, 5, 7, 9],
                                        'n_estimators': [50, 100, 300, 500,
                                                         1000]},
                   random_state=1, scoring='accuracy', verbose=1)

In [43]:
# Print the hyperparameter combination selected by the randomized search
print(rand.best_params_)

{'n_estimators': 1000, 'min_samples_split': 2, 'min_samples_leaf': 8, 'max_features': None}


## Question 5

Train a new ExtraTreesClassifier Model with the new hyperparameters from the RandomizedSearchCV (with random_state =1). Is the accuracy of the new optimal model higher or lower than the initial ExtraTreesClassifier model with no hyperparameter tuning?

In [44]:
# Rebuild the extra-trees model with the hyperparameters reported by the search.
# NOTE: this rebinds `Ext_clf`, so the untuned baseline fitted earlier is no
# longer reachable under that name.
Ext_clf = ExtraTreesClassifier(
    n_estimators=1000,
    min_samples_split=2,
    min_samples_leaf=8,
    max_features=None,
    random_state=1,
)

In [45]:
# Train the extra-trees classifier configured with the tuned hyperparameters
Ext_clf.fit(X_train, y_train)

ExtraTreesClassifier(max_features=None, min_samples_leaf=8, n_estimators=1000,
                     random_state=1)

In [46]:
# Predict the class label of every test row with the tuned extra-trees model
predictions5 = Ext_clf.predict(X_test)

In [47]:
# Test-set accuracy of the tuned extra-trees model
accuracy = accuracy_score(y_test, predictions5)
print(f'Accuracy: {accuracy:.4f}')
# Result: 0.9270, slightly lower than the untuned model's 0.9280

Accuracy: 0.9270


## Question 6

Find the feature importance using the optimal ExtraTreesClassifier model. Which features are the most and least important respectively?

In [48]:
# Feature importances of the tuned extra-trees model, one value per feature
# column in the order tau1..tau4, p1..p4, g1..g4
Ext_clf.feature_importances_

array([0.13723979, 0.14050787, 0.1346805 , 0.13541662, 0.00368361,
       0.0053368 , 0.00542927, 0.0049625 , 0.10256224, 0.1075776 ,
       0.11306257, 0.10954062])

In [49]:
#from above, 0.14050787 and 0.00368361 are the largest and smallest importances; by column order they correspond to tau2 (most important) and p1 (least important) respectively