## Import the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
## Import the dataset

In [3]:
df = pd.read_csv("Data_for_UCI_named.csv")

In [4]:
# EDA
df.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


In [5]:
df.dtypes

tau1     float64
tau2     float64
tau3     float64
tau4     float64
p1       float64
p2       float64
p3       float64
p4       float64
g1       float64
g2       float64
g3       float64
g4       float64
stab     float64
stabf     object
dtype: object

In [6]:
df.isnull().sum()

tau1     0
tau2     0
tau3     0
tau4     0
p1       0
p2       0
p3       0
p4       0
g1       0
g2       0
g3       0
g4       0
stab     0
stabf    0
dtype: int64

In [7]:
# This confirms that there is no missing data

In [8]:
df.shape

(10000, 14)

In [9]:
# The dataset consists of 10,000 rows and 14 columns
# The predictive features are 12: 'tau1' to 'tau4'; 'p1' to 'p4', and 'g1' to 'g4'
# 'stab' and 'stabf' are the dependent variables

In [10]:
# Data Preprocessing
# Dropping 'stab' because of the direct relationship it has with 'stabf' so 'stabf' will remain as the sole dependent variable
# Label Encoding is used to convert the label ('stabf') into a numeric form

from sklearn.preprocessing import LabelEncoder

df = df.drop(columns='stab')
label_encoder = LabelEncoder()
df['stabf'] = label_encoder.fit_transform(df['stabf'])
df['stabf'].unique()

array([1, 0])

In [11]:
df
# 0 in 'stabf' column represents a linearly stable system while 1 represents a linearly unstable system

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stabf
0,2.959060,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,1
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.781760,0
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.277210,-0.920492,0.163041,0.766689,0.839444,0.109853,1
3,0.716415,7.669600,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,1
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.797110,0.455450,0.656947,0.820923,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,2.930406,9.487627,2.376523,6.187797,3.343416,-0.658054,-1.449106,-1.236256,0.601709,0.779642,0.813512,0.608385,1
9996,3.392299,1.274827,2.954947,6.894759,4.349512,-1.663661,-0.952437,-1.733414,0.502079,0.567242,0.285880,0.366120,0
9997,2.364034,2.842030,8.776391,1.008906,4.299976,-1.380719,-0.943884,-1.975373,0.487838,0.986505,0.149286,0.145984,0
9998,9.631511,3.994398,2.757071,7.821347,2.514755,-0.966330,-0.649915,-0.898510,0.365246,0.587558,0.889118,0.818391,1


In [12]:
from sklearn.preprocessing import StandardScaler  

scaler = StandardScaler()

# Scale the data
scaled_df =  pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

X = df.iloc[:, :-1]
y = df.iloc[:, -1]



In [13]:
# Splitting the data using train_test_split 

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)




(8000, 12)
(2000, 12)
(8000,)
(2000,)


In [14]:
# Question 1: Accuracy of the Random Forest Classifier in 4 decimal places
# Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

# Instantiating the model
random_forest_classifier = RandomForestClassifier(random_state=1)

# training the model on the training data
random_forest_classifier.fit(x_train, y_train)

# predictions
y_pred = random_forest_classifier.predict(x_test)


from sklearn.metrics import accuracy_score


# Accuracy
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy: %.4f' % (accuracy))


Accuracy: 0.9290


## Question 3

In [15]:
# Accuracy of the LGBMClassifier in 4 d.p

from lightgbm import LGBMClassifier

# instantiating the model
lgbm_classifier = LGBMClassifier(random_state=1)

# training the model on the training data
lgbm_classifier.fit(x_train, y_train)

# predictions
y_pred_lgbm = lgbm_classifier.predict(x_test)


# accuracy
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_lgbm)
print('Accuracy: %.4f' % (accuracy))

Accuracy: 0.9390


## Question 12


In [16]:
#Given the confusion matrix = [[255,1380], [45, 20]] for the spam classification system

TP = 255
FP = 1380
FN = 45
Precision = TP/(TP + FP)
Recall = TP /(TP + FN)
F1_Score = (2 * Precision * Recall) / (Precision + Recall)

print('F1_Score: %.4f' % F1_Score)
                           

F1_Score: 0.2636


## Question 15

In [17]:
# Accuracy on the test set using the XGBoostClassifier

from xgboost import XGBClassifier

# instantiating the model
xgb_classifier = XGBClassifier(random_state=1)

# training the model on the training data
xgb_classifier.fit(x_train, y_train)

# predictions
y_pred_xgb = xgb_classifier.predict(x_test)

# accuracy
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_xgb)
print('Accuracy: %.4f' % (accuracy))



Accuracy: 0.9455


## Question 20

In [18]:
# New ExtraTree Classifier with new hyperparameters

# instatiating the model
extra_trees_classifier = ExtraTreesClassifier()

#hyperparameters
N_estimators = [1000, 100, 300, 500]
min_samples_split = [2, 7, 5, 2]
min_samples_leaf = [8, 4, 6,8]
max_features = [None, None, 'auto', 'log2']

# param_grid 
param_grid = {'n_estimators': N_estimators, 'min_samples_split': min_samples_split, 'min_samples_leaf': min_samples_leaf, 'max_features': max_features}

# import RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV

#hyperparameter tuning
randomized_search_cv = RandomizedSearchCV(extra_trees_classifier, param_grid, cv=5, n_iter=10, scoring='accuracy', n_jobs=-1, verbose=1, random_state=1)

#train the model
randomized_search_cv.fit(x_train, y_train)

#obtain the best hyperparameters
print(randomized_search_cv.best_estimator_)
print(randomized_search_cv.best_estimator_.min_samples_split)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
ExtraTreesClassifier(max_features=None, min_samples_leaf=4, n_estimators=500)
2


## Question 16

In [19]:
# ExtraTreesClassifier

# instantiating the model
extra_trees_classifier = ExtraTreesClassifier(random_state=1)

# training the model on the training data
extra_trees_classifier.fit(x_train, y_train)

# predictions
y_pred_extra_trees = extra_trees_classifier.predict(x_test)

# accuracy
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_extra_trees)
print('Accuracy: %.4f' % (accuracy))

# new ExtraTreesClassifier based on new hyperparameters obtained in Question 20 
new_extra_trees_classifier = ExtraTreesClassifier(max_features=None, min_samples_leaf=6, n_estimators=300, min_samples_split=2, random_state=1)

# fit the new ExtraTreeClassifier
new_extra_trees_classifier.fit(x_train, y_train)

#predictions
y_pred_new_extra_trees = new_extra_trees_classifier.predict(x_test)

# accuracy
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_new_extra_trees)
print('New_Accuracy: %.4f' % (accuracy))

# New Accuracy > Accuracy therefore accuracy of the new optimal model is higher 

Accuracy: 0.9280
New_Accuracy: 0.9300


## Question 2

In [20]:
# Finding the feature importance using the optimal ExtraTreeClassifier obtained in 'question 16'
def get_feature_importance(model, features):
    importance_df = pd
    importance = model.feature_importances_
    # summarize feature importance
    for i,v in enumerate(importance):
        print('Feature: %0d, %s, Score: %.5f' % (i,features.columns[i],v))
        
get_feature_importance(new_extra_trees_classifier, x_train)
#from the result tau2 (1.3839) has the highest score and p1 has the lowest score (0.00548)

Feature: 0, tau1, Score: 0.13552
Feature: 1, tau2, Score: 0.13839
Feature: 2, tau3, Score: 0.13343
Feature: 3, tau4, Score: 0.13285
Feature: 4, p1, Score: 0.00548
Feature: 5, p2, Score: 0.00740
Feature: 6, p3, Score: 0.00722
Feature: 7, p4, Score: 0.00681
Feature: 8, g1, Score: 0.10330
Feature: 9, g2, Score: 0.10753
Feature: 10, g3, Score: 0.11284
Feature: 11, g4, Score: 0.10924
