# Stage C: Tag-Along Project

#### Binary classification model to predict if a grid is stable or unstable using the UCI Electrical Grid Stability Simulated dataset.

In [2]:
import pandas as pd

In [3]:
# load data 
df = pd.read_csv('Data_for_UCI_named.csv')
df.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


In [4]:
# shape of data
df.shape

(10000, 14)

In [5]:
# check for missing numbers
df.isnull().sum()

tau1     0
tau2     0
tau3     0
tau4     0
p1       0
p2       0
p3       0
p4       0
g1       0
g2       0
g3       0
g4       0
stab     0
stabf    0
dtype: int64

In [6]:
# summary of the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   tau1    10000 non-null  float64
 1   tau2    10000 non-null  float64
 2   tau3    10000 non-null  float64
 3   tau4    10000 non-null  float64
 4   p1      10000 non-null  float64
 5   p2      10000 non-null  float64
 6   p3      10000 non-null  float64
 7   p4      10000 non-null  float64
 8   g1      10000 non-null  float64
 9   g2      10000 non-null  float64
 10  g3      10000 non-null  float64
 11  g4      10000 non-null  float64
 12  stab    10000 non-null  float64
 13  stabf   10000 non-null  object 
dtypes: float64(13), object(1)
memory usage: 1.1+ MB


In [7]:
# check for duplicated rows
df.duplicated().sum()

0

In [8]:
# summary description of the data
df.describe()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5.25,5.250001,5.250004,5.249997,3.75,-1.25,-1.25,-1.25,0.525,0.525,0.525,0.525,0.015731
std,2.742548,2.742549,2.742549,2.742556,0.75216,0.433035,0.433035,0.433035,0.274256,0.274255,0.274255,0.274255,0.036919
min,0.500793,0.500141,0.500788,0.500473,1.58259,-1.999891,-1.999945,-1.999926,0.050009,0.050053,0.050054,0.050028,-0.08076
25%,2.874892,2.87514,2.875522,2.87495,3.2183,-1.624901,-1.625025,-1.62496,0.287521,0.287552,0.287514,0.287494,-0.015557
50%,5.250004,5.249981,5.249979,5.249734,3.751025,-1.249966,-1.249974,-1.250007,0.525009,0.525003,0.525015,0.525002,0.017142
75%,7.62469,7.624893,7.624948,7.624838,4.28242,-0.874977,-0.875043,-0.875065,0.762435,0.76249,0.76244,0.762433,0.044878
max,9.999469,9.999837,9.99945,9.999443,5.864418,-0.500108,-0.500072,-0.500025,0.999937,0.999944,0.999982,0.99993,0.109403


#### Because of the direct relationship between 'stab' and 'stabf' ('stabf' = 'stable' if 'stab' <= 0, 'unstable' otherwise), 'stab' will be dropped and 'stabf' will remain as the sole dependent variable (binary classification).

In [9]:
# drop stab column
df = df.drop('stab', axis=1)
df.columns

Index(['tau1', 'tau2', 'tau3', 'tau4', 'p1', 'p2', 'p3', 'p4', 'g1', 'g2',
       'g3', 'g4', 'stabf'],
      dtype='object')

In [10]:
# converting categorical columns to numerical variables for training
# This is done because the XGBoost classifier kept showing ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1], got ['stable' 'unstable']

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

cat_cols = ['stabf']

#load the LabelEncoder
labelencoder = LabelEncoder()
df[cat_cols] = df[cat_cols].apply(lambda col: labelencoder.fit_transform(col))

#Load OneHotEncoder
onehotencoder = OneHotEncoder()
array_hot_encoded = onehotencoder.fit_transform(df[cat_cols])
data_hot_encoded = pd.DataFrame(array_hot_encoded, index=df.index)

data_other_cols = df.drop(columns = cat_cols)
data_out = pd.concat([data_hot_encoded, data_other_cols], axis=1)

In [11]:
# set the independent variables (x) and dependent variable (y)
x = df.drop(columns = 'stabf')
y = df['stabf']

In [13]:
# split data into train and test
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.2 , random_state= 1) 

In [14]:
# normalize the data using standard scaler
from sklearn.preprocessing import StandardScaler
scalar = StandardScaler()

# fit the scaler to the data, transform the data and create a dataframe with the scaled data
normalised_trainset = pd.DataFrame(scalar.fit_transform(x_train), columns=x_train.columns)
normalised_testset = pd.DataFrame(scalar.fit_transform(x_test), columns=x_test.columns)

In [15]:
# shape of train and test set
print(normalised_trainset.shape, normalised_testset.shape)

(8000, 12) (2000, 12)


## Question 14
What is the accuracy on the test set using the random forest classifier? In 4 decimal places.

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Create a decision tree classifier object
clf = RandomForestClassifier(random_state=1)

# Train the classifier on the training data
clf.fit(normalised_trainset, y_train)

# Make predictions on the testing data
y_pred = clf.predict(normalised_testset)

# Evaluate the accuracy of the classifier
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", round(accuracy, 4))

Accuracy: 0.928


## Question 15
What is the accuracy on the test set using the XGboost classifier? In 4 decimal places.

In [17]:
# Extreme Boosting Model

from xgboost import XGBClassifier

xg = XGBClassifier(random_state=1)

# Train the classifier on the training data
xg.fit(normalised_trainset, y_train)

# Predict the labels of the testing data
xgb_pred = xg.predict(normalised_testset)

# Calculate the accuracy of the classifier
accuracy = accuracy_score(y_test, xgb_pred)

print("Accuracy: ", round(accuracy, 4))

Accuracy:  0.946


## Question 16
What is the accuracy on the test set using the LGBM classifier? In 4 decimal places.

In [18]:
# light gradient boosting model

from lightgbm import LGBMClassifier

lgb = LGBMClassifier(random_state=1)

# Train the classifier on the training data
lgb.fit(normalised_trainset, y_train)

# Predict the labels of the testing data
lgb_pred = lgb.predict(normalised_testset)

# Calculate the accuracy of the classifier
accuracy = accuracy_score(y_test, lgb_pred)

print("Accuracy: ", round(accuracy, 4))

Accuracy:  0.9365


## Question 17
Using the ExtraTreesClassifier as your estimator with cv=5, n_iter=10, scoring = 'accuracy', n_jobs = -1, verbose = 1 and random_state = 1. What are the best hyperparameters from the randomized search CV?

In [20]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import RandomizedSearchCV
et = ExtraTreesClassifier(random_state = 1)

# define the search space for hyperparameters
parameters = {'n_estimators': [50, 100, 300, 500, 1000],
              'min_samples_split': [2, 3, 5, 7, 9],
              'min_samples_leaf': [1, 2, 4, 6, 8],
              'max_features': ['sqrt', 'log2', 'auto', None]}

Randomized_search = RandomizedSearchCV(estimator = et, param_distributions= parameters , random_state = 1, n_iter=10, cv=5, scoring="accuracy",
                                   n_jobs = -1, verbose = 1)

# fit the randomized search object to the data
output = Randomized_search.fit(normalised_trainset, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [21]:
# print the best combination of hyperparameters
output.best_params_

{'n_estimators': 1000,
 'min_samples_split': 2,
 'min_samples_leaf': 8,
 'max_features': None}

## Question 18
Train a new ExtraTreesClassifier Model with the new Hyperparameters from the RandomizedSearchCV (with random_state = 1). Is the accuracy of the new optimal model higher or lower than the initial ExtraTreesClassifier model with no hyperparameter tuning?

## Question 20
Find the feature importance using the optimal ExtraTreesClassifier model. Which features are the most and least important respectively?

## Question 2

In [22]:
# find f1 score
TP = 255
FN = 1380
FP = 45

Recall = TP/(TP + FN)
Precision = TP/(TP + FP)
F1_Score = 2*((Precision*Recall) / (Precision + Recall))

print(F1_Score)

0.2635658914728682
