<a href="https://colab.research.google.com/github/Nwosu-Ihueze/Classification_Ecological_footprint/blob/main/Ecological_footprint.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Classification - Managing the quality metric of global ecological footprint

Electrical grids require a balance between electricity supply and demand in order to be stable. Conventional systems achieve this balance through demand-driven electricity production. For future grids with a high share of inflexible (i.e., renewable) energy sources, the concept of demand response is a promising solution: electricity consumption is adjusted in response to changes in the electricity price. In this work, we’ll build a binary classification model to predict whether a grid is stable or unstable using the UCI Electrical Grid Stability Simulated dataset.

In [1]:
# Mount Google Drive into the Colab runtime at /content/drive so files stored
# on Drive can be accessed through the local filesystem.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [12]:
# Switch the working directory to the project folder on the mounted Drive.
# NOTE(review): this absolute path is specific to one Drive layout; adjust it
# when running the notebook from a different account or environment.
%cd /content/drive/MyDrive/uci_classification/

/content/drive/MyDrive/uci_classification


In [4]:
#Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [14]:
# Load the dataset CSV into a DataFrame (the path is relative to the current
# working directory).
grid_file = pd.read_csv('Data_for_UCI_named.csv')

In [15]:
# Preview the first 10 rows of the dataset
grid_file.head(10)

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable
5,6.999209,9.109247,3.784066,4.267788,4.429669,-1.857139,-0.670397,-1.902133,0.261793,0.07793,0.542884,0.469931,-0.017385,stable
6,6.710166,3.765204,6.929314,8.818562,2.397419,-0.61459,-1.208826,-0.574004,0.17789,0.397977,0.402046,0.37663,0.005954,unstable
7,6.953512,1.379125,5.7194,7.870307,3.224495,-0.748998,-1.186517,-1.28898,0.371385,0.633204,0.732741,0.380544,0.016634,unstable
8,4.689852,4.007747,1.478573,3.733787,4.0413,-1.410344,-1.238204,-1.392751,0.269708,0.250364,0.164941,0.482439,-0.038677,stable
9,9.841496,1.413822,9.769856,7.641616,4.727595,-1.991363,-0.857637,-1.878594,0.376356,0.544415,0.792039,0.116263,0.012383,unstable


In [16]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
grid_file.describe()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5.25,5.250001,5.250004,5.249997,3.75,-1.25,-1.25,-1.25,0.525,0.525,0.525,0.525,0.015731
std,2.742548,2.742549,2.742549,2.742556,0.75216,0.433035,0.433035,0.433035,0.274256,0.274255,0.274255,0.274255,0.036919
min,0.500793,0.500141,0.500788,0.500473,1.58259,-1.999891,-1.999945,-1.999926,0.050009,0.050053,0.050054,0.050028,-0.08076
25%,2.874892,2.87514,2.875522,2.87495,3.2183,-1.624901,-1.625025,-1.62496,0.287521,0.287552,0.287514,0.287494,-0.015557
50%,5.250004,5.249981,5.249979,5.249734,3.751025,-1.249966,-1.249974,-1.250007,0.525009,0.525003,0.525015,0.525002,0.017142
75%,7.62469,7.624893,7.624948,7.624838,4.28242,-0.874977,-0.875043,-0.875065,0.762435,0.76249,0.76244,0.762433,0.044878
max,9.999469,9.999837,9.99945,9.999443,5.864418,-0.500108,-0.500072,-0.500025,0.999937,0.999944,0.999982,0.99993,0.109403


In [17]:
# Show column names, non-null counts, dtypes and memory usage
grid_file.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   tau1    10000 non-null  float64
 1   tau2    10000 non-null  float64
 2   tau3    10000 non-null  float64
 3   tau4    10000 non-null  float64
 4   p1      10000 non-null  float64
 5   p2      10000 non-null  float64
 6   p3      10000 non-null  float64
 7   p4      10000 non-null  float64
 8   g1      10000 non-null  float64
 9   g2      10000 non-null  float64
 10  g3      10000 non-null  float64
 11  g4      10000 non-null  float64
 12  stab    10000 non-null  float64
 13  stabf   10000 non-null  object 
dtypes: float64(13), object(1)
memory usage: 1.1+ MB


In [18]:
# Count the missing values in each column
grid_file.isnull().sum()

tau1     0
tau2     0
tau3     0
tau4     0
p1       0
p2       0
p3       0
p4       0
g1       0
g2       0
g3       0
g4       0
stab     0
stabf    0
dtype: int64

In [19]:
# Separate the target from the predictors.
# 'stabf' is the categorical label to predict. Both 'stab' and 'stabf' are
# removed from the feature matrix.
# NOTE(review): 'stab' is presumably dropped because 'stabf' is derived from it
# (keeping it would leak the label) — confirm against the dataset description.
data_tar = grid_file['stabf']
data_var = grid_file.drop(columns=['stab', 'stabf'])

In [20]:
#Import classification libraries
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split

In [21]:
# Hold out 20% of the rows as a test set; random_state pins the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    data_var,
    data_tar,
    test_size=0.2,
    random_state=1,
)

In [25]:
# Standardise the features with StandardScaler.
# The scaler is fitted on the training split only and then applied unchanged
# to the test split, so no test-set statistics influence training.
ss = StandardScaler()

# fit_transform/transform return plain arrays, so the DataFrames are rebuilt.
# The original column names AND row index are passed through: without
# `index=` the frames would get a fresh 0..n-1 index and no longer line up by
# label with y_train / y_test, silently misaligning any later pandas
# operation that combines features with targets.
X_train = pd.DataFrame(ss.fit_transform(X_train), columns=X_train.columns, index=X_train.index)

X_test = pd.DataFrame(ss.transform(X_test), columns=X_test.columns, index=X_test.index)

In [27]:
# Baseline model 1: random forest with default hyperparameters
# (random_state fixed so repeated runs build the same forest).
# fit() returns the estimator itself, so construction and training are chained.
Rf_clf = RandomForestClassifier(random_state=1).fit(X_train, y_train)

# Predicted class labels for the held-out test split
Rf_pred = Rf_clf.predict(X_test)

In [28]:
#Importing accuracy calculation library
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, recall_score, precision_score, f1_score, roc_curve, roc_auc_score

In [30]:
# Fraction of test rows the random forest labels correctly, rounded to 4 decimals
acc = round(accuracy_score(y_true=y_test, y_pred=Rf_pred), 4)
print(f'Accuracy: {acc}')

Accuracy: 0.929


In [31]:
#Import Extra Tree Classifier library
from sklearn.ensemble import ExtraTreesClassifier

In [32]:
# Baseline model 2: extra-trees classifier with default hyperparameters
# (random_state fixed for repeatable results).
# fit() returns the estimator itself, so construction and training are chained.
Et_clf = ExtraTreesClassifier(random_state=1).fit(X_train, y_train)

# Predicted class labels for the held-out test split
Et_pred = Et_clf.predict(X_test)

In [33]:
# Fraction of test rows the extra-trees model labels correctly, rounded to 4 decimals
acc = round(accuracy_score(y_true=y_test, y_pred=Et_pred), 4)
print(f'Accuracy: {acc}')

Accuracy: 0.928


In [34]:
# Baseline model 3: XGBoost classifier with default hyperparameters
# (random_state fixed for repeatable results).
# NOTE(review): y_train holds string labels ('stable'/'unstable'); newer
# XGBoost releases may require integer-encoded classes — confirm against the
# installed version before upgrading.
Xg_clf = XGBClassifier(random_state=1)
Xg_clf.fit(X_train, y_train)

# Predicted class labels for the held-out test split
Xg_pred = Xg_clf.predict(X_test)

In [35]:
# Fraction of test rows the XGBoost model labels correctly, rounded to 4 decimals
acc = round(accuracy_score(y_true=y_test, y_pred=Xg_pred), 4)
print(f'Accuracy: {acc}')

Accuracy: 0.9195


In [37]:
# Baseline model 4: LightGBM classifier with default hyperparameters
# (random_state fixed for repeatable results).
Lg_clf = LGBMClassifier(random_state=1)
Lg_clf.fit(X_train, y_train)

# Predicted class labels for the held-out test split
Lg_pred = Lg_clf.predict(X_test)

In [39]:
# Fraction of test rows the LightGBM model labels correctly, rounded to 4 decimals
acc = round(accuracy_score(y_true=y_test, y_pred=Lg_pred), 4)
print(f'Accuracy: {acc}')

Accuracy: 0.9375


In [40]:
# Import the randomized hyperparameter search utility
from sklearn.model_selection import RandomizedSearchCV

In [45]:
#Improving extra tree classifier
# Tune the extra-trees classifier with a randomized hyperparameter search.

# Candidate values for each hyperparameter.
# NOTE(review): 'auto' for max_features is only accepted by older scikit-learn
# releases (deprecated in 1.1, removed in 1.3); it must be dropped from this
# list before upgrading, which will also change which candidates get sampled.
n_estimators = [50, 100, 300, 500, 1000]
min_samples_leaf = [1, 2, 4, 6, 8]
min_samples_split = [2, 3, 5, 7, 9]
max_features = ['auto', 'sqrt', 'log2', None]

hyperparameter_grid = dict(
    n_estimators=n_estimators,
    min_samples_leaf=min_samples_leaf,
    min_samples_split=min_samples_split,
    max_features=max_features,
)

# Try 10 randomly drawn combinations, each scored by accuracy with 5-fold
# cross-validation. n_jobs=-1 runs the fits in parallel; random_state pins
# which combinations are drawn.
rdm_grid = RandomizedSearchCV(
    estimator=Et_clf,
    param_distributions=hyperparameter_grid,
    scoring='accuracy',
    n_iter=10,
    cv=5,
    n_jobs=-1,
    verbose=1,
    random_state=1,
)

rdm_grid.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  1.6min finished


RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=ExtraTreesClassifier(bootstrap=False,
                                                  ccp_alpha=0.0,
                                                  class_weight=None,
                                                  criterion='gini',
                                                  max_depth=None,
                                                  max_features='auto',
                                                  max_leaf_nodes=None,
                                                  max_samples=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=2,
                                                  min_weight_fraction_leaf=0.0,
                                        

In [46]:
# Show the hyperparameter combination that scored best in the randomized search
rdm_grid.best_params_

{'max_features': None,
 'min_samples_leaf': 8,
 'min_samples_split': 2,
 'n_estimators': 1000}

In [51]:
# Spam classification F1 score, worked out by hand from confusion-matrix counts.
# NOTE(review): this cell is unrelated to the grid-stability models in this
# notebook; the counts below are hard-coded rather than computed from data.
true_positives = 355
false_positives = 1480
false_negatives = 45

# precision = TP / (TP + FP), recall = TP / (TP + FN)
precision = true_positives / (true_positives + false_positives)
recall = true_positives / (true_positives + false_negatives)

# F1 is the harmonic mean of precision and recall
f1 = 2*(precision*recall)/(precision + recall)

round(f1, 4)

0.3177

In [47]:
# Test-set accuracy of the randomized-search result, rounded to 4 decimals
rdm_grid_pred = rdm_grid.predict(X_test)

acc = round(accuracy_score(y_true=y_test, y_pred=rdm_grid_pred), 4)
print(f'Accuracy: {acc}')

Accuracy: 0.927


In [49]:
# Compare a default extra-trees model with the tuned one on the same test split.

# Baseline: default hyperparameters, trained from scratch.
Et_clf = ExtraTreesClassifier(random_state=1)

Et_clf.fit(X_train, y_train)

old_acc = round(accuracy_score(y_test, Et_clf.predict(X_test)), 4)
print(f'Accuracy: {old_acc}')


# Tuned model found by the randomized search.
# NOTE: despite its name, `new_acc` holds the fitted estimator, not a score;
# the name is kept so any other cell that references it keeps working.
# No extra .fit() call is made here: the search was created with the default
# refit=True, so best_estimator_ has already been refit on the full training
# split, and fitting it again would only repeat that work.
new_acc = rdm_grid.best_estimator_

new_acc_pred = new_acc.predict(X_test)

print(f'New Accuracy: {round(accuracy_score(y_test, new_acc_pred), 4)}')

Accuracy: 0.928
New Accuracy: 0.927
