In [2]:
#Importing the libraries needed to process the data
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [3]:
#Reading the data from the file location
data = pd.read_csv('Data_for_UCI_named.csv')

In [4]:
#This will display the first three rows of the dataset
data.head(3)

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable


In [19]:
#Check the dataset for missing values in each of the columns
data.isna().sum()

tau1     0
tau2     0
tau3     0
tau4     0
p1       0
p2       0
p3       0
p4       0
g1       0
g2       0
g3       0
g4       0
stab     0
stabf    0
dtype: int64

In [20]:
#Dimension of the dataset
data.shape

(10000, 14)

In [21]:
#Checking for duplicate rows
data.duplicated().any()

False

### Because of the direct relationship between 'stab' and 'stabf' ('stabf = 'stable if 'stab'<=0, 'unstable' otherwise), 'stab' should be dropped and 'stabf' will remain as the sole depedent variable (binary classification)


In [22]:
data.drop('stab',axis=1,inplace=True)
data.head(2)

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,stable


In [23]:
data.stabf.value_counts()

unstable    6380
stable      3620
Name: stabf, dtype: int64

In [24]:
#Seperate the dataset into attributes and target, that is the X and y variables

X = data.drop('stabf',axis=1)
y = data['stabf']

In [25]:
#Split the dataset into 80-20 train/test split with a random state of "1"
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [8]:
#Use StanderScaler to transform the train_set and the test_set
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
normalized_x_train = pd.DataFrame(scaler.fit_transform(x_train),columns=x_train.columns)
normalized_x_test = pd.DataFrame(scaler.transform(x_test),columns=x_test.columns)

In [26]:
#Importing  all the classifiers required and the metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import recall_score, classification_report, accuracy_score,precision_score,f1_score

## Question_4
## My_Answer = {'n_estimators': 1000,
## 'min_samples_split': 2,
## 'min_samples_leaf': 8,
## 'max_features': None}

In [27]:
tree = ExtraTreesClassifier(random_state=1)
tree.fit(normalized_x_train, y_train)
tree_pred = tree.predict(normalized_x_test)
print(classification_report(y_test,tree_pred,digits=4))

              precision    recall  f1-score   support

      stable     0.9410    0.8511    0.8938       712
    unstable     0.9218    0.9705    0.9455      1288

    accuracy                         0.9280      2000
   macro avg     0.9314    0.9108    0.9197      2000
weighted avg     0.9287    0.9280    0.9271      2000



In [28]:
n_estimators = [50,100,300,500,1000]
min_samples_split = [2,3,5,7,9]
min_samples_leaf = [1,2,4,6,8]
max_features = ['auto','sqrt','log2',None]
hyperparameter_grid = {'n_estimators': n_estimators,
                    'min_samples_leaf': min_samples_leaf,
                       'min_samples_split': min_samples_split,
                    'max_features': max_features}

In [29]:
from sklearn.model_selection import RandomizedSearchCV
rsv = RandomizedSearchCV(estimator=tree,param_distributions=hyperparameter_grid, cv=5, scoring='accuracy',n_iter=10, n_jobs=-1, verbose=1,random_state=1)
search = rsv.fit(normalized_x_train, y_train)
search.best_params_

Fitting 5 folds for each of 10 candidates, totalling 50 fits


{'n_estimators': 1000,
 'min_samples_split': 2,
 'min_samples_leaf': 8,
 'max_features': None}

## Question_9
## My_Answer= tau2, p1 

In [30]:
# this function returns the importance of every features
def get_feature_importance(model,feat,col_name):
    importance = pd.Series(model.feature_importances_,feat.columns).sort_values()
    importance_df = pd.DataFrame(importance).reset_index()
    importance_df.columns = ['Features',col_name]
    importance_df[col_name].round(3)
    return importance_df

In [16]:
feature_importance = get_feature_importance(tree_best, normalized_x_train,'Feature_Importance')
feature_importance

Unnamed: 0,Features,Feature_Importance
0,p1,0.003683
1,p4,0.004962
2,p2,0.005337
3,p3,0.005429
4,g1,0.102562
5,g2,0.107578
6,g4,0.109541
7,g3,0.113063
8,tau3,0.13468
9,tau4,0.135417


## Question_14
## My_Answer = 0.9195

In [31]:
xgb = XGBClassifier(random_state=1,learning_rate = 0.1, max_depth=3,eval_metric='error')
xgb.fit(normalized_x_train, y_train)
xgb_pred = xgb.predict(normalized_x_test)
print(classification_report(y_test,xgb_pred,digits=4))

              precision    recall  f1-score   support

      stable     0.9206    0.8469    0.8822       712
    unstable     0.9190    0.9596    0.9389      1288

    accuracy                         0.9195      2000
   macro avg     0.9198    0.9033    0.9105      2000
weighted avg     0.9195    0.9195    0.9187      2000



## Question_16
## My_Answer = Lower  because, tree_accuracy = 0.9280 and tree_best_accuracy = 0.9270 and taking the difference we have 0.001. Thus its best not to tune paramaters when accuracy is high

In [32]:
tree_best = ExtraTreesClassifier(**search.best_params_, random_state=1)
tree_best.fit(normalized_x_train,y_train)
tree_pred = tree_best.predict(normalized_x_test)
print(classification_report(y_test,tree_pred,digits=4))

              precision    recall  f1-score   support

      stable     0.9211    0.8694    0.8945       712
    unstable     0.9300    0.9589    0.9442      1288

    accuracy                         0.9270      2000
   macro avg     0.9256    0.9141    0.9193      2000
weighted avg     0.9268    0.9270    0.9265      2000



## Question_17
## My_Answer = 0.9395

In [36]:
lgbm = LGBMClassifier(random_state=1)
lgbm.fit(normalized_x_train, y_train)
lgbm_pred = lgbm.predict(normalized_x_test)
print(classification_report(y_test,lgbm_pred,digits=4))

              precision    recall  f1-score   support

      stable     0.9276    0.9003    0.9138       712
    unstable     0.9458    0.9612    0.9534      1288

    accuracy                         0.9395      2000
   macro avg     0.9367    0.9307    0.9336      2000
weighted avg     0.9393    0.9395    0.9393      2000



## Question_18
## My_Answer = 0.9290

In [37]:
forest = RandomForestClassifier(random_state=1)
forest.fit(normalized_x_train,y_train)
rf_pred = forest.predict(normalized_x_test)
print(classification_report(y_test,rf_pred,digits=4))

              precision    recall  f1-score   support

      stable     0.9191    0.8778    0.8980       712
    unstable     0.9341    0.9573    0.9456      1288

    accuracy                         0.9290      2000
   macro avg     0.9266    0.9176    0.9218      2000
weighted avg     0.9288    0.9290    0.9286      2000



## Question_19
## My_Answer = f1_score: 0.2636

In [35]:
tp = 255
fp = 1380
fn = 45
tn = 20
precision= (tp/(tp+fp))
recall= (tp/(tp+fn))
F1 = 2*(precision*recall)/(precision+recall)
print('f1_score:',F1)
round(F1,4)

f1_score: 0.2635658914728682


0.2636