In [1]:
#import necessary libraries

import numpy as np                  #linear algebra
import pandas as pd                 #data processing and reading in data
import matplotlib.pyplot as plt     #plotting graphs
%matplotlib inline                  
import seaborn as sns               #for visualization

In [2]:
#Question 1

TP = 355    #TP is true positives
FP = 1480   #FP is false positives
FN = 45     #FN is false negatives
TN = 120    #TN is true negatives

precision  = TP / (TP + FP)  
recall = TP / (TP + FN)    
F1_score = 2 *((precision * recall)/(precision + recall))
print(np.round(F1_score, 4))

0.3177


In [3]:
#Question 12
#entropy = - (p1log p1 + p2log p2)

In [57]:
df = pd.read_csv('dataset.csv')       #read in the data
data = df.copy()           #make a copy of data so as not to mess up the original

In [58]:
#a view of the first five columns of the data frame
data.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


In [6]:
#check the data type and missing values
data.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   tau1    10000 non-null  float64
 1   tau2    10000 non-null  float64
 2   tau3    10000 non-null  float64
 3   tau4    10000 non-null  float64
 4   p1      10000 non-null  float64
 5   p2      10000 non-null  float64
 6   p3      10000 non-null  float64
 7   p4      10000 non-null  float64
 8   g1      10000 non-null  float64
 9   g2      10000 non-null  float64
 10  g3      10000 non-null  float64
 11  g4      10000 non-null  float64
 12  stab    10000 non-null  float64
 13  stabf   10000 non-null  object 
dtypes: float64(13), object(1)
memory usage: 1.1+ MB


In [7]:
# import all models
from sklearn.preprocessing import StandardScaler        # for features scaling
from sklearn.preprocessing import LabelEncoder          # encode categorical features to numeric values
from sklearn.model_selection import train_test_split    # for splitting into training and testing set
from sklearn.model_selection import RandomizedSearchCV  # for selecting best hyperparameter combiation
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier     # algorithm for training and testing
from xgboost import XGBClassifier                       # extreme boosting algorithm
from lightgbm import LGBMClassifier                               # extreme boosting algorithm

#evaluation metric
from sklearn.metrics import accuracy_score

In [92]:
#instantiate label encoder
le = LabelEncoder()

#create new column for encoded data
data['stabf'] = le.fit(data['stabf']).transform(data['stabf'])

In [93]:
#drop the stab column since it is directly related to the stabf column 
data.drop('stab', axis=1, inplace=True)

KeyError: "['stab'] not found in axis"

In [94]:
#instantiate Standard scaler
sc = StandardScaler()

#scale the data... scaling is done to center the data distribution around zero and standardize its standard deviation to 1
scaled = pd.DataFrame(sc.fit_transform(data), columns = data.columns)
#X_test = sc.transform(X_test)

In [95]:
# split data into independent(X) and dependent(y) variables

X = scaled.drop('stabf', axis=1)
y = data['stabf']

In [96]:
#split data into training and testing sets, where testing set is 20%(0.2) of the whole dataset

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state =1)

In [97]:
#train using random forest and extreme tree classifier

#instantiate the models
rfr = RandomForestClassifier()
extr = ExtraTreesClassifier()

#train data
rfr.fit(X_train, y_train)
extr.fit(X_train, y_train)

ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100,
                     n_jobs=None, oob_score=False, random_state=None, verbose=0,
                     warm_start=False)

In [86]:
#use extreme boosting algorithms

xgb = XGBClassifier()
lgb = LGBMClassifier()

#train data
xgb.fit(X_train, y_train)
lgb.fit(X_train, y_train)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [98]:
#predictions on test data

print('random forest classifier accuracy: ', rfr.predict(y_test))
print('extra trees classifier accuracy: ', extr.predict(y_test))
print('xgboost classifier accuracy: ', xgb.predict(y_test))
print('lightgbm classifier accuracy: ', lgb.predict(y_test))

ValueError: Expected 2D array, got 1D array instead:
array=[1. 1. 0. ... 0. 1. 1.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [16]:
#hyperparameters for randomized search cv
n_estimators = [50, 100, 300, 500, 1000]
min_samples_split = [2, 3, 5, 7, 9]
min_samples_leaf = [1, 2, 4, 6, 8]
max_features = ['auto', 'sqrt', 'log2', None] 
hyperparameter_grid = {'n_estimators': n_estimators,
                       'min_samples_leaf': min_samples_leaf,
                       'min_samples_split': min_samples_split,
                       'max_features': max_features}


In [28]:
ran = RandomizedSearchCV(estimator = extr, param_distributions = hyperparameter_grid, random_state=1)

In [29]:
ran.fit(X_train, y_train)

RandomizedSearchCV(cv=None, error_score=nan,
                   estimator=ExtraTreesClassifier(bootstrap=False,
                                                  ccp_alpha=0.0,
                                                  class_weight=None,
                                                  criterion='gini',
                                                  max_depth=None,
                                                  max_features='auto',
                                                  max_leaf_nodes=None,
                                                  max_samples=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=2,
                                                  min_weight_fraction_leaf=0.0,
                                     

In [30]:
print('best score is: ', ran.best_score_)         # best accuracy score
print('best prameters are: ', ran.best_params_) #best hyper parameter combination

best score is:  0.9235
best prameters are:  {'n_estimators': 1000, 'min_samples_split': 2, 'min_samples_leaf': 8, 'max_features': None}


In [31]:
best_extr = ExtraTreesClassifier(n_estimators = 1000, min_samples_split= 2, min_samples_leaf= 8, max_features= None)

In [32]:
best_extr.fit(X_train, y_train)

ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features=None,
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=8, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=1000,
                     n_jobs=None, oob_score=False, random_state=None, verbose=0,
                     warm_start=False)

In [40]:
feature_importance = best_extr.feature_importances_   #to check features in order of importance

In [41]:
feature_importance

array([0.1374549 , 0.14016436, 0.13443813, 0.13476841, 0.00382746,
       0.00546479, 0.00536244, 0.00513378, 0.10310134, 0.10834763,
       0.11229864, 0.10963813])

In [24]:
plt.bar( feature_importance)
#plt.xlabel('columns')
plt.ylabel('feature importance')
plt.title('how important each feature are to the model')
plt.show()

TypeError: bar() missing 1 required positional argument: 'height'

In [None]:
#function to check feature importances

def plot_feature_importance(importance,names,model_type):

    #Create arrays from feature importance and feature names
    feature_importance = np.array(importance)
    feature_names = np.array(names)

    #Create a DataFrame using a Dictionary
    data={'feature_names':feature_names,'feature_importance':feature_importance}
    fi_df = pd.DataFrame(data)

    #Sort the DataFrame in order decreasing feature importance
    fi_df.sort_values(by=['feature_importance'], ascending=False,inplace=True)

    #Define size of bar plot
    plt.figure(figsize=(10,8))
    #Plot Searborn bar chart
    sns.barplot(x=fi_df['feature_importance'], y=fi_df['feature_names'])
    #Add chart labels
    plt.title(model_type + ' FEATURE IMPORTANCE')
    plt.xlabel('FEATURE IMPORTANCE')
    plt.ylabel('FEATURE NAMES')

In [None]:
plot_feature_importance(best_extr.get_feature_importance(),X.columns,'Extra Trees')