In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [2]:
dataset = pd.read_csv('Data_for_UCI_named.csv')
dataset.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


In [3]:
df = dataset.drop(columns='stab')
df.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,unstable


##### Encode the Target variable

In [4]:
#encode categorical variable
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

encoder.fit_transform(df['stabf'])
df['stabf'] = encoder.fit_transform(df['stabf'])

df.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,1
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,0
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,1
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,1
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,1


In [5]:
encoder.classes_

array(['stable', 'unstable'], dtype=object)

In [6]:
X = df.drop(columns='stabf')
y = df['stabf']

##### Split the data to  train and test sets

In [7]:
x_train, x_test, y_train, y_test = train_test_split(X, y, 
                            test_size=0.20, random_state=1,)

In [8]:
std_scaler = StandardScaler()

In [9]:
x_train_scaled = std_scaler.fit_transform(x_train, y_train)

In [10]:
x_test_scaled = std_scaler.transform(x_test)

### Random forest

In [11]:
from sklearn.ensemble import RandomForestClassifier

In [12]:
random_forest_clf = RandomForestClassifier()

In [13]:
random_forest_clf.fit(x_train_scaled, y_train)

RandomForestClassifier()

###### accuracy on the test set using the random forest classifier

In [14]:
rf_pred = random_forest_clf.predict(x_test_scaled)

accuracy_score(y_test, rf_pred)

0.9205

### Extra Trees 

In [15]:
from sklearn.model_selection import RandomizedSearchCV

In [16]:
from sklearn.ensemble import ExtraTreesClassifier

In [17]:
extra_trees_clf = ExtraTreesClassifier()

In [18]:
extra_trees_clf.fit(x_train_scaled, y_train)

ExtraTreesClassifier()

###### Random search - extra trees

In [19]:
params = {
    'n_estimators': [100,300,500,1000],
    'min_samples_split': [2,5,7],
    'min_samples_leaf': [4,6,8],
    'max_features': ['auto', 'log2', None]
}

In [20]:
extr_rand_search = RandomizedSearchCV(ExtraTreesClassifier(), params, n_iter=10, scoring='accuracy', 
           verbose=1, random_state=1)

extr_rand_search.fit(x_train_scaled, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


RandomizedSearchCV(estimator=ExtraTreesClassifier(),
                   param_distributions={'max_features': ['auto', 'log2', None],
                                        'min_samples_leaf': [4, 6, 8],
                                        'min_samples_split': [2, 5, 7],
                                        'n_estimators': [100, 300, 500, 1000]},
                   random_state=1, scoring='accuracy', verbose=1)

In [21]:
extr_rand_search.best_estimator_

ExtraTreesClassifier(max_features=None, min_samples_leaf=6, min_samples_split=7)

In [23]:
type(extr_rand_search.best_estimator_)

sklearn.ensemble._forest.ExtraTreesClassifier

In [24]:
best_extra_trees_clf = extr_rand_search.best_estimator_
print(best_extra_trees_clf)

ExtraTreesClassifier(max_features=None, min_samples_leaf=6, min_samples_split=7)


In [26]:
best_extra_trees_clf.fit(x_train_scaled, y_train)

ExtraTreesClassifier(max_features=None, min_samples_leaf=6, min_samples_split=7)

In [27]:
# accuracy of optimised extra trees classifier
bet_ped = best_extra_trees_clf.predict(x_test_scaled)

accuracy_score(y_test, bet_ped)

0.928

In [28]:
# accuracy of initial extra trees classifier
extr_pred = extra_trees_clf.predict(x_test_scaled)

accuracy_score(y_test, extr_pred)

0.926

###### Extra Trees - feature importance

In [35]:
idx_most_important = np.argmax(extra_trees_clf.feature_importances_)
idx_most_important

1

In [37]:
idx_least_important = np.argmin(extra_trees_clf.feature_importances_)
idx_least_important

4

In [39]:
most_important_feature = df.columns[idx_most_important]
least_important_feature = df.columns[idx_least_important]

print(f'most_important_feature: {most_important_feature}, least_important_feature: {least_important_feature}')

most_important_feature: tau2, least_important_feature: p1


### XGBoost

In [40]:
from xgboost import XGBClassifier

xgboost_clf = XGBClassifier()

In [41]:
xgboost_clf.fit(x_train_scaled, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

###### accuracy on the test set using the XGboost classifier

In [42]:
xgb_pred = xgboost_clf.predict(x_test_scaled)

accuracy_score(y_test, xgb_pred)

0.9455

### LightGBM

In [43]:
from lightgbm import LGBMClassifier

In [44]:
lgbm_clf = LGBMClassifier()

lgbm_clf.fit(x_train_scaled, y_train)

LGBMClassifier()

###### accuracy on the test set using the LGBM classifier

In [45]:
lgbm_pred = lgbm_clf.predict(x_test_scaled)

accuracy_score(y_test, lgbm_pred)

0.9395