In [72]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

from lightgbm import LGBMClassifier as lgbmc

from xgboost import XGBClassifier


In [58]:
df = pd.read_csv("Data_for_UCI_named.csv")

In [59]:
df.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


In [60]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   tau1    10000 non-null  float64
 1   tau2    10000 non-null  float64
 2   tau3    10000 non-null  float64
 3   tau4    10000 non-null  float64
 4   p1      10000 non-null  float64
 5   p2      10000 non-null  float64
 6   p3      10000 non-null  float64
 7   p4      10000 non-null  float64
 8   g1      10000 non-null  float64
 9   g2      10000 non-null  float64
 10  g3      10000 non-null  float64
 11  g4      10000 non-null  float64
 12  stab    10000 non-null  float64
 13  stabf   10000 non-null  object 
dtypes: float64(13), object(1)
memory usage: 1.1+ MB


In [61]:
df.isnull().sum()

tau1     0
tau2     0
tau3     0
tau4     0
p1       0
p2       0
p3       0
p4       0
g1       0
g2       0
g3       0
g4       0
stab     0
stabf    0
dtype: int64

In [62]:
df = df.drop('stab', axis=1)

In [63]:
scaler = StandardScaler()
label = LabelEncoder()

In [64]:
X = df.drop('stabf', axis=1)
y = df['stabf']

In [65]:
X_scaled = scaler.fit_transform(X)
y_enc = label.fit_transform(y)

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=1)

In [77]:
extra_tree = ExtraTreesClassifier(random_state=1)
params = {
  "n_estimators": [100, 300, 500],
  "max_depth": [2, 4, 6],
  "min_samples_split": [2,4, 6, 8],
  "criterion": ["gini", "entropy"],
  "min_samples_leaf": [1, 2, 4, 8],
  "max_features": ["log2", "auto", None]
}

In [78]:
rnd_extra_tree = RandomizedSearchCV(extra_tree, params, verbose=1, scoring="accuracy", cv=3)

In [79]:
rnd_extra_tree.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


RandomizedSearchCV(cv=3, estimator=ExtraTreesClassifier(random_state=1),
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': [2, 4, 6],
                                        'max_features': ['log2', 'auto', None],
                                        'min_samples_leaf': [1, 2, 4, 8],
                                        'min_samples_split': [2, 4, 6, 8],
                                        'n_estimators': [100, 300, 500]},
                   scoring='accuracy', verbose=1)

In [83]:
rnd_extra_tree.best_estimator_.feature_importances_.argmax()

1

In [27]:
xgb = XGBClassifier(random_state=1)

In [28]:
xgb.fit(X_train, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=4,
              num_parallel_tree=1, predictor='auto', random_state=1,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [29]:
y_pred = xgb.predict(X_test)

In [32]:
accuracy_score(y_test, y_pred)

0.9455

In [33]:
tree_predict = rnd_extra_tree.predict(X_test)

In [34]:
accuracy_score(y_test, tree_predict)

0.743

In [36]:
rnd_forest = RandomForestClassifier(random_state=1)

In [37]:
rnd_forest.fit(X_train, y_train)

RandomForestClassifier(random_state=1)

In [38]:
rnd_pred = rnd_forest.predict(X_test)

In [39]:
accuracy_score(y_test, rnd_pred)

0.929

In [74]:
lgbc = lgbmc()

lgbc.fit(X_train, y_train)

LGBMClassifier()

In [75]:
lgbc_pred = lgbc.predict(X_test)

In [76]:
accuracy_score(y_test, lgbc_pred)

0.94