In [2]:
pip install lightgbm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
pip install xgboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [46]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier


In [28]:
# Read the dataset CSV into a DataFrame and preview its first rows.
csv_path = 'Data_for_UCI_named.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


In [29]:
# Build the target vector and the feature matrix.
# The text label `stabf` is encoded as a binary target: unstable -> 0, stable -> 1.
# Both `stabf` and the numeric `stab` column are removed from the features
# (presumably `stab` is the continuous counterpart of the label -- confirm against the dataset docs).
label_to_int = {'unstable': 0, 'stable': 1}
y = df['stabf'].map(label_to_int)
X = df.drop(columns=['stabf', 'stab'])

Split the data into an 80-20 train-test split with a random state of 1. Use the standard scaler to transform the features of the train set (x_train) and the test set (x_test); the target (y_train) is not scaled. Use scikit-learn to train a random forest classifier and an extra trees classifier, and use xgboost and lightgbm to train an extreme gradient boosting model and a light gradient boosting model. Use random_state = 1 for training all models and evaluate them on the test set.

In [33]:
# Hold out 20% of the rows as the test set; random_state=1 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

# Class counts of the training target.
y_train.value_counts()

0    5092
1    2908
Name: stabf, dtype: int64

In [34]:
# Class counts of the test target, for comparison with the training split.
y_test.value_counts()

0    1288
1     712
Name: stabf, dtype: int64

In [37]:
# Standardise the features. The scaler is fitted on the training features only,
# and that same fitted scaler is then applied to both the training and test features.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Question 14

In [38]:
# Fit a random forest with default hyperparameters; random_state=1 seeds the model.
rf = RandomForestClassifier(random_state=1).fit(X_train_scaled, y_train)
rf

In [39]:
# Test-set accuracy of the random forest classifier.
rf_test_pred = rf.predict(X_test_scaled)
accuracy_score(y_test, rf_test_pred)

0.9295

### Question 15

In [40]:
# Fit an XGBoost classifier with default hyperparameters (seeded with random_state=1)
# and report its accuracy on the scaled test set.
xgb = XGBClassifier(random_state=1)
xgb.fit(X_train_scaled, y_train)
xgb_test_pred = xgb.predict(X_test_scaled)
accuracy_score(y_true=y_test, y_pred=xgb_test_pred)

0.9455

### Question 16

In [42]:
# Fit a LightGBM classifier with default hyperparameters (seeded with random_state=1)
# and report its accuracy on the scaled test set.
lgbm = LGBMClassifier(random_state=1)
lgbm.fit(X_train_scaled, y_train)
lgbm_test_pred = lgbm.predict(X_test_scaled)
accuracy_score(y_true=y_test, y_pred=lgbm_test_pred)

0.9375

### Question 17

Using the ExtraTreesClassifier as your estimator with cv=5, n_iter=10, scoring = 'accuracy', n_jobs = -1, verbose = 1 and random_state = 1, what are the best hyperparameters from the randomized search CV?

In [45]:
# Randomized hyperparameter search for an ExtraTreesClassifier:
# 10 sampled candidates, each scored by accuracy with 5-fold cross-validation.
extra = ExtraTreesClassifier(random_state=1)

# Search space. 'sqrt' is used instead of 'auto': for classifiers 'auto' was an
# alias of 'sqrt' that was deprecated in scikit-learn 1.1 and removed in 1.3,
# where passing it raises an error.
params = {
    'n_estimators': [100, 300, 500, 1000],
    'min_samples_split': [2, 5, 7],
    'min_samples_leaf': [4, 6, 8],
    'max_features': ['log2', None, 'sqrt'],
}

Randomized_search_extra = RandomizedSearchCV(
    extra,
    param_distributions=params,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,      # use all available cores
    verbose=1,
    random_state=1,  # makes the sampled candidates reproducible
)
Randomized_search_extra.fit(X_train_scaled, y_train)

# Hyperparameter combination with the best mean cross-validated accuracy.
Randomized_search_extra.best_params_

Fitting 5 folds for each of 10 candidates, totalling 50 fits


{'n_estimators': 1000,
 'min_samples_split': 5,
 'min_samples_leaf': 6,
 'max_features': None}

In [47]:
# Test-set accuracy of the model selected by the randomized search.
tuned_test_pred = Randomized_search_extra.predict(X_test_scaled)
tuned_accuracy = accuracy_score(y_test, tuned_test_pred)
print(f'Best tuned model: {tuned_accuracy}')

Best tuned model: 0.932


### Question 18

In [48]:
# Baseline: ExtraTreesClassifier with default hyperparameters (no tuning).
extra_classifier = ExtraTreesClassifier(random_state=1)
extra_classifier.fit(X_train_scaled, y_train)
print(f'Initial model: {accuracy_score(y_test, extra_classifier.predict(X_test_scaled))}')

# Tuned: build the model from the hyperparameters chosen by the randomized search
# rather than re-typing them by hand, so this cell cannot drift out of sync with
# the search result. In the recorded run these were n_estimators=1000,
# min_samples_split=5, min_samples_leaf=6, max_features=None.
best_extra_classifier = ExtraTreesClassifier(
    **Randomized_search_extra.best_params_,
    random_state=1,
)
best_extra_classifier.fit(X_train_scaled, y_train)
print(f'Best tuned model: {accuracy_score(y_test, best_extra_classifier.predict(X_test_scaled))}')

Initial model: 0.9285
Best tuned model: 0.932


The accuracy of the new optimal model (0.932) is higher than that of the initial ExtraTreesClassifier model with no hyperparameter tuning (0.9285).

### Question 19

What other hyperparameter optimization method can you try apart from Random Search?<br>
- GridSearchCV

In [52]:
# Exhaustive grid search over the same hyperparameter space as the randomized search:
# 4 * 3 * 3 * 3 = 108 candidates, each scored by accuracy with 5-fold cross-validation.
extra = ExtraTreesClassifier(random_state=1)

# 'sqrt' is used instead of 'auto': for classifiers 'auto' was an alias of 'sqrt'
# that was deprecated in scikit-learn 1.1 and removed in 1.3, where passing it
# raises an error.
params = {
    'n_estimators': [100, 300, 500, 1000],
    'min_samples_split': [2, 5, 7],
    'min_samples_leaf': [4, 6, 8],
    'max_features': ['log2', None, 'sqrt'],
}

Grid_search_extra = GridSearchCV(
    extra,
    param_grid=params,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,  # use all available cores
    verbose=1,
)
Grid_search_extra.fit(X_train_scaled, y_train)

# Hyperparameter combination with the best mean cross-validated accuracy.
Grid_search_extra.best_params_

Fitting 5 folds for each of 108 candidates, totalling 540 fits


{'max_features': None,
 'min_samples_leaf': 4,
 'min_samples_split': 2,
 'n_estimators': 500}

In [53]:
# Test-set accuracy of the model selected by the grid search.
grid_test_pred = Grid_search_extra.predict(X_test_scaled)
accuracy_score(y_test, grid_test_pred)

0.9335

### Question 20

Find the feature importance using the optimal ExtraTreesClassifier model. Which features are the most and least important respectively?

In [55]:
# Raw importance scores of the tuned ExtraTreesClassifier, one value per feature column.
best_extra_classifier.feature_importances_

array([0.13546158, 0.13842146, 0.13312751, 0.13396645, 0.00535433,
       0.00743871, 0.00728074, 0.00687441, 0.10306406, 0.10798251,
       0.11231919, 0.10870905])

In [59]:
# Label each importance score with its feature name and list the features
# from least to most important.
importance_table = pd.DataFrame(
    {'Feature importance': best_extra_classifier.feature_importances_},
    index=X_train.columns,
)
importance_table.sort_values(by='Feature importance')

Unnamed: 0,Feature importance
p1,0.005354
p4,0.006874
p3,0.007281
p2,0.007439
g1,0.103064
g2,0.107983
g4,0.108709
g3,0.112319
tau3,0.133128
tau4,0.133966


tau2 has the highest feature importance while p1 has the lowest feature importance.