In [1]:
# Machine Learning Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score,classification_report

import pandas as pd
import numpy as np

In [2]:
# Handle Warnings
import warnings
warnings.filterwarnings('ignore')  

In [4]:
# create dataframes
X_train = pd.read_csv("/kaggle/input/machine-parts/X_train.csv")
X_test = pd.read_csv("/kaggle/input/machine-parts/X_test.csv")
y_train = pd.read_csv("/kaggle/input/machine-parts/y_train.csv")
y_test = pd.read_csv("/kaggle/input/machine-parts/y_test.csv")

In [6]:
# Logistic Regression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred_lr = logreg.predict(X_test)

log_train = round(logreg.score(X_train, y_train) * 100, 2)
log_accuracy = round(accuracy_score(y_pred_lr, y_test) * 100, 2)


print("Training Accuracy    :",log_train ,"%")
print("Model Accuracy Score :",log_accuracy ,"%")
print("\033[1m--------------------------------------------------------\033[0m")
print("Logistic Regression Classification_Report: \n",classification_report(y_test,y_pred_lr))
print("\033[1m--------------------------------------------------------\033[0m")

Training Accuracy    : 96.71 %
Model Accuracy Score : 97.15 %
[1m--------------------------------------------------------[0m
Logistic Regression Classification_Report: 
               precision    recall  f1-score   support

           0       0.97      1.00      0.99      1933
           1       0.89      0.36      0.52        22
           2       0.00      0.00      0.00         7
           3       0.83      0.36      0.50        14
           4       0.00      0.00      0.00         5
           5       0.00      0.00      0.00        19

    accuracy                           0.97      2000
   macro avg       0.45      0.29      0.33      2000
weighted avg       0.96      0.97      0.96      2000

[1m--------------------------------------------------------[0m


In [7]:
# Support Vector Machines
svc = SVC()
svc.fit(X_train, y_train)
y_pred_svc = svc.predict(X_test)

svc_train = round(svc.score(X_train, y_train) * 100, 2)
svc_accuracy = round(accuracy_score(y_pred_svc, y_test) * 100, 2)

print("Training Accuracy    :",svc_train ,"%")
print("Model Accuracy Score :",svc_accuracy ,"%")
print("\033[1m--------------------------------------------------------\033[0m")
print("SVC Classification_Report: \n",classification_report(y_test,y_pred_svc))
print("\033[1m--------------------------------------------------------\033[0m")


Training Accuracy    : 96.52 %
Model Accuracy Score : 96.65 %
[1m--------------------------------------------------------[0m
SVC Classification_Report: 
               precision    recall  f1-score   support

           0       0.97      1.00      0.98      1933
           1       0.00      0.00      0.00        22
           2       0.00      0.00      0.00         7
           3       0.00      0.00      0.00        14
           4       0.00      0.00      0.00         5
           5       0.00      0.00      0.00        19

    accuracy                           0.97      2000
   macro avg       0.16      0.17      0.16      2000
weighted avg       0.93      0.97      0.95      2000

[1m--------------------------------------------------------[0m


In [5]:
# Random Forest
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, y_train.values.ravel())
y_pred_rf = random_forest.predict(X_test)
random_forest.score(X_train, y_train)

random_forest_train = round(random_forest.score(X_train, y_train.values.ravel()) * 100, 2)
random_forest_accuracy = round(accuracy_score(y_pred_rf, y_test.values.ravel()) * 100, 2)

print("Training Accuracy    :",random_forest_train ,"%")
print("Model Accuracy Score :",random_forest_accuracy ,"%")
print("\033[1m--------------------------------------------------------\033[0m")
print("Random Forest Classification_Report: \n",classification_report(y_test,y_pred_rf))
print("\033[1m--------------------------------------------------------\033[0m")

Training Accuracy    : 100.0 %
Model Accuracy Score : 99.6 %
[1m--------------------------------------------------------[0m
Random Forest Classification_Report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1933
           1       0.95      0.95      0.95        22
           2       1.00      0.86      0.92         7
           3       0.87      0.93      0.90        14
           4       0.00      0.00      0.00         5
           5       1.00      1.00      1.00        19

    accuracy                           1.00      2000
   macro avg       0.80      0.79      0.80      2000
weighted avg       0.99      1.00      0.99      2000

[1m--------------------------------------------------------[0m


#### Random Forest performs best. It is still not accurately predicting class four, but it is doing well in all other classes.

## 5. Hyperparameter Search

In [10]:
from sklearn.model_selection import RandomizedSearchCV

In [15]:
print('Parameters currently in use:\n')
print(random_forest.get_params())

Parameters currently in use:

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


In [20]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 500, num = 10)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [2, 4]

# Minimum number of samples required to split a node
min_samples_split = [2, 5]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [10, 64, 118, 173, 227, 282, 336, 391, 445, 500], 'max_features': ['auto', 'sqrt'], 'max_depth': [2, 4], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2], 'bootstrap': [True, False]}


In [21]:
rf_RandomGrid = RandomizedSearchCV(estimator = random_forest, param_distributions = random_grid,
                                   cv = 10, verbose=2, n_jobs = 4)

In [22]:
rf_RandomGrid.fit(X_train, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

In [23]:
rf_RandomGrid.best_params_

{'n_estimators': 64,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 4,
 'bootstrap': True}

In [25]:
# Random Forest
random_forest = RandomForestClassifier(**rf_RandomGrid.best_params_)
random_forest.fit(X_train, y_train.values.ravel())
y_pred_rf = random_forest.predict(X_test)
random_forest.score(X_train, y_train)

random_forest_train = round(random_forest.score(X_train, y_train.values.ravel()) * 100, 2)
random_forest_accuracy = round(accuracy_score(y_pred_rf, y_test.values.ravel()) * 100, 2)

print("Training Accuracy    :",random_forest_train ,"%")
print("Model Accuracy Score :",random_forest_accuracy ,"%")
print("\033[1m--------------------------------------------------------\033[0m")
print("Random Forest Classification_Report: \n",classification_report(y_test,y_pred_rf))
print("\033[1m--------------------------------------------------------\033[0m")

Training Accuracy    : 99.38 %
Model Accuracy Score : 99.4 %
[1m--------------------------------------------------------[0m
Random Forest Classification_Report: 
               precision    recall  f1-score   support

           0       0.99      1.00      1.00      1933
           1       1.00      0.86      0.93        22
           2       1.00      0.71      0.83         7
           3       1.00      0.86      0.92        14
           4       0.00      0.00      0.00         5
           5       0.95      1.00      0.97        19

    accuracy                           0.99      2000
   macro avg       0.82      0.74      0.78      2000
weighted avg       0.99      0.99      0.99      2000

[1m--------------------------------------------------------[0m
[CV] END bootstrap=False, max_depth=2, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=500; total time=   2.5s
[CV] END bootstrap=False, max_depth=2, max_features=sqrt, min_samples_leaf=2, min_samples_s

### Hypertuned model performs worse. It is possible that random search does not work well with highly imbalanced data.