In [1]:
import pandas as pd
import numpy as np

In [2]:
glass = pd.read_csv('glass.csv')

In [3]:
glass.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,1
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,1


In [4]:
glass.isnull().sum()

RI      0
Na      0
Mg      0
Al      0
Si      0
K       0
Ca      0
Ba      0
Fe      0
Type    0
dtype: int64

# Random Forest

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix

In [6]:
X = glass.drop(columns=['Type'], axis=1)
y = glass['Type']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)

In [8]:
X_train.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe
79,1.5159,12.82,3.52,1.9,72.86,0.69,7.97,0.0,0.0
161,1.51934,13.64,3.54,0.75,72.65,0.16,8.89,0.15,0.24
109,1.51818,13.72,0.0,0.56,74.45,0.0,10.99,0.0,0.0
127,1.52081,13.78,2.28,1.43,71.99,0.49,9.85,0.0,0.17
95,1.5186,13.36,3.43,1.43,72.26,0.51,8.6,0.0,0.0


In [9]:
y_train.head()

79     2
161    3
109    2
127    2
95     2
Name: Type, dtype: int64

In [10]:
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

RandomForestClassifier()

In [11]:
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

In [12]:
print(accuracy)

0.8372093023255814


### Random Forest Accuracy: 83.7%

# Random Search

In [13]:
from sklearn.model_selection import RandomizedSearchCV

In [14]:
# specify distributions to sample from
param_dist = {'n_estimators': list(range(100, 300, 10)),
              'min_samples_leaf': list(range(1, 50)),
              'max_depth': list(range(2, 20)),
              'max_features': ['auto', 'sqrt'],
              'bootstrap': [True, False]}

# specify number of search iterations
n_iter_search = 50

# Instantiate RandomSearchCV
model_random_search = RandomizedSearchCV(estimator=rf_model,
                                         param_distributions=param_dist,
                                         n_iter=n_iter_search)


# Fit the selected model
model_random_search.fit(X_train, y_train)

RandomizedSearchCV(estimator=RandomForestClassifier(), n_iter=50,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [2, 3, 4, 5, 6, 7, 8, 9,
                                                      10, 11, 12, 13, 14, 15,
                                                      16, 17, 18, 19],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 3, 4, 5, 6,
                                                             7, 8, 9, 10, 11,
                                                             12, 13, 14, 15, 16,
                                                             17, 18, 19, 20, 21,
                                                             22, 23, 24, 25, 26,
                                                             27, 28, 29, 30, ...],
                                        'n_estimators': [100, 110, 120, 130,
     

In [15]:
y_pred_random = model_random_search.predict(X_test)
accuracy_random = accuracy_score(y_test, y_pred_random)

In [16]:
print(accuracy_random)

0.8604651162790697


### Random Search Accuracy: 86%

In [38]:
cv_results_random = pd.DataFrame(model_gridsearch.cv_results_)

print(cv_results_random.loc[cv_results_random['rank_test_score'] == 1, 'params'])

print(model_random_search.best_params_)

109    {'bootstrap': True, 'max_depth': 10, 'max_feat...
Name: params, dtype: object
{'n_estimators': 230, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': 18, 'bootstrap': True}


In [39]:
print(model_random_search.best_estimator_)

RandomForestClassifier(max_depth=18, min_samples_leaf=2, n_estimators=230)


# Grid Search

In [40]:
from sklearn.model_selection import GridSearchCV

In [41]:
# Define the grid
param_grid = {'n_estimators': [50, 100, 200, 300],
              'min_samples_leaf': [1, 5, 10],
              'max_depth': [2, 4, 6, 8, 10],
              'max_features': ['auto', 'sqrt'],
              'bootstrap': [True, False]}

# Instantiate GridSearchCV
model_gridsearch = GridSearchCV(estimator=rf_model,
                                param_grid=param_grid,
                                scoring='accuracy',
                                n_jobs=4,
                                cv=5,
                                refit=True,
                                return_train_score=True
                                )


# Fit the selected model
model_gridsearch.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=4,
             param_grid={'bootstrap': [True, False],
                         'max_depth': [2, 4, 6, 8, 10],
                         'max_features': ['auto', 'sqrt'],
                         'min_samples_leaf': [1, 5, 10],
                         'n_estimators': [50, 100, 200, 300]},
             return_train_score=True, scoring='accuracy')

In [42]:
y_pred_grid = model_gridsearch.predict(X_test)
accuracy_grid = accuracy_score(y_test, y_pred_grid)

In [43]:
print(accuracy_grid)

0.8372093023255814


### Grid Search Accuracy: 83.7%

In [45]:
model_gridsearch.get_params

<bound method BaseEstimator.get_params of GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=4,
             param_grid={'bootstrap': [True, False],
                         'max_depth': [2, 4, 6, 8, 10],
                         'max_features': ['auto', 'sqrt'],
                         'min_samples_leaf': [1, 5, 10],
                         'n_estimators': [50, 100, 200, 300]},
             return_train_score=True, scoring='accuracy')>

In [46]:
cv_results_df = pd.DataFrame(model_gridsearch.cv_results_)
print(cv_results_df.loc[cv_results_df['rank_test_score'] == 1, 'params'])
print(model_gridsearch.best_params_)

86    {'bootstrap': True, 'max_depth': 8, 'max_featu...
98    {'bootstrap': True, 'max_depth': 10, 'max_feat...
Name: params, dtype: object
{'bootstrap': True, 'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'n_estimators': 200}


In [47]:
print(model_gridsearch.best_estimator_)


RandomForestClassifier(max_depth=8, max_features='sqrt', n_estimators=200)
