In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the materials data set. The Windows path is written as a raw string so
# that its backslashes (e.g. "\P", "\M") are never parsed as escape sequences.
MEAO = pd.read_excel(r'G:\Priyabrata\ML\Material data repository.xlsx')

# Positional columns 2..10 hold the candidate input variables.
INPUT = MEAO.iloc[:, 2:11]
# Column 20: phase labels (SS, SS+IM, AM). Column 19 was an alternative target.
TARGET = MEAO.iloc[:, 20]

# Column 22: coded phases used as the classification target. 0:SS, 1:SS+IM, 2:AM
# (column 23 holds an alternative 4-class coding. 0:MSS, 1:SSS, 2:SS+IM, 3:AM)
Y = MEAO.iloc[:, 22]

# Keep only the descriptors that are not explicitly excluded below.
selected_INPUT = INPUT.drop(
    columns=['Mixing entropy', 'γ', 'VEC', 'Pauling electronegativity', 'Molar volume dispersity']
)

# Z-score standardisation: subtract the column mean and divide by the column
# standard deviation (pandas' default sample std), computed over all rows.
normalized_INPUT = (selected_INPUT - selected_INPUT.mean()) / selected_INPUT.std()

In [2]:
#Random Forest for supervised learning
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

# Hold out 30% of the samples for testing. The features are the standardised
# descriptors in normalized_INPUT and Y contains the coded phases
# (0:SS, 1:SS+IM, 2:AM). stratify=Y keeps the class proportions the same in the
# train and test parts, and random_state fixes the shuffle so that the numbers
# printed below can be reproduced on a re-run.
X_train, X_test, y_train, y_test = train_test_split(
    normalized_INPUT, Y, test_size=0.30, stratify=Y, random_state=1
)

# Seed the forest as well; otherwise every run grows different trees.
rfc = RandomForestClassifier(random_state=1)
rfc.fit(X_train, y_train)
pred_rfc = rfc.predict(X_test)

# Predictions and Evaluations
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, pred_rfc))

print(classification_report(y_test, pred_rfc))
# Accuracy on this single hold-out split, shown as the cell's result.
metrics.accuracy_score(y_test, pred_rfc)

[[43  3  0]
 [ 3 14  0]
 [ 0  0 11]]
              precision    recall  f1-score   support

           0       0.93      0.93      0.93        46
           1       0.82      0.82      0.82        17
           2       1.00      1.00      1.00        11

    accuracy                           0.92        74
   macro avg       0.92      0.92      0.92        74
weighted avg       0.92      0.92      0.92        74



0.918918918918919

In [3]:
#Stratified k fold cross validation (for imbalanced datasets)
from numpy import mean
from numpy import std
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score

# Single 5-fold stratified splitter (kept for reference) and the repeated
# variant actually used below: 5 folds x 10 repeats = 50 train/test splits.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
nskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=1)

# The splitters are seeded, so seed the forest too; without it the reported
# mean and standard deviation change from run to run.
model = RandomForestClassifier(random_state=1)

# Evaluate the model: one accuracy value per split, computed in parallel.
scores = cross_val_score(model, normalized_INPUT, Y, scoring='accuracy', cv=nskf, n_jobs=-1)

# Report performance as mean (standard deviation) over all splits.
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))



Accuracy: 0.868 (0.049)


In [4]:
#Tuning Rf classifier through grid search
from sklearn.model_selection import RandomizedSearchCV
from pprint import pprint

# Candidate values for each random-forest hyper-parameter.

# Number of trees in the forest
n_estimators = [5, 10, 20, 30, 40, 50, 60, 70]

# Minimum number of samples needed to split a node: 2..7
min_samples_split = list(range(2, 8))
# Minimum number of samples allowed in a leaf: 1..24
min_samples_leaf = list(range(1, 25))
# Maximum depth of each tree: 1..10
max_depth = list(range(1, 11))

# Whether each tree is trained on a bootstrap sample.
# Defined here but not included in random_grid below.
bootstrap = [True, False]

# Assemble the parameter grid, keyed by estimator argument name.
random_grid = dict(
    n_estimators=n_estimators,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    max_depth=max_depth,
)
pprint(random_grid)

{'min_samples_leaf': [3, 4, 5, 6],
 'min_samples_split': [3, 4, 5, 6, 7],
 'n_estimators': [5, 10, 20, 30, 40, 50, 60]}


In [6]:
# Exhaustive grid search over random_grid for the best hyper-parameters.
from sklearn.model_selection import GridSearchCV

# Base model to tune. It is seeded so that differences between candidates come
# from the hyper-parameters rather than from random tree construction.
rf = RandomForestClassifier(random_state=1)

# GridSearchCV evaluates every combination in random_grid (not a random
# subset), scores each one by macro-averaged F1 on the splits produced by
# nskf, and uses all available cores. verbose=1 still reports the total number
# of fits without printing a progress line per batch.
rf_grid = GridSearchCV(rf, param_grid=random_grid, verbose=1, cv=nskf, scoring="f1_macro", n_jobs=-1)

# Fit the search and show the best parameter combination as the cell's result.
rf_grid.fit(normalized_INPUT, Y)
rf_grid.best_params_

Fitting 50 folds for each of 140 candidates, totalling 7000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0309s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0611s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1708s.) Setting batch_size=8.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 100 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 384 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 504 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed:  

{'min_samples_leaf': 4, 'min_samples_split': 6, 'n_estimators': 40}

In [10]:
# rf_grid is the GridSearchCV object itself, not a fitted forest, so
# cross_val_score re-runs the whole hyper-parameter search on the training part
# of every outer split (nested cross-validation). That is why this cell is slow.
model_best = rf_grid

# Outer splitter defined explicitly: 5 folds x 5 repeats = 25 fits, matching
# the recorded run log. Previously this relied on `nskf`, whose value depended
# on whether the later cell that reassigns it had already been executed.
outer_cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=1)
scores = cross_val_score(model_best, normalized_INPUT, Y, scoring='accuracy', verbose=2, cv=outer_cv, n_jobs=-1)

# Report performance as mean (standard deviation) over the outer splits.
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  23 out of  25 | elapsed: 26.6min remaining:  2.3min


Accuracy: 0.872 (0.044)


[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed: 33.2min finished


In [9]:
#Repeated stratified k fold
from sklearn.model_selection import RepeatedStratifiedKFold

# Repeated stratified CV: 5 folds x 5 repeats = 25 train/test splits.
nskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=1)

# Forest with hand-set hyper-parameters.
# max_features='sqrt' replaces the former 'auto', which meant the same thing for
# classifiers but is rejected by current scikit-learn releases.
# random_state makes the reported score reproducible.
rm = RandomForestClassifier(
    n_estimators=30,
    max_depth=6,
    max_leaf_nodes=10,
    min_samples_split=6,
    min_samples_leaf=3,
    max_features='sqrt',
    bootstrap=True,
    n_jobs=-1,
    random_state=1,
)

scores = cross_val_score(rm, normalized_INPUT, Y, scoring='f1_macro', verbose=4, cv=nskf, n_jobs=-1)

# Report performance. The metric is macro-averaged F1 (see `scoring` above),
# so it is labelled as such rather than as accuracy.
print('Macro F1: %.3f (%.3f)' % (mean(scores), std(scores)))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done  17 out of  25 | elapsed:    0.7s remaining:    0.3s


Accuracy: 0.842 (0.042)


[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:    0.9s finished
