#  ***RandomForestClassifier***

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold,GridSearchCV,train_test_split,KFold
from sklearn.metrics import log_loss,accuracy_score
from sklearn.ensemble import  RandomForestClassifier
import matplotlib.pyplot as plt


In [None]:
# Load the Wisconsin breast-cancer data and integer-encode the target column.
cancer = pd.read_csv("Cases/Wisconsin/BreastCancer.csv")
lbl = LabelEncoder()

X = cancer.drop("Class", axis=1)
# Keep the target 1-D: scikit-learn estimators expect y of shape (n_samples,),
# and a single-column DataFrame makes fit() emit a DataConversionWarning.
y = lbl.fit_transform(cancer['Class'])

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=23, test_size=0.2, stratify=y)

rtc = RandomForestClassifier(max_features=3)
rtc.fit(X_train, y_train)

# Column 1 of predict_proba is the probability of the class encoded as 1.
y_pred_proba = rtc.predict_proba(X_test)[:, 1]
print("Log_loss of Random Forest Classifier:", log_loss(y_test, y_pred_proba))
y_pred = rtc.predict(X_test)
print("Accuracy Score Random Forest Classifier:", accuracy_score(y_test, y_pred))

# Baseline for comparison: a single, fully grown decision tree on the same split.
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(max_depth=None)
dtc.fit(X_train, y_train)
y_pred_proba = dtc.predict_proba(X_test)
print("Log_loss of Decision Tree Classifier:", log_loss(y_test, y_pred_proba))
y_pred = dtc.predict(X_test)
print("Accuracy Score of Decision Tree Classifier:", accuracy_score(y_test, y_pred))

In [None]:
# Tune the random forest with a grid search over stratified, shuffled 5-fold CV,
# scored on negative log loss.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=23)
rtc = RandomForestClassifier()

# 'max_depth' ([3, 5, None]) was considered but is not tuned here.
params = {
    'max_features': np.arange(3, 7),
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 3, 5, 7, 10, 15],
}

gcv = GridSearchCV(
    estimator=rtc,
    param_grid=params,
    scoring='neg_log_loss',
    cv=kfold,
)
gcv.fit(X, y)

print("Best Parameters :", gcv.best_params_)
print("Best Score :", gcv.best_score_)

In [None]:
# Plot the feature importances of the best estimator found by the grid search.
bm = gcv.best_estimator_

df_imp = pd.DataFrame({'variable': bm.feature_names_in_,
                       'importance': bm.feature_importances_})
# Drop features with negligible importance and sort ascending by importance.
df_imp = df_imp[df_imp['importance'] > 0.0001]
df_imp = df_imp.sort_values(by='importance')

plt.barh(df_imp['variable'], df_imp['importance'])
# barh draws bar lengths along the x-axis, so importance is the x label and
# the feature names sit on the y-axis.
plt.xlabel('Importance')
plt.ylabel('Variable')
plt.show()

In [None]:
# Persist the best model to disk so it can be reloaded for manual tests.
from joblib import dump
dump(value=bm, filename="Cases/Wisconsin/best_predictor.job")

In [None]:
# Creating the interface using Gradio
import gradio as gr
from joblib import load

import numpy as np

from functools import lru_cache

# Same relative location the fitted model is written to by dump() in this file,
# instead of a machine-specific absolute path.
_MODEL_PATH = "Cases/Wisconsin/best_predictor.job"


@lru_cache(maxsize=1)
def _load_model():
    """Load the persisted classifier once and reuse it for later calls."""
    return load(_MODEL_PATH)


def predict(Clump,UniCell_Size,Unicell_Shape,MargAdh,Septh,Bare_N,BChromatin,NoemN,Mitosis):
    """Classify one sample from its nine numeric feature values.

    The nine arguments are stacked, in the order given, into a single-row
    array and passed to the saved model's ``predict``.

    Returns:
        "Benign" when the predicted label is 0, otherwise "malignant".

    Note:
        The model is read from disk on the first call only; restart the
        process (or call ``_load_model.cache_clear()``) after re-saving it.
    """
    tst = np.array([[Clump,UniCell_Size,Unicell_Shape,MargAdh,Septh,Bare_N,BChromatin,NoemN,Mitosis]])
    if _load_model().predict(tst)[0] == 0:
        return "Benign"
    return "malignant"


# Wrap predict() in a Gradio form with nine numeric inputs and one text
# output, then start the app.
demo = gr.Interface(
    fn=predict,
    inputs=["number"] * 9,
    outputs=['text'],
)
demo.launch()

# ***RandomForestRegressor***

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor,BaggingRegressor
from sklearn.linear_model import ElasticNet

In [None]:
# Load the chemical-process data and show the count of missing values per column.
data = pd.read_csv("Cases/Chemical Process Data/ChemicalProcess.csv")
data.isnull().sum()

In [None]:
# Separate the target ('Yield') from the predictors.
# An earlier up-front mean imputation of the predictors is no longer applied:
# the raw frame is kept as-is, and imputation is performed by the SimpleImputer
# step of the pipelines used in the grid searches in this file.
y = data['Yield']
X = data.drop(columns='Yield')

In [None]:
# Shuffled 5-fold CV for the regression problem. The original seed was the
# undefined name `sane_sir` (NameError at run time); use 23, the seed the
# other splits in this file use.
kfold = KFold(n_splits=5, shuffle=True, random_state=23)

# The imputation strategy actually used is chosen by the grid search
# ('IMP__strategy'); both imputers start with the default settings.
imp_mean = SimpleImputer()
imp_med = SimpleImputer()

# Impute first, then fit the forest, so imputation is re-fitted per CV fold.
rfr = RandomForestRegressor()
pipe = Pipeline([("IMP", imp_mean), ("RFR", rfr)])
params = {
    'RFR__max_features': [3, 4, 5, 6],
    'IMP__strategy': ['mean', 'median']
}
gcv = GridSearchCV(pipe, param_grid=params, cv=kfold, scoring='r2')
gcv.fit(X, y)

print("best parameters for RandomForest Regressor ", gcv.best_params_ )
print("best score for RandomForest Regressor ", gcv.best_score_ )

In [None]:
# Bagging ensemble of ElasticNet regressors behind the imputation step, tuned
# by grid search on R^2 using the existing `kfold` splitter.
est = ElasticNet()
bgr = BaggingRegressor()
pipe = Pipeline(steps=[("IMP", imp_mean), ("BGR", bgr)])

# The base estimator is passed through the grid as a single-option parameter.
params = {
    'BGR__estimator': [est],
    'BGR__n_estimators': [10, 15, 20, 25, 30],
    'BGR__max_features': [3, 4, 5, 6],
    'IMP__strategy': ['mean', 'median'],
}

gcv = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    scoring='r2',
    cv=kfold,
)
gcv.fit(X, y)

print("best parameters for elastic net ", gcv.best_params_ )
print("best score for elastic ent ", gcv.best_score_ )
