In [27]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix,roc_auc_score
from sklearn.model_selection import GridSearchCV
import seaborn as sns
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
import pickle
import joblib
from sklearn import __version__ as sklearn_version


In [76]:
df = pd.read_csv('/Users/meghakatiyar/M2M_WIL5/WIL5/Data/synthetic_data_new.csv')
df = df.fillna('None') ## to change NaN values to "None"

In [77]:
df.head()

Unnamed: 0,Entity ID,Annual Clients,Infraction Type,Infraction Timeline,Public Complaints,Sentiment Analysis,Inspection Results,Total Risk Score,Risk Category
0,E0000,34,,,Minor,,Pass,7,Moderate
1,E0001,91,,,,,Fail,7,Moderate
2,E0002,191,,,,,Pass,6,Low
3,E0003,143,Minor,,,,Pass,7,Moderate
4,E0004,38,,,,,Pass,6,Low


In [78]:
df=df.drop(columns=['Entity ID'])

#### Econding Categorical data to integer

In [79]:
def encoding(item):
    if item in ['Low']:
        return 0
    elif item in ['Pass', 'Moderate', 'None']:
        return 1
    elif item in ['Minor', 'Fail', 'Within past year', 'Flagged', 'High']:
        return 2
    elif item in ['Within past 1-3 years', 'Major']:
        return 3
    elif isinstance(item, (int, float)) and item < 200:
        return 1
    elif isinstance(item, (int, float)) and 200 <= item <= 500:
        return 2
    else:
        return 3

In [80]:
exclude_columns = ['Total Risk Score', 'Risk Category']
#Encoding all the columns except for risk Score & Risk result
encode_columns = [col for col in df.columns if col not in exclude_columns]

for col in encode_columns:
    df[col] = df[col].apply(encoding)

df.head()

Unnamed: 0,Annual Clients,Infraction Type,Infraction Timeline,Public Complaints,Sentiment Analysis,Inspection Results,Total Risk Score,Risk Category
0,1,1,1,2,1,1,7,Moderate
1,1,1,1,1,1,2,7,Moderate
2,1,1,1,1,1,1,6,Low
3,1,2,1,1,1,1,7,Moderate
4,1,1,1,1,1,1,6,Low


#### Train Test Split

In [81]:
# Split data into training and test sets

y = df['Risk Category']
x = df.drop(columns=['Risk Category'])

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=101)

#### Balance Classes


In [82]:
x_train.head()

Unnamed: 0,Annual Clients,Infraction Type,Infraction Timeline,Public Complaints,Sentiment Analysis,Inspection Results,Total Risk Score
3413,1,1,1,1,1,1,6
1610,1,1,1,1,1,1,6
3459,1,3,1,1,1,2,9
4293,1,1,1,1,2,1,7
1039,1,1,1,1,1,1,6


In [83]:
smote = SMOTE(random_state=42)
x_train_resampled,y_train_resampled=smote.fit_resample(x_train,y_train)

#### Model Training -- Random Forest

In [115]:
# Train Random Forest model with best hyperparameters

rfc = RandomForestClassifier()
parameters = {'min_samples_leaf':[1,2,4],'min_samples_split':[2,5,100],'n_estimators':[10,20,30,100]}
rfc_cv=GridSearchCV(rfc,parameters)
rfc_cv.fit(x_train_resampled,y_train_resampled)
print('tuned hyperparameters: (best parameters)',rfc_cv.best_params_)
print('Best Parameters Accuracy score:', rfc_cv.best_score_)

tuned hyperparameters: (best parameters) {'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 10}
Best Parameters Accuracy score: 1.0


In [116]:
# Make predictions
y_predict=rfc_cv.predict(x_test)

In [117]:
# Evaluate the model
print(classification_report(y_test, y_predict))

              precision    recall  f1-score   support

        High       1.00      1.00      1.00        50
         Low       1.00      1.00      1.00       489
    Moderate       1.00      1.00      1.00       461

    accuracy                           1.00      1000
   macro avg       1.00      1.00      1.00      1000
weighted avg       1.00      1.00      1.00      1000



#### Export Model

In [118]:
model_path = '/Users/meghakatiyar/M2M_WIL5/WIL5/ML Model/RiskPredictor_v3.pkl'
with open(model_path, 'wb') as file:
    pickle.dump(rfc, file)

#### Loading Model

In [119]:
model_path= '/Users/meghakatiyar/M2M_WIL5/WIL5/ML Model/RiskPredictor_v3.pkl'
with open(model_path, 'rb') as file:
    model = pickle.load(file)

#### Preparing the data to predict

In [120]:
df_dashboard = pd.read_csv('/Users/meghakatiyar/M2M_WIL5/WIL5/Data/synthetic_data_dashboard.csv')
df_dashboard = df_dashboard.fillna('None') ## to change NaN values to "None"

y_dash=df_dashboard['Risk Category']
x_dash=df_dashboard.drop(columns=['Entity ID','Risk Category'])


In [121]:
# Make predictions
y_predict=model.predict(x_dash)

NotFittedError: This RandomForestClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.