In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
import pickle
from faker import Faker


In [3]:
df = pd.read_csv('/Data/synthetic_data_new.csv')
df = df.fillna('None') ## to change NaN values to "None"

In [4]:
df.head()

Unnamed: 0,Entity ID,Annual Clients,Infraction Type,Infraction Timeline,Public Complaints,Sentiment Analysis,Inspection Results,Total Risk Score,Risk Category
0,E0000,34,,,Minor,,Pass,7,Moderate
1,E0001,91,,,,,Fail,7,Moderate
2,E0002,191,,,,,Pass,6,Low
3,E0003,143,Minor,,,,Pass,7,Moderate
4,E0004,38,,,,,Pass,6,Low


In [5]:
df=df.drop(columns=['Entity ID'])

#### Econding Categorical data to integer

In [6]:
def encoding(item):
    if item in ['Low']:
        return 0
    elif item in ['Pass', 'Moderate', 'None']:
        return 1
    elif item in ['Minor', 'Fail', 'Within past year', 'Flagged', 'High']:
        return 2
    elif item in ['Within past 1-3 years', 'Major']:
        return 3
    elif isinstance(item, (int, float)) and item < 200:
        return 1
    elif isinstance(item, (int, float)) and 200 <= item <= 500:
        return 2
    else:
        return 3

In [7]:
exclude_columns = ['Total Risk Score', 'Risk Category']
#Encoding all the columns except for risk Score & Risk result
encode_columns = [col for col in df.columns if col not in exclude_columns]

for col in encode_columns:
    df[col] = df[col].apply(encoding)

df.head()

Unnamed: 0,Annual Clients,Infraction Type,Infraction Timeline,Public Complaints,Sentiment Analysis,Inspection Results,Total Risk Score,Risk Category
0,1,1,1,2,1,1,7,Moderate
1,1,1,1,1,1,2,7,Moderate
2,1,1,1,1,1,1,6,Low
3,1,2,1,1,1,1,7,Moderate
4,1,1,1,1,1,1,6,Low


#### Train Test Split

In [8]:
# Split data into training and test sets

y = df['Risk Category']
x = df.drop(columns=['Risk Category','Total Risk Score'])

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=101)

#### Balance Classes


In [9]:
smote = SMOTE(random_state=42)
x_train_resampled,y_train_resampled=smote.fit_resample(x_train,y_train)

#### Model Training -- Random Forest

In [10]:
# Train Random Forest model with best hyperparameters

rfc = RandomForestClassifier()
parameters = {'min_samples_leaf':[1,2,4],'min_samples_split':[2,5,100],'n_estimators':[10,20,30,100]}
rfc_cv=GridSearchCV(rfc,parameters)
rfc_cv.fit(x_train_resampled,y_train_resampled)
print('tuned hyperparameters: (best parameters)',rfc_cv.best_params_)
print('Best Parameters Accuracy score:', rfc_cv.best_score_)

tuned hyperparameters: (best parameters) {'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 30}
Best Parameters Accuracy score: 0.9880386983289359


In [11]:
# Make predictions
y_predict=rfc_cv.predict(x_test)

In [12]:
# Evaluate the model
print(classification_report(y_test, y_predict))

              precision    recall  f1-score   support

        High       0.89      0.96      0.92        50
         Low       0.98      1.00      0.99       489
    Moderate       1.00      0.97      0.98       461

    accuracy                           0.98      1000
   macro avg       0.96      0.98      0.97      1000
weighted avg       0.98      0.98      0.98      1000



#### Export Model

In [13]:
model_path = '/ML Model/RiskPredictor_v3.pkl'
with open(model_path, 'wb') as file:
    pickle.dump(rfc_cv, file)

#### Loading Model

In [14]:
with open(model_path, 'rb') as file:
    model = pickle.load(file)

#### Preparing the data to predict

In [15]:
df_dashboard = pd.read_csv('/Data/synthetic_data_dashboard.csv')
df_dashboard_copy=df_dashboard.copy()
df_dashboard_copy = df_dashboard_copy.fillna('None') ## to change NaN values to "None"

y_dash=df_dashboard_copy['Risk Category']
x_dash=df_dashboard_copy.drop(columns=['Entity ID','Risk Category','Total Risk Score'])


In [19]:
df_dashboard.head()

Unnamed: 0,Entity ID,Annual Clients,Infraction Type,Infraction Timeline,Public Complaints,Sentiment Analysis,Inspection Results,Total Risk Score,Risk Category
0,E0000,179,,Within past year,,,Pass,8,Moderate
1,E0001,138,,,,,Pass,6,Low
2,E0002,157,,,,,Pass,6,Low
3,E0003,50,,,,Flagged,Pass,7,Moderate
4,E0004,150,,,,,Pass,6,Low


In [17]:
encode_columns = x_dash.columns  # Assuming all columns in x_dash need encoding
for col in encode_columns:
    x_dash[col] = x_dash[col].apply(encoding)

In [18]:
# Make predictions
y_predict_dash=model.predict(x_dash)


In [22]:
df_dashboard_predicted=df_dashboard.copy()
df_dashboard_predicted = df_dashboard_predicted.fillna('None')

In [23]:
df_dashboard_predicted.head()

Unnamed: 0,Entity ID,Annual Clients,Infraction Type,Infraction Timeline,Public Complaints,Sentiment Analysis,Inspection Results,Total Risk Score,Risk Category
0,E0000,179,,Within past year,,,Pass,8,Moderate
1,E0001,138,,,,,Pass,6,Low
2,E0002,157,,,,,Pass,6,Low
3,E0003,50,,,,Flagged,Pass,7,Moderate
4,E0004,150,,,,,Pass,6,Low


In [24]:

df_dashboard_predicted['Predicted Risk Category'] = y_predict_dash

In [26]:
df_dashboard_predicted.head()

Unnamed: 0,Entity ID,Annual Clients,Infraction Type,Infraction Timeline,Public Complaints,Sentiment Analysis,Inspection Results,Total Risk Score,Risk Category,Predicted Risk Category
0,E0000,179,,Within past year,,,Pass,8,Moderate,Moderate
1,E0001,138,,,,,Pass,6,Low,Low
2,E0002,157,,,,,Pass,6,Low,Low
3,E0003,50,,,,Flagged,Pass,7,Moderate,Moderate
4,E0004,150,,,,,Pass,6,Low,Low


In [27]:
# Count the occurrences of each risk category
risk_category_counts = df_dashboard_predicted['Predicted Risk Category'].value_counts()
print(risk_category_counts)

Predicted Risk Category
Low         2350
Moderate    2346
High         304
Name: count, dtype: int64


#### Add Phone number and Address 

In [28]:
# Function to generate fake phone number
def generate_phone_number():
    fake = Faker()
    return fake.phone_number()

# Function to generate fake Canadian address
def generate_canadian_address():
    fake = Faker('en_CA')
    return fake.address()

In [29]:
# Add new columns with randomly generated values
df_dashboard_predicted['Phone Number'] = [generate_phone_number() for _ in range(len(df_dashboard_predicted))]
df_dashboard_predicted['Address'] = [generate_canadian_address() for _ in range(len(df_dashboard_predicted))]

In [30]:
df_dashboard_predicted.head()

Unnamed: 0,Entity ID,Annual Clients,Infraction Type,Infraction Timeline,Public Complaints,Sentiment Analysis,Inspection Results,Total Risk Score,Risk Category,Predicted Risk Category,Phone Number,Address
0,E0000,179,,Within past year,,,Pass,8,Moderate,Moderate,001-233-810-9504,"8501 Daniels Brook\nDavidview, NU Y5X3J1"
1,E0001,138,,,,,Pass,6,Low,Low,(464)960-2406x87948,"60230 James Cliffs Apt. 348\nAntoniofort, NT J..."
2,E0002,157,,,,,Pass,6,Low,Low,732-549-1912,"82239 White Landing\nNew Tina, MB E9L 2K6"
3,E0003,50,,,,Flagged,Pass,7,Moderate,Moderate,+1-289-886-9707x178,"0773 Kathy Mission Suite 119\nTimothyfurt, QC ..."
4,E0004,150,,,,,Pass,6,Low,Low,(755)224-5481x85201,"5808 Patterson Mews\nPort Dawnshire, ON H4P3A3"


In [33]:
#Save Output Dataset
# Save to CSV
df_dashboard_predicted.to_csv('/Users/meghakatiyar/M2M_WIL5/WIL5/Data/df_dashboard_predicted.csv', index=False)