In [15]:
# Import dependencies
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

from sklearn.compose import ColumnTransformer

In [16]:
# Read the data into a Pandas dataframe
mental_df = pd.read_csv('updated_dataset.csv')

In [17]:
# Define the target set.
y = mental_df["If yes, what condition(s) have you been diagnosed with?"]
# Define the features set.
X = mental_df[["Are you self-employed?",
"Do you work remotely?",
"Have you had a mental health disorder in the past?",
"Do you believe your productivity is ever affected by a mental health issue?",
"Do you have a family history of mental illness?",
"do you feel comfortable in your working environment?",
"do  you feel comfortable working  with your direct supervisor(s)?",
"Do you feel that your organisation takes mental health as seriously as physical health?",
"Have you observed or experienced an unsupportive or badly handled response to an issue in your current workplace?",
"are you stressed about your career?",
"Have you ever sought treatment for a mental health issue from a mental health professional?",
"How willing would you be to share with friends and family about your work stress?",
"Do you currently have a mental health disorder?",
"Have you observed or experienced an unsupportive or badly handled response to an issue in your current workplace?",
"Did you feel that your previous employers took mental health as seriously as physical health?",
"Have your previous employers provided mental health benefits?"]]

In [18]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [19]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)

In [20]:
# Fitting the model
rf_model = rf_model.fit(X_train, y_train)

In [21]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test)
predictions

array(['Anxiety Disorder (Generalized, Social, Phobia, etc)',
       'Anxiety Disorder (Generalized, Social, Phobia, etc)|Mood Disorder (Depression, Bipolar Disorder, etc)',
       'Anxiety Disorder (Generalized, Social, Phobia, etc)|Mood Disorder (Depression, Bipolar Disorder, etc)',
       'Anxiety Disorder (Generalized, Social, Phobia, etc)|Mood Disorder (Depression, Bipolar Disorder, etc)',
       'Anxiety Disorder (Generalized, Social, Phobia, etc)|Mood Disorder (Depression, Bipolar Disorder, etc)',
       'Anxiety Disorder (Generalized, Social, Phobia, etc)|Mood Disorder (Depression, Bipolar Disorder, etc)',
       'Mood Disorder (Depression, Bipolar Disorder, etc)|Anxiety Disorder (Generalized, Social, Phobia, etc)',
       'Anxiety Disorder (Generalized, Social, Phobia, etc)|Mood Disorder (Depression, Bipolar Disorder, etc)',
       'Anxiety Disorder (Generalized, Social, Phobia, etc)|Mood Disorder (Depression, Bipolar Disorder, etc)',
       'Anxiety Disorder (Generalized, Soc

In [22]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

In [23]:
# Displaying results
#print("Confusion Matrix")
#display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Accuracy Score : 0.6768802228412256
Classification Report
                                                                                                                                                                                                                                                                                                                                             precision    recall  f1-score   support

                                                                                                                                                                                                                                                                                        Anxiety Disorder (Generalized, Social, Phobia, etc)       0.27      0.21      0.24        14
                                                                                                                                                                                                   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [24]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

array([0.02203178, 0.04382448, 0.06459949, 0.00936977, 0.06242606,
       0.06009015, 0.06223694, 0.07331248, 0.03768279, 0.07546252,
       0.03940467, 0.04631497, 0.23334503, 0.03863499, 0.07679593,
       0.05446795])

In [25]:
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.2333450318947317, 'Do you currently have a mental health disorder?'),
 (0.07679593468132673,
  'Did you feel that your previous employers took mental health as seriously as physical health?'),
 (0.07546252371237082, 'are you stressed about your career?'),
 (0.07331248155385242,
  'Do you feel that your organisation takes mental health as seriously as physical health?'),
 (0.06459948833258827, 'Have you had a mental health disorder in the past?'),
 (0.06242605529222838, 'Do you have a family history of mental illness?'),
 (0.06223694318901732,
  'do  you feel comfortable working  with your direct supervisor(s)?'),
 (0.060090151213880574,
  'do you feel comfortable in your working environment?'),
 (0.05446794521476302,
  'Have your previous employers provided mental health benefits?'),
 (0.04631497175046902,
  'How willing would you be to share with friends and family about your work stress?'),
 (0.043824476211274055, 'Do you work remotely?'),
 (0.03940466592320366,
  'Have you ever 

In [26]:
# save our model to use later
import pickle

# Save to file in the current working directory
pkl_filename = "random_forest_pickle_model.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(rf_model, file)

# Load from file
with open(pkl_filename, 'rb') as file:
    pickle_model = pickle.load(file)
    
# Calculate the accuracy score and predict target values
score = pickle_model.score(X_test, y_test)
print("Test score: {0:.2f} %".format(100 * score))
Ypredict = pickle_model.predict(X_test)

Test score: 67.69 %


In [27]:
import joblib

In [28]:
# Save the trained model to disk
joblib.dump(rf_model, 'rf_model.joblib')

# Check the size of the saved model file
import os
model_size = os.path.getsize('rf_model.joblib')
print(f"Model size: {model_size} bytes")

Model size: 56913697 bytes
