# Random Forest Model

In [2]:
# Import dependencies
import pickle

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sqlalchemy import create_engine

# Load data from SQLite database

In [3]:
# Read the pre-encoded survey responses from the SQLite database, then
# discard the stored 'index' column so only the survey answers remain.
mental_df = pd.read_sql_table(
    table_name='pre_encoded_survey',
    con='sqlite:///mental_health.db',
)
mental_df = mental_df.drop(columns=['index'])
mental_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,No,Yes,Yes,Somewhat easy,No,No,Yes,No,1,"Yes, they all did",...,Yes,Yes,Yes,Yes,1,Rarely,Sometimes,Male,United States of America,United States of America
1,Yes,Yes,No,Neither easy nor difficult,Yes,Maybe,No,No,1,I don't know,...,Yes,Yes,Yes,Yes,1,Sometimes,Sometimes,Female,United States of America,United States of America
2,Yes,I am not sure,Yes,Somewhat easy,Yes,Yes,No,Yes,1,"No, none did",...,No,No,Yes,No,1,Not applicable to me,Often,Male,United Kingdom,United Kingdom
3,I don't know,No,No,Somewhat easy,No,No,Yes,No,1,Some did,...,No,No,No,No,0,Not applicable to me,Not applicable to me,Male,United States of America,United States of America
4,Yes,Yes,Yes,Very easy,No,No,I don't know,No,1,Some did,...,Yes,Yes,Yes,Yes,1,Sometimes,Often,Female,United States of America,United States of America


In [4]:
# Inspect each column's dtype; the preprocessing step below selects the
# object-typed columns as the categorical variables to encode.
mental_df.dtypes

0     object
1     object
2     object
3     object
4     object
5     object
6     object
7     object
8      int64
9     object
10    object
11    object
12    object
13    object
14    object
15    object
16    object
17    object
18    object
19    object
20    object
21    object
22    object
23    object
24    object
25    object
26     int64
27    object
28    object
29    object
30    object
31    object
dtype: object

# Preprocessing

In [5]:
# Names of every column stored with the object dtype; these are the
# categorical variables that get one-hot encoded.
mental_cat = [column for column, dtype in mental_df.dtypes.items() if dtype == "object"]

# How many distinct values does each categorical column take?
mental_df[mental_cat].nunique()

0     4
1     3
2     3
3     6
4     3
5     3
6     3
7     2
9     4
10    4
11    4
12    4
13    4
14    4
15    4
16    3
17    3
18    5
19    5
20    6
21    4
22    3
23    3
24    3
25    2
27    5
28    5
29    3
30    7
31    8
dtype: int64

In [6]:
# Create a OneHotEncoder instance.
# The keyword that requests dense output was renamed between scikit-learn
# releases (`sparse` was deprecated in 1.2 and removed in 1.4 in favour of
# `sparse_output`), so passing either one breaks on some installed versions.
# Keeping the default sparse output and densifying explicitly works everywhere.
enc = OneHotEncoder()

# Fit and transform the categorical columns, then convert the sparse result
# into a dense array of 0.0 / 1.0 indicators.
encoded_values = enc.fit_transform(mental_df[mental_cat]).toarray()

# Build the encoded DataFrame with the generated "<column>_<category>" names
# and the same row index as mental_df, so the two frames can be aligned
# by index.
encode_df = pd.DataFrame(
    encoded_values,
    index=mental_df.index,
    columns=enc.get_feature_names_out(mental_cat),
)
encode_df.head()



Unnamed: 0,0_I don't know,0_No,0_Not eligible for coverage / N/A,0_Yes,1_I am not sure,1_No,1_Yes,2_I don't know,2_No,2_Yes,...,30_United Kingdom,30_United States of America,31_Australia,31_Canada,31_Germany,31_Netherlands,31_Other,31_Sweden,31_United Kingdom,31_United States of America
0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [7]:
# Gather the one-hot encoded column names into a plain list and print them.
column_headers = [header for header in encode_df.columns.values]
print("The Column Header :", column_headers)

The Column Header : ["0_I don't know", '0_No', '0_Not eligible for coverage / N/A', '0_Yes', '1_I am not sure', '1_No', '1_Yes', "2_I don't know", '2_No', '2_Yes', "3_I don't know", '3_Neither easy nor difficult', '3_Somewhat difficult', '3_Somewhat easy', '3_Very difficult', '3_Very easy', '4_Maybe', '4_No', '4_Yes', '5_Maybe', '5_No', '5_Yes', "6_I don't know", '6_No', '6_Yes', '7_No', '7_Yes', "9_I don't know", '9_No, none did', '9_Some did', '9_Yes, they all did', '10_I was aware of some', '10_N/A (not currently aware)', '10_No, I only became aware later', '10_Yes, I was aware of all of them', "11_I don't know", '11_None did', '11_Some did', '11_Yes, they all did', "12_I don't know", '12_No', '12_Sometimes', '12_Yes, always', "13_I don't know", '13_None of them', '13_Some of them', '13_Yes, all of them', "14_I don't know", '14_No, at none of my previous employers', '14_Some of my previous employers', '14_Yes, at all of my previous employers', "15_I don't know", '15_None did', '15_S

In [8]:
# Swap the categorical columns for their one-hot encoded counterparts:
# set the originals aside first, then attach the encoded columns by row index.
non_categorical = mental_df.drop(columns=mental_cat)
mental_df = non_categorical.merge(encode_df, left_index=True, right_index=True)
mental_df.head()

Unnamed: 0,8,26,0_I don't know,0_No,0_Not eligible for coverage / N/A,0_Yes,1_I am not sure,1_No,1_Yes,2_I don't know,...,30_United Kingdom,30_United States of America,31_Australia,31_Canada,31_Germany,31_Netherlands,31_Other,31_Sweden,31_United Kingdom,31_United States of America
0,1,1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1,1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1,1,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1,0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1,1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


# Send encoded dataframe to database to be used in model

In [9]:
# Open a SQLAlchemy engine on the local SQLite file, with statement
# echoing switched off.
engine = create_engine(
    "sqlite:///mental_health.db",
    echo=False,
)

In [10]:
# Persist the fully encoded survey DataFrame as the 'encoded_survey' table,
# replacing any existing table of that name. The DataFrame index is written
# along with the data (to_sql's default).
mental_df.to_sql(
    name='encoded_survey',
    con=engine,
    if_exists='replace',
)

861

In [11]:
# Reload the encoded survey from the database and discard the 'index'
# column that was stored alongside the data.
mental_df = pd.read_sql_table(
    table_name='encoded_survey',
    con='sqlite:///mental_health.db',
)
mental_df = mental_df.drop(columns=['index'])
mental_df

Unnamed: 0,8,26,0_I don't know,0_No,0_Not eligible for coverage / N/A,0_Yes,1_I am not sure,1_No,1_Yes,2_I don't know,...,30_United Kingdom,30_United States of America,31_Australia,31_Canada,31_Germany,31_Netherlands,31_Other,31_Sweden,31_United Kingdom,31_United States of America
0,1,1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1,1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1,1,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1,0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1,1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
856,1,0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
857,1,1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
858,1,1,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
859,1,1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [12]:
# Load the survey question text from the database (stored 'index' column
# discarded); each row number corresponds to a numbered survey column.
questions_df = pd.read_sql_table(
    table_name='pre_encoded_questions',
    con='sqlite:///mental_health.db',
)
questions_df = questions_df.drop(columns=['index'])
questions_df

Unnamed: 0,Question
0,Does your employer provide mental health benef...
1,Do you know the options for mental health care...
2,Does your employer offer resources to learn mo...
3,If a mental health issue prompted you to reque...
4,Do you think that discussing a mental health d...
5,Do you think that discussing a physical health...
6,Do you feel that your employer takes mental he...
7,Have you heard of or observed negative consequ...
8,Do you have previous employers?
9,Have your previous employers provided mental h...


In [13]:
# Question 25 was one-hot encoded into two complementary indicator columns.
target_column = "25_Yes"
complement_column = "25_No"

# Target: the "Yes" indicator of question 25.
y = mental_df[target_column]

# Features: every column except the two indicators of the target question
# (leaving "25_No" in would hand the model the answer).
X = mental_df.drop(columns=[target_column, complement_column])

In [28]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=78,
)

In [29]:
# Random forest with 120 trees, seeded so training is repeatable.
rf_model = RandomForestClassifier(
    n_estimators=120,
    random_state=78,
)

In [30]:
# Train the forest on the training split; fit() hands back the fitted
# estimator, which is stored under the same name.
rf_model = rf_model.fit(X=X_train, y=y_train)

In [31]:
# Predict the target for every row of the held-out test split.
predictions = rf_model.predict(X=X_test)
predictions

array([0., 1., 1., 1., 1., 0., 1., 0., 1., 1., 1., 0., 0., 1., 1., 0., 1.,
       1., 1., 1., 1., 1., 0., 0., 1., 0., 1., 1., 0., 1., 1., 0., 1., 1.,
       1., 0., 0., 1., 1., 0., 1., 1., 0., 0., 0., 1., 0., 1., 0., 1., 1.,
       1., 0., 1., 0., 1., 1., 1., 0., 1., 0., 1., 1., 0., 1., 0., 1., 1.,
       0., 0., 0., 0., 0., 1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1.,
       0., 1., 0., 1., 1., 1., 1., 1., 0., 0., 1., 1., 1., 0., 1., 1., 1.,
       0., 0., 0., 1., 1., 0., 0., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1.,
       0., 1., 0., 1., 0., 0., 1., 1., 0., 0., 1., 1., 0., 1., 1., 1., 0.,
       1., 1., 0., 1., 0., 0., 1., 0., 1., 1., 1., 0., 1., 1., 0., 1., 0.,
       0., 0., 1., 1., 0., 1., 1., 0., 0., 0., 1., 0., 1., 1., 1., 0., 1.,
       0., 1., 1.])

In [32]:
# Confusion matrix of actual vs. predicted classes on the test split.
cm = confusion_matrix(y_test, predictions)

# Wrap it in a labelled DataFrame: rows are actual classes, columns are
# predicted classes.
actual_labels = ["Actual 0", "Actual 1"]
predicted_labels = ["Predicted 0", "Predicted 1"]
cm_df = pd.DataFrame(data=cm, index=actual_labels, columns=predicted_labels)

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,63,19
Actual 1,4,87


In [33]:
# Fraction of test rows whose predicted label matches the actual label.
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)

# Display Evaluation Results (Confusion Matrix, Accuracy, Classification Report)

In [35]:
# Show the evaluation summary: confusion matrix, overall accuracy, and the
# per-class precision / recall / F1 report.
report_text = classification_report(y_test, predictions)

print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(report_text)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,63,19
Actual 1,4,87


Accuracy Score : 0.8670520231213873
Classification Report
              precision    recall  f1-score   support

         0.0       0.94      0.77      0.85        82
         1.0       0.82      0.96      0.88        91

    accuracy                           0.87       173
   macro avg       0.88      0.86      0.86       173
weighted avg       0.88      0.87      0.87       173



In [36]:
# Read the per-feature importance scores from the fitted Random Forest model.
# The scores are positional; the next cell pairs them with X.columns to
# attach feature names.
importances = rf_model.feature_importances_
importances

array([0.        , 0.10475854, 0.00490985, 0.0018625 , 0.00163605,
       0.00460955, 0.00379326, 0.00755137, 0.00880078, 0.00446511,
       0.00496448, 0.00373078, 0.00362311, 0.00203858, 0.00395608,
       0.00461622, 0.00188041, 0.00376936, 0.00427984, 0.00411412,
       0.00222372, 0.00387363, 0.00403156, 0.00130731, 0.00411781,
       0.00361139, 0.00428378, 0.00262896, 0.0027913 , 0.00628503,
       0.00399671, 0.00762478, 0.00290628, 0.00721046, 0.00619292,
       0.00303552, 0.00231081, 0.00116538, 0.00495796, 0.00433474,
       0.00099392, 0.0035013 , 0.00176924, 0.00262097, 0.00265305,
       0.0039971 , 0.00197536, 0.00445311, 0.00429133, 0.00130768,
       0.00310125, 0.00333436, 0.00144608, 0.00496966, 0.00404005,
       0.00497746, 0.00078484, 0.00421479, 0.00388889, 0.00171452,
       0.00412299, 0.00424284, 0.00233213, 0.00577499, 0.00207269,
       0.00101903, 0.003976  , 0.0022181 , 0.00321507, 0.00351178,
       0.00127534, 0.00374526, 0.0016143 , 0.00235803, 0.00257

In [37]:
# Pair every importance score with its feature name, then rank the pairs
# from most to least important.
ranked_importances = list(zip(rf_model.feature_importances_, X.columns))
ranked_importances.sort(reverse=True)
ranked_importances

[(0.12002808640196849, '23_Yes'),
 (0.10475854456769261, '26'),
 (0.09239405982837383, '24_Yes'),
 (0.0766701499123019, '27_Not applicable to me'),
 (0.05421403921600128, '23_No'),
 (0.04508761018347588, '28_Not applicable to me'),
 (0.03727977192921234, '24_No'),
 (0.028430779221864506, '28_Often'),
 (0.021627395168353183, '22_Yes'),
 (0.019213490551879313, '22_No'),
 (0.015950527883894648, '24_Maybe'),
 (0.014627029657275708, '23_Maybe'),
 (0.013887601797485367, '27_Rarely'),
 (0.009256993210968128, '27_Sometimes'),
 (0.008800784016239698, '1_Yes'),
 (0.00869055615342903, '28_Sometimes'),
 (0.007624781740838109, '9_Some did'),
 (0.007551371048323903, '1_No'),
 (0.007210459802404875, '10_I was aware of some'),
 (0.0062850313877000795, "9_I don't know"),
 (0.006192918902804638, '10_N/A (not currently aware)'),
 (0.005774994326416569, '18_Maybe'),
 (0.0054160356269770775, '21_Yes, I experienced'),
 (0.005355871437702762, '28_Rarely'),
 (0.005081048646796347, "22_I don't know"),
 (0.0049

In [23]:
# Save our fitted model so it can be reused later without retraining.

# Save to file in the current working directory
pkl_filename = "pickle_model.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(rf_model, file)

# Load from file.
# NOTE: pickle.load can execute arbitrary code while deserialising; only load
# pickle files produced by this notebook or another trusted source.
with open(pkl_filename, 'rb') as file:
    pickle_model = pickle.load(file)

# Score the reloaded model on the held-out test split and predict target values
score = pickle_model.score(X_test, y_test)
Ypredict = pickle_model.predict(X_test)

# The round-tripped model must reproduce the in-memory model's predictions
# exactly; a mismatch means this cell ran against a different model or split
# than the one evaluated above (e.g. cells executed out of order).
assert (Ypredict == rf_model.predict(X_test)).all(), \
    "Reloaded model disagrees with rf_model; re-run the notebook from the top."

print("Test score: {0:.2f} %".format(100 * score))

Test score: 86.11 %


Our model’s accuracy score is roughly 86–87% (86.11% in the saved-model check above and 86.71% in the most recent evaluation run), meaning that it correctly predicts whether an individual has a mental health disorder about 86–87% of the time, based on how they answer the survey questions (assuming they answer honestly). This model is not making high-stakes predictions: it is intended either for an individual working in tech who wants to know their likelihood of having (or developing) a mental health disorder, or for a tech company that wants to know whether offering certain mental health services would benefit its employees. For those purposes, an accuracy of roughly 86% is sufficient.