# Predicting Mental Health Performance
We build regression models (linear regression and a random forest) to predict the mental health `performance` score from the other features in the dataset.

In [68]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
import sklearn
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn import tree
from sklearn.metrics import roc_curve, roc_auc_score

In [4]:
# Load the cleaned dataset and preview its first rows.
DATA_PATH = 'data/CitieSHealth_BCN_DATA_PanelStudy_20220414_Clean.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [5]:
# Split column names by dtype: float64/int64 columns are treated as numerical,
# everything else as categorical.
#
# NOTE: `Index | Index` and `Index & Index` are no longer set operations in
# pandas >= 2.0 (they are element-wise logical ops), so the set semantics are
# spelled out explicitly here.
col_dtypes = df.dtypes
is_float = col_dtypes == 'float64'
is_int = col_dtypes == 'int64'

# `union` keeps the (sorted) ordering the old `|` operator produced.
numerical = list(col_dtypes[is_float].index.union(col_dtypes[is_int].index))
# Columns that are neither float64 nor int64, in their original column order.
categorical = list(col_dtypes[~is_float & ~is_int].index)

In [15]:
def ohe(data, column):
    """One-hot encode the selected columns of a DataFrame.

    Args:
        data: DataFrame holding the columns to encode.
        column: List of column labels to encode, or a single column name
            given as a string.

    Returns:
        DataFrame of integer 0/1 indicator columns, named by
        ``OneHotEncoder.get_feature_names_out()`` and indexed like ``data``.
    """
    # A bare string would select a 1-D Series, which OneHotEncoder rejects;
    # wrap it in a list so the selection is always a 2-D DataFrame.
    columns = [column] if isinstance(column, str) else column

    enc = OneHotEncoder()
    # The encoder returns a sparse matrix by default; densify it and cast the
    # float indicators to int.
    indicators = enc.fit_transform(data[columns]).toarray().astype(int)
    return pd.DataFrame(
        indicators,
        columns=enc.get_feature_names_out(),
        index=data.index,
    )

In [20]:
# Assemble the modelling table: numerical columns as-is, followed by the
# one-hot encoded categorical columns.
numeric_part = df[numerical]
encoded_part = ohe(df[categorical], categorical)
ohe_df = pd.concat([numeric_part, encoded_part], axis=1)
ohe_df.head()

In [22]:
# Separate the regression target from the predictor columns.
target = 'performance'
y = ohe_df[target]
X = ohe_df.drop(columns=[target])

In [23]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

## Linear Regression

In [25]:
# Fit a linear regression on the training split.
model = LinearRegression()
model.fit(X_train, y_train)

In [32]:
# Inspect the fitted linear model.
print('model intercept :', model.intercept_)
print('model coefficients : ', model.coef_)
# Score (R^2) on the held-out test split only: scoring on the full (X, y)
# re-uses the rows the model was fitted on and overstates how well it
# generalises.
print('Model score : ', model.score(X_test, y_test))

In [28]:
# Predict the target for the held-out test rows.
y_pred = model.predict(X=X_test)

In [29]:
# Mean squared error of the predictions on the test split.
mean_squared_error(y_true=y_test, y_pred=y_pred)

## Random Forest

In [71]:
# Fit a random forest (100 trees, maximum depth 4, fixed seed) on the training
# split and predict the held-out test rows.
rf_params = {'n_estimators': 100, 'max_depth': 4, 'random_state': 0}
model = RandomForestRegressor(**rf_params).fit(X_train, y_train)
y_pred = model.predict(X_test)

In [72]:
# Mean squared error of the predictions on the test split.
mean_squared_error(y_true=y_test, y_pred=y_pred)

In [75]:
# Draw the first tree of the fitted forest on a small, high-resolution figure.
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(4, 4), dpi=800)
first_tree = model.estimators_[0]
tree.plot_tree(first_tree, ax=axes);