# Data Modeling
---

In [7]:
# --------------------
# Import Libraries
# --------------------
import math
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder

# Apply seaborn's default visual theme once, up front, so it is in effect
# for any plots drawn later in the notebook.
sns.set_theme()

### Reading The Cleaned Dataset

In [8]:
# Load the cleaned dataset from the processed-data folder.
# The path is relative, so it resolves against the kernel's working directory.
data = pd.read_csv(
    filepath_or_buffer='../data/processed/mentalhealthData_Cleaned.csv',
)

In [9]:
# Preview the first five rows to sanity-check the columns that were loaded.
data.head(n=5)

Unnamed: 0,gender,country,occupation,self_employed,family_mh_history,sought_treatment,days_spent_indoors,noticed_growing_stress,noticed_habit_changes,personal_mh_history,mood_swings,coping_difficulty,work_engagement,social_difficulty,disclose_mh_to_employer,care_options_awareness,self_employed_missing
0,Female,United States,Corporate,Unknown,No,Yes,1-14 Days,Yes,No,Yes,Medium,No,No,Yes,No,Not Sure,1
1,Female,United States,Corporate,Unknown,Yes,Yes,1-14 Days,Yes,No,Yes,Medium,No,No,Yes,No,No,1
2,Female,United States,Corporate,Unknown,Yes,Yes,1-14 Days,Yes,No,Yes,Medium,No,No,Yes,No,Yes,1
3,Female,United States,Corporate,No,Yes,Yes,1-14 Days,Yes,No,Yes,Medium,No,No,Yes,Maybe,Yes,0
4,Female,United States,Corporate,No,Yes,Yes,1-14 Days,Yes,No,Yes,Medium,No,No,Yes,No,Yes,0


### One-Hot Encoding

In [10]:
# Columns that will be one-hot encoded, in the order they are passed to the
# encoder (this order also determines the order of the generated dummy columns).
# NOTE(review): 'sought_treatment' is encoded here as a feature — if it is the
# modelling target, it should be excluded from this list. Confirm before fitting.
categorical_cols = [
    'gender',
    'country',
    'occupation',
    'self_employed',
    'family_mh_history',
    'sought_treatment',
    'days_spent_indoors',
    'noticed_growing_stress',
    'noticed_habit_changes',
    'personal_mh_history',
    'mood_swings',
    'coping_difficulty',
    'work_engagement',
    'social_difficulty',
    'disclose_mh_to_employer',
    'care_options_awareness',
]

In [11]:
# Build the encoder:
#   - handle_unknown='ignore': a category not seen during fit is encoded as
#     all zeros at transform time instead of raising.
#   - sparse_output=False: return a dense array rather than a sparse matrix.
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Learn the category levels from the categorical columns, then encode the
# same rows. Done as two explicit steps; the result matches fit_transform.
ohe.fit(data[categorical_cols])
X_ohe = ohe.transform(data[categorical_cols])

In [12]:
# Wrap the encoded array in a DataFrame: label each column with the
# encoder-generated "<column>_<category>" name and reuse the original row index
# so the encoded rows stay aligned with `data`.
# Note: this rebinds X_ohe from the raw encoder output to a DataFrame.
X_ohe = pd.DataFrame(
    data=X_ohe,
    index=data.index,
    columns=ohe.get_feature_names_out(categorical_cols),
)

In [13]:
# Preview the first five encoded rows.
X_ohe.head(n=5)

Unnamed: 0,gender_Female,gender_Male,country_Australia,country_Belgium,country_Bosnia And Herzegovina,country_Brazil,country_Canada,country_Colombia,country_Costa Rica,country_Croatia,...,work_engagement_Yes,social_difficulty_Maybe,social_difficulty_No,social_difficulty_Yes,disclose_mh_to_employer_Maybe,disclose_mh_to_employer_No,disclose_mh_to_employer_Yes,care_options_awareness_No,care_options_awareness_Not Sure,care_options_awareness_Yes
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0


In [17]:
# Feature matrix variant 1: one-hot features plus the 'self_employed_missing'
# indicator appended as the last column (column-wise concat, aligned on index).
# NOTE(review): in the rows shown by data.head(), the flag is 1 exactly where
# self_employed == 'Unknown', so it may duplicate the 'self_employed_Unknown'
# dummy — verify before comparing the two variants.
X_with_flag = pd.concat(
    objs=[X_ohe, data[['self_employed_missing']]],
    axis='columns',
)


In [18]:
# Feature matrix variant 2: the one-hot features only. A deep copy is taken so
# later changes to this frame cannot affect X_ohe.
X_without_flag = X_ohe.copy(deep=True)

In [19]:
# Report both shapes, one per line; the two variants should differ by exactly
# one column (the missing-value flag).
print(X_with_flag.shape, X_without_flag.shape, sep='\n')

# Preview the flagged variant; the indicator is the right-most column.
X_with_flag.head(n=5)

(290051, 81)
(290051, 80)


Unnamed: 0,gender_Female,gender_Male,country_Australia,country_Belgium,country_Bosnia And Herzegovina,country_Brazil,country_Canada,country_Colombia,country_Costa Rica,country_Croatia,...,social_difficulty_Maybe,social_difficulty_No,social_difficulty_Yes,disclose_mh_to_employer_Maybe,disclose_mh_to_employer_No,disclose_mh_to_employer_Yes,care_options_awareness_No,care_options_awareness_Not Sure,care_options_awareness_Yes,self_employed_missing
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0
