### Goal: use data from a mental health survey to explore factors that may cause individuals to experience depression.

## Notes
- A number of data artifacts have been left in the synthetic dataset.
- This is not a particularly difficult dataset to model, so it may be more interesting to focus on different ways to visualize it.

In [16]:
import pandas as pd

In [17]:
# Read the raw train/test splits from the sibling data directory.
TRAIN_PATH = "../data/train.csv"
TEST_PATH = "../data/test.csv"

df_train = pd.read_csv(TRAIN_PATH)
df_test = pd.read_csv(TEST_PATH)

In [18]:
def cleaning(df):
    """Drop unused survey columns and replace 'Age' with a binned category.

    The frame is modified IN PLACE and also returned, so callers may either
    ignore the return value or chain on it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Age' and every column listed in ``unused_columns``
        below; a missing column raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, now without the unused columns
        and without 'Age', plus a new categorical 'age_binned' column.

    Notes
    -----
    Bins are right-closed with the lowest edge included:
    [18, 25] -> '20s', (25, 35] -> '30s', (35, 45] -> '40s',
    (45, 55] -> '50s', (55, 65] -> '60s'. Ages outside 18..65 (or missing)
    become NaN in 'age_binned'.
    """
    unused_columns = [
        'City', 'id', 'Name', 'Profession', 'Family History of Mental Illness',
        'Degree', 'Academic Pressure', 'Work Pressure', 'CGPA',
        'Study Satisfaction', 'Job Satisfaction',
    ]
    age_edges = [18, 25, 35, 45, 55, 65]
    age_labels = ['20s', '30s', '40s', '50s', '60s']

    df.drop(columns=unused_columns, inplace=True)
    df['age_binned'] = pd.cut(
        df['Age'],
        bins=age_edges,
        labels=age_labels,
        include_lowest=True,
    )
    # The raw age is no longer needed once the binned column exists.
    del df['Age']
    return df


In [19]:
# cleaning() mutates each frame in place, so the return values are not needed.
cleaning(df_train)
cleaning(df_test)

# Only rows whose categorical answers fall in these allow-lists are kept;
# anything else is treated as a data artifact and dropped.
# NOTE(review): the recorded df_train.info() output shows 108456 rows left
# with index labels up to 140699, so a large share of rows is removed here.
# The sleep list has no bucket between "Less than 5 hours" and "7-8 hours"
# (e.g. a "5-6 hours" answer would be discarded) - confirm this is intended.
VALID_DIETARY_HABITS = ['Moderate', 'Unhealthy', 'Healthy']
VALID_SLEEP_DURATIONS = ['Less than 5 hours', '7-8 hours', 'More than 8 hours']


def _keep_known_categories(df):
    """Return an independent copy of ``df`` restricted to allow-listed rows.

    A row is kept only when both 'Dietary Habits' and 'Sleep Duration' hold
    one of the allow-listed values. The original index labels are preserved.
    The explicit ``.copy()`` means later column assignments operate on a real
    frame rather than on a slice of the unfiltered data, which avoids
    ``SettingWithCopyWarning``.
    """
    keep = (
        df['Dietary Habits'].isin(VALID_DIETARY_HABITS)
        & df['Sleep Duration'].isin(VALID_SLEEP_DURATIONS)
    )
    return df[keep].copy()


# NOTE(review): test rows are filtered too, so downstream predictions will not
# cover every row of the original test file - confirm this is acceptable.
df_test = _keep_known_categories(df_test)
df_train = _keep_known_categories(df_train)

In [20]:
# Column roles used by the preprocessing cells.

# Label column (present in the training frame only).
target = 'Depression'

# Numeric columns to be min-max scaled.
scaling_columns = [
    'Work/Study Hours',
    'Financial Stress',
]

# Nominal columns to be one-hot encoded.
one_hot_columns = [
    'Working Professional or Student',
    'Dietary Habits',
]


In [21]:
# Preview the first rows of the training frame after cleaning and filtering.
df_train.head()

Unnamed: 0,Gender,Working Professional or Student,Sleep Duration,Dietary Habits,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Depression,age_binned
0,Female,Working Professional,More than 8 hours,Healthy,No,1.0,2.0,0,50s
1,Male,Working Professional,Less than 5 hours,Unhealthy,Yes,7.0,3.0,1,30s
3,Male,Working Professional,Less than 5 hours,Moderate,Yes,10.0,1.0,1,20s
6,Male,Working Professional,7-8 hours,Moderate,No,6.0,2.0,0,50s
7,Male,Working Professional,7-8 hours,Unhealthy,No,10.0,3.0,0,40s


In [22]:
# Preview the first rows of the test frame after cleaning and filtering
# (the recorded output has no 'Depression' column).
df_test.head()

Unnamed: 0,Gender,Working Professional or Student,Sleep Duration,Dietary Habits,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,age_binned
0,Male,Working Professional,Less than 5 hours,Moderate,No,9.0,3.0,50s
1,Female,Working Professional,Less than 5 hours,Moderate,No,6.0,4.0,60s
2,Male,Working Professional,7-8 hours,Moderate,Yes,12.0,4.0,50s
3,Female,Student,More than 8 hours,Moderate,Yes,10.0,4.0,20s
4,Male,Working Professional,7-8 hours,Moderate,Yes,3.0,4.0,50s


## Cleaning done; now scaling and encoding

In [23]:
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, OneHotEncoder

# Preprocessing objects shared with the cells below.
mm = MinMaxScaler()
le = LabelEncoder()  # not used in the cells visible here; kept in case others rely on it
# drop='first' omits the first category of each feature; sparse_output=False
# makes the encoder return a dense array.
oe = OneHotEncoder(drop='first', sparse_output=False)

# Fit the scaler ONCE on all scaling columns of the training frame and reuse
# those parameters for the test frame. Min-max scaling is computed per
# feature, so the scaled values are the same as fitting each column
# separately, but `mm` now stays fitted for every scaled column instead of
# only the last one processed.
df_train[scaling_columns] = mm.fit_transform(df_train[scaling_columns])
df_test[scaling_columns] = mm.transform(df_test[scaling_columns])


In [24]:
# Learn the categories from the training rows only, then apply the same
# fitted encoder to the test rows.
encoder_train = oe.fit_transform(df_train[one_hot_columns])
encoder_test = oe.transform(df_test[one_hot_columns])

# Wrap the encoded arrays in frames that reuse each source frame's index so
# the joins below line up row-for-row.
encoded_train_df = pd.DataFrame(
    encoder_train,
    columns=oe.get_feature_names_out(one_hot_columns),
    index=df_train.index,
)
encoded_test_df = pd.DataFrame(
    encoder_test,
    columns=oe.get_feature_names_out(one_hot_columns),
    index=df_test.index,
)

# Swap the original nominal columns for their indicator columns.
train_without_nominal = df_train.drop(columns=one_hot_columns)
test_without_nominal = df_test.drop(columns=one_hot_columns)
df_train = train_without_nominal.join(encoded_train_df)
df_test = test_without_nominal.join(encoded_test_df)

In [25]:
# Inspect the training frame after scaling and one-hot encoding.
df_train.head()

Unnamed: 0,Gender,Sleep Duration,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Depression,age_binned,Working Professional or Student_Working Professional,Dietary Habits_Moderate,Dietary Habits_Unhealthy
0,Female,More than 8 hours,No,0.083333,0.25,0,50s,1.0,0.0,0.0
1,Male,Less than 5 hours,Yes,0.583333,0.5,1,30s,1.0,0.0,1.0
3,Male,Less than 5 hours,Yes,0.833333,0.0,1,20s,1.0,1.0,0.0
6,Male,7-8 hours,No,0.5,0.25,0,50s,1.0,1.0,0.0
7,Male,7-8 hours,No,0.833333,0.5,0,40s,1.0,0.0,1.0


In [26]:
def transform(df):
    """Replace the remaining text columns with integer codes, in place.

    Each listed column is overwritten with the result of ``Series.map`` using
    the fixed dictionary below. Any value that is not a key of its dictionary
    becomes NaN. The function mutates ``df`` and returns ``None``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Gender', 'Have you ever had suicidal thoughts ?',
        'Sleep Duration' and 'age_binned'.
    """
    encodings = {
        'Gender': {'Female': 1, 'Male': 0},
        'Have you ever had suicidal thoughts ?': {'No': 0, 'Yes': 1},
        # Ordinal: shorter sleep gets the lower code.
        'Sleep Duration': {'Less than 5 hours': 0, '7-8 hours': 1, 'More than 8 hours': 2},
        # Ordinal: younger age bin gets the lower code.
        'age_binned': {'20s': 0, '30s': 1, '40s': 2, '50s': 3, '60s': 4},
    }
    for column, codes in encodings.items():
        df[column] = df[column].map(codes)

In [27]:
# transform() encodes in place and returns None, so just call it on each frame.
for frame in (df_train, df_test):
    transform(frame)

In [28]:
# Check that the former text columns now hold numeric codes.
df_train.head()

Unnamed: 0,Gender,Sleep Duration,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Depression,age_binned,Working Professional or Student_Working Professional,Dietary Habits_Moderate,Dietary Habits_Unhealthy
0,1,2,0,0.083333,0.25,0,3,1.0,0.0,0.0
1,0,0,1,0.583333,0.5,1,1,1.0,0.0,1.0
3,0,0,1,0.833333,0.0,1,0,1.0,1.0,0.0
6,0,1,0,0.5,0.25,0,3,1.0,1.0,0.0
7,0,1,0,0.833333,0.5,0,2,1.0,0.0,1.0


In [29]:
# Summarise dtypes and non-null counts of the final training frame.
# NOTE(review): the recorded output lists 'Financial Stress' with 108454
# non-null values out of 108456 rows, so two missing values are still present
# when the frame is saved - decide whether to impute or drop them.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 108456 entries, 0 to 140699
Data columns (total 10 columns):
 #   Column                                                Non-Null Count   Dtype   
---  ------                                                --------------   -----   
 0   Gender                                                108456 non-null  int64   
 1   Sleep Duration                                        108456 non-null  int64   
 2   Have you ever had suicidal thoughts ?                 108456 non-null  int64   
 3   Work/Study Hours                                      108456 non-null  float64 
 4   Financial Stress                                      108454 non-null  float64 
 5   Depression                                            108456 non-null  int64   
 6   age_binned                                            108456 non-null  category
 7   Working Professional or Student_Working Professional  108456 non-null  float64 
 8   Dietary Habits_Moderate                

In [30]:
# Write the preprocessed frames next to the raw data. The index is written as
# well (to_csv default), so the original row labels survive in the files.
CLEANED_TEST_PATH = "../data/cleaned_test.csv"
CLEANED_TRAIN_PATH = "../data/cleaned_train.csv"

df_test.to_csv(CLEANED_TEST_PATH)
df_train.to_csv(CLEANED_TRAIN_PATH)