# Analysis for mental health data

## Import and Load

In [54]:
import pandas as pd
import seaborn as sns

In [9]:
# Read the train and test splits from the local data directory.
df_train = pd.read_csv("./data/train.csv")
df_test = pd.read_csv("./data/test.csv")

# Report the dimensions (rows x columns) of each split.
print(f"df_train: {df_train.shape[0]} rows x {df_train.shape[1]} columns")
print(f"df_test: {df_test.shape[0]} rows x {df_test.shape[1]} columns")

df_train: 140700 rows x 20 columns
df_test: 93800 rows x 19 columns


## Missing Values

In [15]:
# Per-column count of missing values, one column of the table per split.
# The counts are converted to plain dicts before building the table, as in
# the original cell, so the row order follows the column order of the data.
pd.DataFrame({
    split_name: split_frame.isna().sum().to_dict()
    for split_name, split_frame in (('train', df_train), ('test', df_test))
})

Unnamed: 0,train,test
id,0,0.0
Name,0,0.0
Gender,0,0.0
Age,0,0.0
City,0,0.0
Working Professional or Student,0,0.0
Profession,36630,24632.0
Academic Pressure,112803,75033.0
Work Pressure,27918,18778.0
CGPA,112802,75034.0


### Profession Missing Values

We fill the missing values of the "Profession" column using the "Working Professional or Student" column, i.e., if the value of "Profession" is missing, we replace it with the value of "Working Professional or Student" from the same row.

In [35]:
# Among rows with a missing "Profession", count students vs. working professionals.
df_train.loc[df_train["Profession"].isna(), ["Working Professional or Student"]].value_counts()

Working Professional or Student
Student                            27867
Working Professional                8763
Name: count, dtype: int64

In [36]:
# Overall split between students and working professionals in the training set.
df_train.value_counts(subset=["Working Professional or Student"])

Working Professional or Student
Working Professional               112799
Student                             27901
Name: count, dtype: int64

In [37]:
# Apply the same imputation to both splits: a missing "Profession" takes the
# row's "Working Professional or Student" value.
for frame in (df_train, df_test):
    frame["Profession"] = frame["Profession"].fillna(frame["Working Professional or Student"])

### Academic Pressure & Work Pressure

We merge these two columns into a single "Pressure" column, as they are complementary.

In [43]:
# Among rows with a missing "Academic Pressure", count students vs. working professionals.
df_train.loc[df_train["Academic Pressure"].isna(), ["Working Professional or Student"]].value_counts()

Working Professional or Student
Working Professional               112794
Student                                 9
Name: count, dtype: int64

In [45]:
# Among rows with a missing "Work Pressure", count students vs. working professionals.
df_train.loc[df_train["Work Pressure"].isna(), ["Working Professional or Student"]].value_counts()

Working Professional or Student
Student                            27898
Working Professional                  20
Name: count, dtype: int64

In [47]:
# Build a single "Pressure" column in both splits: take "Academic Pressure"
# and fall back to "Work Pressure" where the academic value is missing.
# Rows where both source columns are missing stay missing.
for frame in (df_train, df_test):
    frame["Pressure"] = frame["Academic Pressure"].fillna(frame["Work Pressure"])

### CGPA

We fill the missing values of this column with 0: if the CGPA is missing, the respondent is (almost always) not a student, so there is no CGPA to report.

In [49]:
# Among rows with a missing "CGPA", count students vs. working professionals.
df_train.loc[df_train["CGPA"].isna(), ["Working Professional or Student"]].value_counts()

Working Professional or Student
Working Professional               112793
Student                                 9
Name: count, dtype: int64

In [51]:
# Replace every missing CGPA with 0 in both splits.
# Note: this is unconditional, so a student row with a missing CGPA is set to 0 as well.
for frame in (df_train, df_test):
    frame["CGPA"] = frame["CGPA"].fillna(0)

### Study Satisfaction and Work Satisfaction

Same reasoning as for "Pressure".

In [52]:
# Build a single "Satisfaction" column in both splits: take "Study Satisfaction"
# and fall back to "Job Satisfaction" where the study value is missing.
for frame in (df_train, df_test):
    frame["Satisfaction"] = frame["Study Satisfaction"].fillna(frame["Job Satisfaction"])

## Feature Engineering

### Degree

In [114]:
# Lookup tables for categorize_degree. Every entry must be written in the
# normalised form the function produces: upper-case, with '.', ' ', '_' and '-'
# removed. An entry containing any of those characters can never match.
# NOTE(review): 'BHOPAL' and 'PLUMBER' do not look like degree names; they are
# kept as in the original mapping. Confirm they are intentional.
bachelor = {'BACHELOR', 'BSTUDENT','B', 'BE', 'BSC', 'BCA', 'BHM', 'BA', 'BBA', 'BPHARM', 'BARCH', 'BTECH', 'LLB', 'BED', 'BCOM', 'BBCOM', 'BPA', 'BHOPAL'}
# 'MPHARM' replaces the original 'M.PHARM', which was unreachable because the
# dot is stripped from the input before the lookup.
master = {'MASTER', 'M', 'MBA', 'MCA', 'MA', 'ME', 'MCOM', 'MSC', 'MTECH', 'LLM', 'MPHARM', 'MPA', 'MD', 'MHM','LLS', 'LLED', 'LCA', 'LLBA', 'LLCOM', 'LED', 'MED', 'MBBS', 'LLBED', 'LHM', 'LLTECH'}
high_school = {'HIGHSCHOOL', 'CLASS12', 'PLUMBER', 'CLASS', 'CLASS11'}

def categorize_degree(degree):
    """Map a raw degree label to a coarse education level.

    The label is converted to a string, stripped of '.', ' ', '_' and '-'
    characters and upper-cased before being looked up, so 'B.Tech', 'b tech'
    and 'B-TECH' are all treated alike.

    Parameters
    ----------
    degree : any
        Raw value of the "Degree" column. Non-string values (including NaN)
        are converted with ``str`` first.

    Returns
    -------
    str or None
        One of 'bachelor', 'master', 'doctorate' or 'high_school', or ``None``
        when the normalised label is not in any lookup table.
    """
    degree = str(degree).replace('.', '').replace(' ', '').replace('_', '').replace('-', '').upper()
    if degree in bachelor:
        return 'bachelor'
    if degree in master:
        return 'master'
    if degree in {'PHD', 'DOCTOR', 'DOCTORATE'}:
        return 'doctorate'
    if degree in high_school:
        return "high_school"
    # Unrecognised label (this also covers missing values, which become 'NAN').
    return None

# Replace the raw "Degree" labels in both splits with the coarse level
# returned by categorize_degree.
for frame in (df_train, df_test):
    frame['Degree'] = frame['Degree'].apply(categorize_degree)