# Exploring Mental Health Data
**Objective:** Predict whether an individual suffers from depression based on a set of responses from a mental health survey.

**Problem task:** Binary classification on the target variable depression (0 = false, 1 = true)

**Dataset source:** Kaggle - Playground Series S4E11


In [1]:
#Marta path:
#Ricardo path:
#Sara path:
import pandas as pd
train_data = pd.read_csv("./data/train.csv")
print(train_data.head())
train_data.describe()

   id      Name  Gender   Age           City Working Professional or Student  \
0   0  Aaradhya  Female  49.0       Ludhiana            Working Professional   
1   1     Vivan    Male  26.0       Varanasi            Working Professional   
2   2    Yuvraj    Male  33.0  Visakhapatnam                         Student   
3   3    Yuvraj    Male  22.0         Mumbai            Working Professional   
4   4      Rhea  Female  30.0         Kanpur            Working Professional   

         Profession  Academic Pressure  Work Pressure  CGPA  \
0              Chef                NaN            5.0   NaN   
1           Teacher                NaN            4.0   NaN   
2               NaN                5.0            NaN  8.97   
3           Teacher                NaN            5.0   NaN   
4  Business Analyst                NaN            1.0   NaN   

   Study Satisfaction  Job Satisfaction     Sleep Duration Dietary Habits  \
0                 NaN               2.0  More than 8 hours     

Unnamed: 0,id,Age,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Work/Study Hours,Financial Stress,Depression
count,140700.0,140700.0,27897.0,112782.0,27898.0,27897.0,112790.0,140700.0,140696.0,140700.0
mean,70349.5,40.388621,3.142273,2.998998,7.658636,2.94494,2.974404,6.252679,2.988983,0.181713
std,40616.735775,12.384099,1.380457,1.405771,1.464466,1.360197,1.416078,3.853615,1.413633,0.385609
min,0.0,18.0,1.0,1.0,5.03,1.0,1.0,0.0,1.0,0.0
25%,35174.75,29.0,2.0,2.0,6.29,2.0,2.0,3.0,2.0,0.0
50%,70349.5,42.0,3.0,3.0,7.77,3.0,3.0,6.0,3.0,0.0
75%,105524.25,51.0,4.0,4.0,8.92,4.0,4.0,10.0,4.0,0.0
max,140699.0,60.0,5.0,5.0,10.0,5.0,5.0,12.0,5.0,1.0


In [2]:
#duplicate removal
bf = len(train_data)
print(f"Number of rows before removing duplicates: {len(train_data)}")
train_data = train_data.drop_duplicates()
af = len(train_data)
print(f"Number of rows after removing duplicates: {len(train_data)}")
if (bf-af) == 0:
    print("(No dup data found)")

Number of rows before removing duplicates: 140700
Number of rows after removing duplicates: 140700
(No dup data found)


In [3]:
#missing value check
def missing_value_info(df):
    total = df.isnull().sum()
    percent = (total / len(df)) * 100
    return pd.DataFrame({'Missing Values': total, 'Percent Missing': percent}).sort_values(by='Percent Missing', ascending=False)
missing_info_with_0 = missing_value_info(train_data)
missing_info = missing_info_with_0[missing_info_with_0['Percent Missing'] > 0.0]
print(missing_info)
print(missing_info_with_0)


                    Missing Values  Percent Missing
Study Satisfaction          112803        80.172708
Academic Pressure           112803        80.172708
CGPA                        112802        80.171997
Profession                   36630        26.034115
Work Pressure                27918        19.842217
Job Satisfaction             27910        19.836532
Financial Stress                 4         0.002843
Dietary Habits                   4         0.002843
Degree                           2         0.001421
                                       Missing Values  Percent Missing
Study Satisfaction                             112803        80.172708
Academic Pressure                              112803        80.172708
CGPA                                           112802        80.171997
Profession                                      36630        26.034115
Work Pressure                                   27918        19.842217
Job Satisfaction                                27910 

since these columns were spotted for missing values, we want to understand their appearance: how many are missing (NaN count in value_counts), if there unexpected 0s or negative values.

### Data prepp

In [4]:
train_data['Study Satisfaction'].value_counts(dropna=False)
train_data['Academic Pressure'].value_counts(dropna=False)
train_data['CGPA'].value_counts(dropna=False)
train_data['Profession'].value_counts(dropna=False)
train_data['Work Pressure'].value_counts(dropna=False)
train_data['Job Satisfaction'].value_counts(dropna=False)
train_data['Dietary Habits'].value_counts(dropna=False)
train_data['Financial Stress'].value_counts(dropna=False)
train_data['Degree'].value_counts(dropna=False)
#regulated
#Comclusion: all our missing vals are NANS


Degree
Class 12    14729
B.Ed        11691
B.Arch       8742
B.Com        8113
B.Pharm      5856
            ...  
LCA             1
B B.Com         1
RCA             1
Mihir           1
Advait          1
Name: count, Length: 116, dtype: int64

In [None]:
train_data['Profession'].value_counts(dropna=False)

valid_professions = [
    "Teacher", "Content Writer", "Architect", "Consultant", "HR Manager",
    "Pharmacist", "Doctor", "Business Analyst", "Entrepreneur", "Chemist",
    "Chef", "Educational Consultant", "Data Scientist", "Researcher", "Lawyer",
    "Customer Support", "Marketing Manager", "Pilot", "Travel Consultant",
    "Plumber", "Sales Executive", "Manager", "Judge", "Electrician",
    "Financial Analyst", "Software Engineer", "Civil Engineer", "UX/UI Designer",
    "Digital Marketer", "Accountant", "Mechanical Engineer", "Graphic Designer",
    "Research Analyst", "Investment Banker", "Analyst", "Academic", "Unemployed", "Medical Doctor", "City Manager", "Family Consultant"
]
# we will replace the remainder with na
train_data["Profession"] = train_data["Profession"].apply(
    lambda x: x if x in valid_professions else "other"
)
#for now i put na. not sure

train_data['Profession'].value_counts(dropna=False)

# these are names: ["Yogesh", "Pranav", "Dev", "Yuvraj"]
# these seem to be localities ["Patna", "Visakhapatnam", "Nagpur", "FamilyVirar"]
# and these ? what are hey ? not jobs. ["Patna", "Visakhapatnam", "Nagpur", "FamilyVirar"]
#degrees like MBA
#substringing

Profession
other                     37997
Teacher                   24906
Content Writer             7814
Architect                  4370
Consultant                 4229
HR Manager                 4022
Pharmacist                 3893
Doctor                     3255
Business Analyst           3161
Entrepreneur               2968
Chemist                    2967
Chef                       2862
Educational Consultant     2852
Data Scientist             2390
Researcher                 2328
Lawyer                     2212
Customer Support           2055
Marketing Manager          1976
Pilot                      1913
Travel Consultant          1860
Plumber                    1748
Sales Executive            1739
Manager                    1737
Judge                      1712
Electrician                1582
Financial Analyst          1574
Software Engineer          1510
Civil Engineer             1470
UX/UI Designer             1452
Digital Marketer           1372
Accountant                 13

In [34]:
#train_data['Sleep Duration'].value_counts(dropna=False)#.sort(ascending=True)


import re

def normalize_ranges(column):
    def calculate_average(value):
        match = re.match(r"(\d+)\s*-\s*(\d+)", str(value))
        if match:
            x, y = map(int, match.groups())
            return (x + y) / 2 
        return value  

    return column.apply(calculate_average)

def normalize_more_less(column):
    def normalize_value(value):
        more_match = re.match(r"More than (\d+)", str(value))
        if more_match:
            x = int(more_match.group(1))
            return x + 0.5 

        less_match = re.match(r"Less than (\d+)", str(value))
        if less_match:
            x = int(less_match.group(1))
            return x - 0.5  

        return value  
    return column.apply(normalize_value)
def keep_numeric_only(column):
    def filter_numeric(value):
        try:
            return float(value)
        except ValueError:
            return pd.NA  

    return column.apply(filter_numeric)

def normalize_large_numbers(column):
    def process_large_number(value):
        try:
            value = float(value)  
            if value >= 12:
                return round(value / 7 * 2) / 2
            return value 
        except ValueError:
            return pd.NA  

    return column.apply(process_large_number)


train_data['Sleep Duration'] = normalize_ranges(train_data['Sleep Duration'])

train_data['Sleep Duration'] = normalize_more_less(train_data['Sleep Duration'])
train_data['Sleep Duration'] = keep_numeric_only(train_data['Sleep Duration'])
train_data['Sleep Duration'] = normalize_large_numbers(train_data['Sleep Duration'])

print(train_data['Sleep Duration'].value_counts())



TypeError: float() argument must be a string or a real number, not 'NAType'

In [10]:
valid_dietary = ["Moderate", "Unhealthy", "Healthy"]

train_data["Dietary Habits"] = train_data["Dietary Habits"].apply(
    lambda x: x if x in valid_dietary else "other"
)



print(train_data['Dietary Habits'].value_counts(dropna = False))

Dietary Habits
Moderate     49705
Unhealthy    46227
Healthy      44741
other           27
Name: count, dtype: int64
