# Exploring Mental Health Data
**Objective:** Predict whether an individual suffers from depression based on a set of responses from a mental health survey.

**Problem task:** Binary classification on the target variable depression (0 = false, 1 = true)

**Dataset source:** Kaggle - Playground Series S4E11


In [None]:
# Per-developer dataset locations. Only Sara's entry is filled in, and it is
# the path hard-coded in the read_csv calls below.
#Marta path:
#Ricardo path:
#Sara path: "/Users/saracortez/feup/3o ano/iart/exploring_mental_health_data/data/train.csv"
import pandas as pd
# Load the raw train and test splits from absolute local paths.
train_data = pd.read_csv("/Users/saracortez/feup/3o ano/iart/exploring_mental_health_data/data/train.csv")
test_data = pd.read_csv("/Users/saracortez/feup/3o ano/iart/exploring_mental_health_data/data/test.csv")

# Preview the first rows; describe() is the last expression of the cell, so
# its summary table is what the notebook displays as the cell output.
print(train_data.head())
train_data.describe()

In [None]:
# Duplicate removal: drop fully identical rows from the training split and
# report the row count before (bf) and after (af).
bf = len(train_data)
print(f"Number of rows before removing duplicates: {len(train_data)}")
train_data = train_data.drop_duplicates()
af = len(train_data)
print(f"Number of rows after removing duplicates: {len(train_data)}")
# An unchanged row count means drop_duplicates() removed nothing.
if (bf-af) == 0:
    print("(No dup data found)")

In [None]:
#missing value check
def missing_value_info(df):
    """Summarize missing values per column of ``df``.

    Returns a DataFrame indexed by column name with two columns:
    'Missing Values' (number of null entries) and 'Percent Missing'
    (that number as a percentage of the row count), sorted from the
    most to the least incomplete column.
    """
    null_counts = df.isna().sum()
    summary = pd.DataFrame({
        'Missing Values': null_counts,
        'Percent Missing': null_counts / len(df) * 100,
    })
    return summary.sort_values(by='Percent Missing', ascending=False)
# Summary for every column, then the subset of columns that have at least one
# missing value. The filtered table is printed first, the full table second.
missing_info_with_0 = missing_value_info(train_data)
missing_info = missing_info_with_0[missing_info_with_0['Percent Missing'] > 0.0]
print(missing_info)
print(missing_info_with_0)


Since these columns were flagged as having missing values, we want to understand how those values appear: how many are missing (the NaN count in `value_counts`), and whether there are unexpected 0s or negative values.

### Data prep

In [None]:
# Manual inspection of value distributions (NaN included). One line is meant
# to be enabled at a time; the disabled ones are kept for reference.
#train_data['Study Satisfaction'].value_counts(dropna=False)
#train_data['Academic Pressure'].value_counts(dropna=False)
#train_data['CGPA'].value_counts(dropna=False)
#train_data['Profession'].value_counts(dropna=False)
#train_data['Work Pressure'].value_counts(dropna=False)
#train_data['Job Satisfaction'].value_counts(dropna=False)
#train_data['Dietary Habits'].value_counts(dropna=False)
#train_data['Financial Stress'].value_counts(dropna=False)
# Lift the row display limit so every distinct value is listed.
pd.set_option('display.max_rows', None)
train_data['Degree'].value_counts(dropna=False)
#regulated
# Conclusion: all our missing values are NaNs


In [None]:
# Force the complete list of counts to be displayed
pd.set_option('display.max_rows', None)  # This allows every value to be shown
print(train_data['Profession'].value_counts(dropna=False))


In [None]:
train_data['Profession'].value_counts(dropna=False)

# Allow-list of profession labels. clean_profession (defined further down in
# this cell) maps every other non-missing value to "other".
valid_professions = [
    "Teacher", "Content Writer", "Architect", "Consultant", "HR Manager",
    "Pharmacist", "Doctor", "Business Analyst", "Entrepreneur", "Chemist",
    "Chef", "Educational Consultant", "Data Scientist", "Researcher", "Lawyer",
    "Customer Support", "Marketing Manager", "Pilot", "Travel Consultant",
    "Plumber", "Sales Executive", "Manager", "Judge", "Electrician",
    "Financial Analyst", "Software Engineer", "Civil Engineer", "UX/UI Designer",
    "Digital Marketer", "Accountant", "Mechanical Engineer", "Graphic Designer",
    "Research Analyst", "Investment Banker", "Analyst", "Academic", "Unemployed", "Medical Doctor", "City Manager", "Family Consultant"
]

# Fix known typos in profession labels
def correct_profession(value):
    """Return ``value`` with known misspellings replaced.

    Values that are not in the typo table (including missing values) are
    returned unchanged.
    """
    known_typos = {
        "Finanancial Analyst": "Financial Analyst",
        # TODO: add more corrections?
    }
    if value in known_typos:
        return known_typos[value]
    return value

# Apply the typo corrections element-wise to both splits.
train_data['Profession'] = train_data['Profession'].apply(correct_profession)

test_data['Profession'] = test_data['Profession'].apply(correct_profession)

def clean_profession(value):
    """Collapse professions outside ``valid_professions`` into "other".

    Missing values (NaN/None) and allow-listed labels are returned unchanged.
    """
    if pd.notna(value) and value not in valid_professions:
        return "other"
    return value

# Collapse unrecognized professions to "other" in both splits.
train_data['Profession'] = train_data['Profession'].apply(clean_profession)
test_data['Profession'] = test_data['Profession'].apply(clean_profession)

train_data['Profession'].value_counts(dropna=False)

test_data['Profession'].value_counts(dropna=False)

# Notes on unexpected 'Profession' values:
# these are names: ["Yogesh", "Pranav", "Dev", "Yuvraj"]
# these seem to be localities ["Patna", "Visakhapatnam", "Nagpur", "FamilyVirar"]
# and these? what are they? not jobs. ["Patna", "Visakhapatnam", "Nagpur", "FamilyVirar"]
# NOTE(review): the list above is identical to the localities list; likely a copy/paste slip, confirm the intended values
# degrees like MBA
# substringing

In [None]:
import re
# Sleep Duration cleanup: handle ranges like 6-8, "more than / less than X",
# and direct numeric values.
print(len(train_data['Sleep Duration']))
# Authors' note: 15 rows lost (presumably answers that could not be parsed; TODO confirm)
def normalize_sleep_duration(column):
    """Convert free-text sleep-duration answers to a number of hours.

    Each value is stripped and tried against these forms, in order:

    * a range at the start of the text, e.g. "6-8" or "7-8 hours"
      -> midpoint of the two integers
    * "More than N ..." (case-insensitive) -> N + 0.5
    * "Less than N ..." (case-insensitive) -> N - 0.5
    * anything ``float()`` accepts, e.g. "7" or "7.5"
    * a number followed only by an hours unit, e.g. "8 hours", "8h"

    Anything else, and any missing value, becomes NaN.

    Args:
        column: pandas Series of raw answers (strings, numbers or missing).

    Returns:
        A float64 Series with the same index as ``column``.
    """
    # Compiled once per call instead of once per value.
    range_re = re.compile(r"(\d+)\s*-\s*(\d+)")
    more_re = re.compile(r"More than (\d+)", re.IGNORECASE)
    less_re = re.compile(r"Less than (\d+)", re.IGNORECASE)
    hours_re = re.compile(r"(\d+(?:\.\d+)?)\s*(?:hours?|hrs?|h)", re.IGNORECASE)
    # A float NaN (not pd.NA) keeps the resulting column numeric.
    missing = float("nan")

    def normalize(value):
        if pd.isna(value):
            return missing
        text = str(value).strip()

        match_range = range_re.match(text)
        if match_range:
            low, high = map(int, match_range.groups())
            return (low + high) / 2

        match_more = more_re.match(text)
        if match_more:
            return int(match_more.group(1)) + 0.5

        match_less = less_re.match(text)
        if match_less:
            return int(match_less.group(1)) - 0.5

        try:
            return float(text)
        except ValueError:
            pass

        # Bare number with an hours unit; fullmatch so trailing junk is rejected.
        match_hours = hours_re.fullmatch(text)
        if match_hours:
            return float(match_hours.group(1))
        return missing

    return column.apply(normalize).astype(float)

def normalize_large_sleep_values(column, weekly_threshold=12, days_per_week=7):
    """Rescale implausibly large sleep durations.

    Values greater than or equal to ``weekly_threshold`` are divided by
    ``days_per_week`` and rounded to the nearest half hour (presumably such
    answers are weekly totals; this is an assumption to confirm). Smaller
    values pass through unchanged. Missing values and values that cannot be
    compared or rescaled become NaN.

    Args:
        column: pandas Series of sleep durations in hours.
        weekly_threshold: smallest value that gets rescaled (default 12).
        days_per_week: divisor applied to rescaled values (default 7).

    Returns:
        A Series with the same index as ``column``.
    """
    # A float NaN (not pd.NA) keeps the resulting column numeric.
    missing = float("nan")

    def adjust_large(value):
        if pd.isna(value):
            return missing
        try:
            if value >= weekly_threshold:
                # round() on a x2 value, then halving, snaps to 0.5 steps
                # (Python rounds exact ties to the even integer).
                return round(value / days_per_week * 2) / 2
        except (TypeError, ValueError, OverflowError):
            # Non-numeric or infinite entries cannot be rescaled.
            return missing
        return value

    return column.apply(adjust_large)

# Run both sleep-duration passes (parse, then rescale large values) on the
# train and test splits, then inspect the resulting training distribution.
for sleep_frame in (train_data, test_data):
    parsed_sleep = normalize_sleep_duration(sleep_frame['Sleep Duration'])
    sleep_frame['Sleep Duration'] = normalize_large_sleep_values(parsed_sleep)

print(train_data['Sleep Duration'].value_counts())

In [None]:
valid_dietary = ["Moderate", "Unhealthy", "Healthy"]


def _clean_dietary(answer):
    """Keep missing values and recognized answers; map anything else to "other"."""
    if pd.isna(answer) or answer in valid_dietary:
        return answer
    return "other"


train_data["Dietary Habits"] = train_data["Dietary Habits"].apply(_clean_dietary)
test_data["Dietary Habits"] = test_data["Dietary Habits"].apply(_clean_dietary)

print(train_data['Dietary Habits'].value_counts(dropna = False))

In [None]:
def normalize_degree(column):
    """Normalize degree labels, tagging anything unrecognized as 'invalid'.

    String entries have '.' and ' ' removed. The cleaned label is kept when
    it is exactly 'Class12' or 'PhD', or when it has at least two characters,
    starts with 'L', 'P', 'B' or 'M', and its second character is uppercase.
    Every other entry, including non-string values, becomes 'invalid'.
    """
    def tidy(raw):
        if not isinstance(raw, str):
            return 'invalid'
        label = raw.replace('.', '').replace(' ', '')
        if label in ('Class12', 'PhD'):
            return label
        looks_like_degree = (
            len(label) > 1
            and label[0] in ('L', 'P', 'B', 'M')
            and label[1].isupper()
        )
        return label if looks_like_degree else 'invalid'

    return column.apply(tidy)
# Authors' note: it is important to give NA to the invalid entries so they can
# be dropped (normalize_degree tags them with the string 'invalid').
for degree_frame in (train_data, test_data):
    degree_frame['Degree'] = normalize_degree(degree_frame['Degree'])

# Degrees occurring at most 5 times in the training split are folded into
# 'other'; the same training-derived set is applied to the test split.
degree_counts = train_data['Degree'].value_counts()
rare_degrees = degree_counts[degree_counts <= 5].index

for degree_frame in (train_data, test_data):
    is_rare = degree_frame['Degree'].isin(rare_degrees)
    degree_frame['Degree'] = degree_frame['Degree'].mask(is_rare, 'other')

print(train_data['Degree'].value_counts(dropna = False))

In [None]:
# Distribution of Work/Study Hours, NaN included.
print(train_data['Work/Study Hours'].value_counts(dropna=False))
# The triple-quoted block below is a bare string expression, so none of the
# inspection calls written inside it are executed.
"""
train_data['Work/Study Hours'].value_counts(dropna=False)
train_data['Academic Pressure'].value_counts(dropna=False)
train_data['CGPA'].value_counts(dropna=False)
train_data['Profession'].value_counts(dropna=False)
train_data['Work Pressure'].value_counts(dropna=False)
train_data['Job Satisfaction'].value_counts(dropna=False)
train_data['Dietary Habits'].value_counts(dropna=False)
train_data['Financial Stress'].value_counts(dropna=False)
train_data['Degree'].value_counts(dropna=False)"""



We can identify two categories of individuals in our data, distinguishable by the attributes they have filled in:
- **Students:** academic pressure, CGPA, study satisfaction, degree
- **Worker Professionals:** work pressure, profession, job satisfaction

Our decision tree classifier requires numeric inputs. Thus, let's convert the two-category (binary) variables into 0/1.

In [None]:

# Encode the two-valued survey answers as 0/1 in both splits.
# Gender: Male -> 1, Female -> 0; the Yes/No questions: Yes -> 1, No -> 0.
binary_encodings = {
    'Gender': {'Male': 1, 'Female': 0},
    'Working Professional or Student': {'Working Professional': 1, 'Student': 0},
    'Have you ever had suicidal thoughts ?': {'Yes': 1, 'No': 0},
    'Family History of Mental Illness': {'Yes': 1, 'No': 0},
}

for binary_frame in (train_data, test_data):
    for binary_col, encoding in binary_encodings.items():
        binary_frame[binary_col] = binary_frame[binary_col].replace(encoding)
#print(train_data['Gender']).value_counts().sort(ascending=false)

In [None]:
# Priority-based undersampling:
# every class larger than the smallest one is cut down to the smallest class
# size. Rows whose Profession / Degree / Dietary Habits hold the placeholder
# labels "invalid" or "other" are set aside first, so they are only kept when
# there are not enough unflagged rows to reach the target size.

from sklearn.utils import resample
import pandas as pd

X = train_data.drop("Depression", axis=1)
y = train_data["Depression"]

# Re-assemble features and target; this places 'Depression' as the last column.
data = pd.concat([X, y], axis=1)

class_counts = data["Depression"].value_counts()
min_class_size = class_counts.min()

# Columns checked for the "invalid" / "other" placeholder labels.
priority_cols = ["Profession", "Degree", "Dietary Habits"]

balanced_data = []

for label in class_counts.index:
    subset = data[data["Depression"] == label]

    if len(subset) > min_class_size:
        
        to_keep = min_class_size
        # First priority for removal: rows with "invalid" in any priority column
        invalid_rows = subset[subset[priority_cols].isin(["invalid"]).any(axis=1)]
        subset = subset.drop(invalid_rows.index)

        # Second priority for removal: rows with "other" in any priority column
        other_rows = subset[subset[priority_cols].isin(["other"]).any(axis=1)]
        subset = subset.drop(other_rows.index)

        remaining_needed = to_keep


        if len(subset) >= remaining_needed:
            # Enough unflagged rows: sample only from them (seeded for reproducibility).
            to_sample = subset.sample(remaining_needed, random_state=42)
        else:
            # Not enough unflagged rows: keep all of them and draw the
            # shortfall from the "other" rows first, then from the "invalid" rows.
            rows_needed = remaining_needed - len(subset)

            if len(other_rows) >= rows_needed:
                to_sample = pd.concat([subset, other_rows.sample(rows_needed, random_state=42)])
            else:
                still_needed = rows_needed - len(other_rows)
                to_sample = pd.concat([
                    subset,
                    other_rows,
                    invalid_rows.sample(still_needed, random_state=42)
                ])
    else:

        # Class is already at the minimum size: keep every row.
        to_sample = subset

    balanced_data.append(to_sample)

undersampled_data = pd.concat(balanced_data)

# Written to the current working directory, without the index column.
undersampled_data.to_csv("undersampled_dataset.csv", index=False)


In [None]:
# Continue preprocessing on a copy of the undersampled training set.
train_data = undersampled_data.copy()

In [None]:
def _merge_role_features(frame):
    """Merge the student-only and worker-only columns of ``frame``.

    * 'Satisfaction' = row-wise mean of job and study satisfaction (NaN skipped)
    * 'Pressure'     = row-wise mean of work and academic pressure (NaN skipped)
    * rows with a CGPA but no profession get Profession = 'Student'

    The source columns and 'CGPA' are dropped; the merged frame is returned.
    """
    frame['Satisfaction'] = frame[['Job Satisfaction', 'Study Satisfaction']].mean(axis=1, skipna=True)
    frame = frame.drop(columns=['Job Satisfaction', 'Study Satisfaction'])

    frame['Pressure'] = frame[['Work Pressure', 'Academic Pressure']].mean(axis=1, skipna=True)
    frame = frame.drop(columns=['Work Pressure', 'Academic Pressure'])

    has_cgpa_but_no_profession = frame['CGPA'].notna() & frame['Profession'].isna()
    frame.loc[has_cgpa_but_no_profession, 'Profession'] = 'Student'
    return frame.drop(columns=['CGPA'])


train_data = _merge_role_features(train_data)
for merged_col in ('Pressure', 'Satisfaction', 'Profession'):
    print(train_data[[merged_col]].head())

test_data = _merge_role_features(test_data)


In [None]:
# Save the preprocessed training split (index column omitted).
train_data.to_csv("train_merge.csv", index=False)

In [None]:
# Save the preprocessed test split (index column omitted).
test_data.to_csv("test_merge.csv", index=False)

In [None]:
import pandas as pd
# Reload the preprocessed training split from disk and show its columns and first rows.
train_data_clean = pd.read_csv("train_merge.csv")
print(train_data_clean.columns)
print(train_data_clean.head())


## Plotting

In [None]:
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
# Work/study hours against sleep duration, colored by the suicidal-thoughts answer.
sb.scatterplot(
    data=train_data_clean,
    x='Work/Study Hours',
    y='Sleep Duration',
    hue='Have you ever had suicidal thoughts ?',
)
plt.title('Work Hours vs. Sleep Duration')
plt.show()

## Algorithms

In [None]:
# Pairwise relationships between the selected features, colored by the
# suicidal-thoughts answer.
pairplot_hue = 'Have you ever had suicidal thoughts ?'
pairplot_columns = ['Satisfaction', 'Sleep Duration', 'Work/Study Hours', pairplot_hue]
sb.pairplot(train_data_clean[pairplot_columns], hue=pairplot_hue)
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


# Modelling table: the cleaned training data without 'id', 'Name', 'City'
# and 'Profession'.
training_data = train_data_clean.drop(columns=['id', 'Name', 'City', 'Profession'])

# Integer-encode every remaining text column (values are cast to str first),
# keeping the fitted encoder of each column in label_encoders.
categorical_cols = training_data.select_dtypes(include=['object']).columns
label_encoders = {col: LabelEncoder() for col in categorical_cols}
for col, encoder in label_encoders.items():
    training_data[col] = encoder.fit_transform(training_data[col].astype(str))

# Target and features, then a seeded 80/20 train/validation split.
y = training_data['Depression']
X = training_data.drop(columns=['Depression'])

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
# Decision tree baseline. random_state is fixed (42, as in the sampling and
# split steps) so the fitted tree and the metrics below are reproducible
# from run to run.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Evaluate on the held-out validation split.
y_pred = clf.predict(X_val)

cf_mx = confusion_matrix(y_val, y_pred)
accuracy = accuracy_score(y_val, y_pred)
report = classification_report(y_val, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", report)
print("Confusion matrix\n", cf_mx)
# Authors' note from an earlier, unseeded run: accuracy of 86

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# The MLP is trained on mean-imputed features. The imputer is fitted on the
# training split only and then applied to the validation split.
imputer = SimpleImputer(strategy='mean')

X_train_imputed = imputer.fit_transform(X_train)

X_val_imputed = imputer.transform(X_val)

# random_state is fixed (42, as in the sampling and split steps) so weight
# initialization, and therefore the reported metrics, are reproducible.
clf = MLPClassifier(max_iter=1000, random_state=42)
clf.fit(X_train_imputed, y_train)

# Evaluate on the held-out validation split.
y_pred = clf.predict(X_val_imputed)

cf_mx = confusion_matrix(y_val, y_pred)
accuracy = accuracy_score(y_val, y_pred)
report = classification_report(y_val, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)
print("Confusion matrix\n", cf_mx)