# Exploring Mental Health Data
**Objective:** Predict whether an individual suffers from depression based on a set of responses from a mental health survey.

**Problem task:** Binary classification on the target variable depression (0 = false, 1 = true)

**Dataset source:** Kaggle - Playground Series S4E11


In [None]:
# Load the Kaggle train/test CSVs from the local ./data folder.
# Per-teammate absolute paths are kept below for reference only.
#Marta path:
#Ricardo path:
#Sara path: "/Users/saracortez/feup/3o ano/iart/exploring_mental_health_data/data/train.csv"
import pandas as pd
train_data = pd.read_csv("./data/train.csv")
test_data = pd.read_csv("./data/test.csv")

# Preview the first rows, then show summary statistics as the cell output.
print(train_data.head())
train_data.describe()

In [None]:
# Drop exact duplicate rows and report the row count before and after.
rows_before = len(train_data)
print(f"Number of rows before removing duplicates: {len(train_data)}")
train_data = train_data.drop_duplicates()
rows_after = len(train_data)
print(f"Number of rows after removing duplicates: {len(train_data)}")
if rows_before == rows_after:
    print("(No dup data found)")

In [None]:
# Missing-value check
def missing_value_info(df):
    """Summarise missing values per column.

    Returns a DataFrame indexed by column name with the absolute count
    ('Missing Values') and the percentage of rows ('Percent Missing'),
    sorted from the most to the least incomplete column.
    """
    null_counts = df.isnull().sum()
    summary = pd.DataFrame({
        'Missing Values': null_counts,
        'Percent Missing': (null_counts / len(df)) * 100,
    })
    return summary.sort_values(by='Percent Missing', ascending=False)
# Full per-column report, plus a filtered view with only the incomplete columns.
missing_info_with_0 = missing_value_info(train_data)
has_missing = missing_info_with_0['Percent Missing'] > 0.0
missing_info = missing_info_with_0.loc[has_missing]
print(missing_info)
print(missing_info_with_0)


Since these columns contain missing values, we want to understand how the gaps appear: how many values are missing (the NaN count in `value_counts`), and whether there are unexpected zeros or negative values.

### Data preparation

In [None]:
# Manual inspection of the columns that have missing values: uncomment one
# line at a time to see its value distribution (NaN included).
#train_data['Study Satisfaction'].value_counts(dropna=False)
#train_data['Academic Pressure'].value_counts(dropna=False)
#train_data['CGPA'].value_counts(dropna=False)
#train_data['Profession'].value_counts(dropna=False)
#train_data['Work Pressure'].value_counts(dropna=False)
#train_data['Job Satisfaction'].value_counts(dropna=False)
#train_data['Dietary Habits'].value_counts(dropna=False)
#train_data['Financial Stress'].value_counts(dropna=False)
# Lift the display row limit so every distinct value is listed.
pd.set_option('display.max_rows', None)
train_data['Degree'].value_counts(dropna=False)
# NOTE(review): original note here read "regulated" — meaning unclear, confirm with the author.
# Conclusion: all of our missing values are NaNs.


In [None]:
# Force the complete list of counts to be shown
pd.set_option('display.max_rows', None)  # lets every value be displayed instead of a truncated view
print(train_data['Profession'].value_counts(dropna=False))


In [None]:
# Distribution of the raw 'Profession' values (NaN included).
train_data['Profession'].value_counts(dropna=False)

# Whitelist of profession labels that are kept as-is; clean_profession()
# maps any other non-missing value to "other".
valid_professions = [
    "Teacher", "Content Writer", "Architect", "Consultant", "HR Manager",
    "Pharmacist", "Doctor", "Business Analyst", "Entrepreneur", "Chemist",
    "Chef", "Educational Consultant", "Data Scientist", "Researcher", "Lawyer",
    "Customer Support", "Marketing Manager", "Pilot", "Travel Consultant",
    "Plumber", "Sales Executive", "Manager", "Judge", "Electrician",
    "Financial Analyst", "Software Engineer", "Civil Engineer", "UX/UI Designer",
    "Digital Marketer", "Accountant", "Mechanical Engineer", "Graphic Designer",
    "Research Analyst", "Investment Banker", "Analyst", "Academic", "Unemployed", "Medical Doctor", "City Manager", "Family Consultant"
]

# Fix known typos
def correct_profession(value):
    """Return the corrected spelling of a known misspelled profession.

    Values without a registered correction (including missing values)
    are returned unchanged.
    """
    typo_fixes = {
        "Finanancial Analyst": "Financial Analyst",
        # TODO: add more corrections if new typos are found
    }
    try:
        return typo_fixes[value]
    except KeyError:
        return value

# Apply the typo corrections to both datasets.
for frame in (train_data, test_data):
    frame['Profession'] = frame['Profession'].apply(correct_profession)

def clean_profession(value):
    """Keep missing values and whitelisted professions; map the rest to "other"."""
    if pd.isna(value) or value in valid_professions:
        return value  # NaN stays NaN, valid labels stay as they are
    return "other"

# Collapse every profession outside the whitelist into "other" (NaN is kept).
for frame in (train_data, test_data):
    frame['Profession'] = frame['Profession'].apply(clean_profession)

train_data['Profession'].value_counts(dropna=False)

test_data['Profession'].value_counts(dropna=False)

# Notes on the values that are not real professions:
# - first names, e.g. ["Yogesh", "Pranav", "Dev", "Yuvraj"]
# - what seem to be localities, e.g. ["Patna", "Visakhapatnam", "Nagpur", "FamilyVirar"]
# - degree names such as MBA
# TODO(review): original note mentions "substringing" — presumably substring matching was considered; confirm.

In [None]:
import re
# Sleep Duration parsing handles ranges like 6-8, "more than X" / "less than X", and direct numeric values.
print(len(train_data['Sleep Duration']))
# NOTE(review): original note says "15 rows lost" — presumably values that could not be parsed; confirm.
def normalize_sleep_duration(column):
    """Convert free-text sleep durations into numeric hours.

    Each value is stringified and matched, in order, against:
      * a range "X-Y"       -> midpoint (X + Y) / 2
      * "More than X"       -> X + 0.5
      * "Less than X"       -> X - 0.5
      * a plain number      -> float(value)
    Anything else becomes NaN.

    Unparseable values are returned as float NaN rather than ``pd.NA`` so
    the result stays a plain numeric column instead of degrading to object
    dtype.

    Args:
        column: pandas Series holding the raw answers.

    Returns:
        pandas Series of hours, with NaN where parsing failed.
    """
    # Compiled once here instead of on every row.
    range_pattern = re.compile(r"(\d+)\s*-\s*(\d+)")
    more_pattern = re.compile(r"More than (\d+)", re.IGNORECASE)
    less_pattern = re.compile(r"Less than (\d+)", re.IGNORECASE)

    def normalize(value):
        value = str(value).strip()

        match_range = range_pattern.match(value)
        if match_range:
            x, y = map(int, match_range.groups())
            return (x + y) / 2

        match_more = more_pattern.match(value)
        if match_more:
            return int(match_more.group(1)) + 0.5

        match_less = less_pattern.match(value)
        if match_less:
            return int(match_less.group(1)) - 0.5

        try:
            return float(value)
        except ValueError:
            return float("nan")

    return column.apply(normalize)

def normalize_large_sleep_values(column):
    """Rescale implausibly large sleep durations.

    Values of 12 or more are divided by 7 and rounded to the nearest half
    hour; smaller and missing values pass through unchanged.
    NOTE(review): the division by 7 presumably treats such values as weekly
    totals — confirm with the author.

    Args:
        column: pandas Series of numeric sleep durations (may contain NaN/NA).

    Returns:
        pandas Series with large values rescaled; entries that cannot be
        compared or rounded numerically become ``pd.NA``.
    """
    def adjust_large(value):
        try:
            if pd.notna(value) and value >= 12:
                return round(value / 7 * 2) / 2
            return value
        except (TypeError, ValueError, OverflowError):
            # Non-numeric or non-finite entry: mark as missing instead of
            # hiding every possible error behind a bare except.
            return pd.NA
    return column.apply(adjust_large)

# Parse the raw answers into hours, then rescale the implausibly large
# values, for both datasets.
for frame in (train_data, test_data):
    parsed_hours = normalize_sleep_duration(frame['Sleep Duration'])
    frame['Sleep Duration'] = normalize_large_sleep_values(parsed_hours)

print(train_data['Sleep Duration'].value_counts())

In [None]:
# Accepted answers for 'Dietary Habits'; anything else that is not missing
# is bucketed as "other".
valid_dietary = ["Moderate", "Unhealthy", "Healthy"]


def _keep_valid_diet(answer):
    """Return the answer if it is missing or accepted, otherwise "other"."""
    if pd.isna(answer) or answer in valid_dietary:
        return answer
    return "other"


for frame in (train_data, test_data):
    frame["Dietary Habits"] = frame["Dietary Habits"].apply(_keep_valid_diet)


print(train_data['Dietary Habits'].value_counts(dropna = False))

In [None]:
def normalize_degree(column):
    """Normalise degree labels and flag entries that are not degrees.

    Strings have dots and spaces removed; non-string entries become
    'invalid'. A cleaned label is kept only when it is 'Class12' or 'PhD',
    or when it has at least two characters, its first two characters are
    uppercase, and it starts with L, P, B or M. Everything else becomes
    'invalid'.
    """
    allowed_initials = ['L', 'P', 'B', 'M']

    def strip_punctuation(raw):
        if not isinstance(raw, str):
            return 'invalid'
        return raw.replace('.', '').replace(' ', '')

    def keep_if_degree(label):
        if label == 'Class12' or label == "PhD":
            return label
        looks_like_degree = (
            len(label) > 1
            and label[0].isupper()
            and label[1].isupper()
            and label[0] in allowed_initials
        )
        return label if looks_like_degree else 'invalid'

    cleaned = column.apply(strip_punctuation)
    return cleaned.apply(keep_if_degree)
# NOTE: the original intent was to mark invalid degrees as NA so they can be
# dropped; normalize_degree currently labels them 'invalid' instead.
for frame in (train_data, test_data):
    frame['Degree'] = normalize_degree(frame['Degree'])

# Degrees seen at most 5 times in the training set are bucketed as 'other'
# in both datasets.
degree_counts = train_data['Degree'].value_counts()
rare_degrees = degree_counts[degree_counts <= 5].index


def _collapse_rare_degree(degree):
    """Return 'other' for degrees that are rare in the training set."""
    return 'other' if degree in rare_degrees else degree


for frame in (train_data, test_data):
    frame['Degree'] = frame['Degree'].apply(_collapse_rare_degree)


print(train_data['Degree'].value_counts(dropna = False))

In [None]:
# Distribution of 'Work/Study Hours' (NaN included).
print(train_data['Work/Study Hours'].value_counts(dropna=False))
# The triple-quoted string below holds disabled inspection calls; it is a
# plain string expression and none of the calls inside it are executed.
"""
train_data['Work/Study Hours'].value_counts(dropna=False)
train_data['Academic Pressure'].value_counts(dropna=False)
train_data['CGPA'].value_counts(dropna=False)
train_data['Profession'].value_counts(dropna=False)
train_data['Work Pressure'].value_counts(dropna=False)
train_data['Job Satisfaction'].value_counts(dropna=False)
train_data['Dietary Habits'].value_counts(dropna=False)
train_data['Financial Stress'].value_counts(dropna=False)
train_data['Degree'].value_counts(dropna=False)"""



We can identify two categories of individuals in our data, distinguishable by the attributes that apply to them:
- **Students:** academic pressure, CGPA, study satisfaction, degree
- **Worker Professionals:** work pressure, profession, job satisfaction

Our decision tree classifier requires numeric values. Thus, let's convert the two-valued categorical variables into 0/1.

In [None]:

# Encode the two-valued categorical columns as 0/1 in both datasets.
binary_encodings = {
    # Gender column: Male -> 1, Female -> 0
    'Gender': {'Male': 1, 'Female': 0},
    'Working Professional or Student': {'Working Professional': 1, 'Student': 0},
    # Have you ever had suicidal thoughts?
    'Have you ever had suicidal thoughts ?': {'Yes': 1, 'No': 0},
    'Family History of Mental Illness': {'Yes': 1, 'No': 0},
}

for frame in (train_data, test_data):
    for column, encoding in binary_encodings.items():
        frame[column] = frame[column].replace(encoding)

In [None]:
def _merge_role_features(frame):
    """Merge the student/worker twin columns and fill missing professions.

    - 'Satisfaction' is the row-wise mean of 'Job Satisfaction' and
      'Study Satisfaction' (NaNs skipped); 'Pressure' is built the same way
      from 'Work Pressure' and 'Academic Pressure'. The four source columns
      are dropped.
    - A missing 'Profession' becomes 'Student' when
      'Working Professional or Student' is 0, and 'other' otherwise.
    - 'CGPA' is dropped.

    Returns a new DataFrame.
    """
    merged = frame.assign(
        Satisfaction=frame[['Job Satisfaction', 'Study Satisfaction']].mean(axis=1, skipna=True),
        Pressure=frame[['Work Pressure', 'Academic Pressure']].mean(axis=1, skipna=True),
    ).drop(columns=['Job Satisfaction', 'Study Satisfaction', 'Work Pressure', 'Academic Pressure'])

    # Fill 'Profession' where it is empty, based on "Working Professional or Student"
    no_profession = merged['Profession'].isna()
    is_student = merged['Working Professional or Student'] == 0
    merged.loc[no_profession & is_student, 'Profession'] = 'Student'
    merged.loc[no_profession & ~is_student, 'Profession'] = 'other'

    return merged.drop(columns=['CGPA'])


train_data = _merge_role_features(train_data)
print(train_data[['Pressure']].head())

print(train_data[['Satisfaction']].head())

print(train_data[['Profession']].head())

test_data = _merge_role_features(test_data)


In [None]:
# Persist the cleaned/merged datasets (no index column).
for frame, destination in ((train_data, "train_merge.csv"), (test_data, "test_merge.csv")):
    frame.to_csv(destination, index=False)

In [None]:
# PRIORITY-BASED UNDERSAMPLING
#
# Every class larger than the smallest one is cut down to the smallest
# class size. Rows are kept in this order of preference:
#   1. rows with no NaN / "invalid" / "other" in the priority columns
#   2. rows containing "other"
#   3. rows containing "invalid"
#   4. rows with NaN in a priority column (last resort)
# Previously the NaN rows were discarded outright, so sampling from the
# "invalid" pool could fail when the first three pools together were smaller
# than the target size; pool 4 closes that gap.

from sklearn.utils import resample
import pandas as pd

X = train_data.drop("Depression", axis=1)
y = train_data["Depression"]

# Re-attach the target so that it ends up as the last column.
data = pd.concat([X, y], axis=1)

class_counts = data["Depression"].value_counts()
min_class_size = class_counts.min()

priority_cols = ["Profession", "Degree", "Dietary Habits"]

balanced_data = []

for label in class_counts.index:
    subset = data[data["Depression"] == label]

    if len(subset) > min_class_size:

        to_keep = min_class_size

        # Lowest-priority pool: rows with a missing priority column.
        na_mask = subset[priority_cols].isna().any(axis=1)
        na_rows = subset[na_mask]
        subset = subset[~na_mask]

        # First to be discarded: rows containing "invalid"
        invalid_rows = subset[subset[priority_cols].isin(["invalid"]).any(axis=1)]
        subset = subset.drop(invalid_rows.index)

        # Second to be discarded: rows containing "other"
        other_rows = subset[subset[priority_cols].isin(["other"]).any(axis=1)]
        subset = subset.drop(other_rows.index)

        if len(subset) >= to_keep:
            to_sample = subset.sample(to_keep, random_state=42)
        else:
            # Keep every clean row and top up from the lower-priority pools.
            rows_needed = to_keep - len(subset)

            if len(other_rows) >= rows_needed:
                to_sample = pd.concat([subset, other_rows.sample(rows_needed, random_state=42)])
            else:
                still_needed = rows_needed - len(other_rows)

                if len(invalid_rows) >= still_needed:
                    to_sample = pd.concat([
                        subset,
                        other_rows,
                        invalid_rows.sample(still_needed, random_state=42)
                    ])
                else:
                    # The four pools partition the class, which is larger
                    # than to_keep, so na_rows always has enough rows here.
                    na_needed = still_needed - len(invalid_rows)
                    to_sample = pd.concat([
                        subset,
                        other_rows,
                        invalid_rows,
                        na_rows.sample(na_needed, random_state=42)
                    ])
    else:
        # Smallest class (or a tie): keep every row.
        to_sample = subset

    balanced_data.append(to_sample)

undersampled_data = pd.concat(balanced_data)

undersampled_data.to_csv("final_train_dataset.csv", index=False)


In [None]:
# Continue with a copy of the undersampled rows as the working training set.
train_data = undersampled_data.copy()

In [None]:
# Sanity check: first rows of the merged 'Satisfaction' column.
print(train_data[['Satisfaction']].head())

In [None]:
import pandas as pd
# Reload the merged training data from disk and preview it.
# NOTE(review): this reads "train_merge.csv", which was written before the
# undersampling step, not "final_train_dataset.csv" — confirm this is intended.
df = pd.read_csv("train_merge.csv")
print(df.columns)
print(df.head())


ALGORITHMS

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
import numpy as np


# Work on fresh frames so the originals stay untouched, leaving out the
# identifier columns that are not used as features.
data = df.drop(columns=['id', 'Name'])
data_test = test_data.drop(columns=['id', 'Name'])

# All training columns that hold strings.
categorical_cols_train = data.select_dtypes(include=['object']).columns

# Turn the string columns into integer codes, keeping one fitted encoder per
# column so the test set can reuse the same mapping.
label_encoders = {}
for col in categorical_cols_train:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col].astype(str))
    label_encoders[col] = le

for col in categorical_cols_train:
    if col not in data_test.columns:
        continue

    le = label_encoders[col]
    known_labels = set(le.classes_)

    # Stringify BEFORE checking membership, exactly as was done when fitting.
    # Otherwise a missing test value is compared as a float NaN and becomes
    # 'unknown', while the same missing value in training was encoded as
    # the string 'nan'.
    test_labels = data_test[col].astype(str)
    test_labels = test_labels.where(test_labels.isin(known_labels), 'unknown')

    # LabelEncoder rejects labels it was not fitted on, so register 'unknown'.
    # NOTE(review): the appended label is not in sorted position — confirm
    # transform() tolerates this with the installed scikit-learn version.
    if 'unknown' not in known_labels:
        le.classes_ = np.append(le.classes_, 'unknown')

    data_test[col] = le.transform(test_labels)


# Split the training frame into features and target.
y_train = data['Depression']  # target
X_train = data.drop(columns='Depression')  # features used to predict depression

# Same feature columns for the test set, with every column forced to
# numeric (values that cannot be converted become NaN).
X_test = data_test[X_train.columns].apply(pd.to_numeric, errors='coerce')

# Replace missing values with the per-column mean learned on the training set.
imputer = SimpleImputer(strategy='mean')
train_data_imputed = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
test_data_imputed = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

#print(X_test.dtypes)
#print(X_test.isna().sum())  # ver se ainda há NaNs



# Hold out 20% of the training data for evaluation.
X_trainSplit, X_testSplit, y_trainSplit, y_testSplit = train_test_split(
    train_data_imputed, y_train, test_size=0.2, random_state=42
)

# Standardise the evaluation split (used to compute the metrics).
split_scaler = StandardScaler().fit(X_trainSplit)
X_train_scaled = split_scaler.transform(X_trainSplit)
X_test_scaled = split_scaler.transform(X_testSplit)

# Standardise the full training set and the real test set (used for the
# final predictions). Features must share one scale: age (0-100) and
# gender (0 or 1) cannot be compared otherwise.
scaler = StandardScaler().fit(train_data_imputed)
X_train_scaled_real = scaler.transform(train_data_imputed)
X_test_scaled_real = scaler.transform(test_data_imputed)

#X_train_scaled[:5]


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Build the model and train it on the scaled training split
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_trainSplit)

# Predict on the held-out split
y_pred = knn.predict(X_test_scaled)

# Evaluate performance
class_report = classification_report(y_testSplit, y_pred)
conf_matrix = confusion_matrix(y_testSplit, y_pred)
accuracy = accuracy_score(y_testSplit, y_pred)

class_report


graphics / confusion matrix

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import ConfusionMatrixDisplay, roc_curve, auc

# Confusion matrix heatmap (rows = actual class, columns = predicted class)
plt.figure(figsize=(6, 5))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', 
            xticklabels=['No Depression', 'Depression'],
            yticklabels=['No Depression', 'Depression'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix - KNN')
plt.tight_layout()
plt.show()

# Predicted probability of the positive class (column 1), used to build the ROC curve.
y_prob = knn.predict_proba(X_test_scaled)[:, 1]
fpr, tpr, thresholds = roc_curve(y_testSplit, y_prob)
roc_auc = auc(fpr, tpr)

# ROC Curve, with the diagonal drawn as the chance-level reference
plt.figure(figsize=(7, 5))
plt.plot(fpr, tpr, label=f'KNN (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - KNN')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


Varying the value of k

In [None]:
import numpy as np

# Try several values of k (1 to 10) and record the accuracy on the held-out split
k_values = range(1, 11)
accuracies = [
    accuracy_score(
        y_testSplit,
        KNeighborsClassifier(n_neighbors=k)
        .fit(X_train_scaled, y_trainSplit)
        .predict(X_test_scaled),
    )
    for k in k_values
]

# Plot accuracy against the number of neighbours (k)
plt.figure(figsize=(8, 5))
plt.plot(k_values, accuracies, marker='o')
plt.title('KNN Accuracy for Different Values of k')
plt.xlabel('Number of Neighbors (k)')
plt.ylabel('Accuracy')
plt.xticks(k_values)
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score

# Per-class precision, recall and F1, treating each label in turn as the
# positive class, collected into one table.
class_labels = (0, 1)

metrics_df = pd.DataFrame({
    'Class': ['No Depression (0)', 'Depression (1)'],
    'Precision': [precision_score(y_testSplit, y_pred, pos_label=c) for c in class_labels],
    'Recall': [recall_score(y_testSplit, y_pred, pos_label=c) for c in class_labels],
    'F1-score': [f1_score(y_testSplit, y_pred, pos_label=c) for c in class_labels],
    # Support = number of true samples per class (row total of the confusion matrix)
    'Support': [conf_matrix[c, 0] + conf_matrix[c, 1] for c in class_labels],
})

print(metrics_df)



In [None]:
# Predict the real (unlabelled) test set

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Train on the full scaled training set this time
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled_real, y_train)

# Predict the test rows
y_pred = knn.predict(X_test_scaled_real)

# Show the predictions
print("Previsões do modelo:")
print(y_pred)