In [None]:
# !pip install missingno



## Librerías

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from missingno import matrix
import missingno as msno

## Carga de datos

In [12]:
df = pd.read_csv(r'C:\Users\nuria\OneDrive\Escritorio\ML_student_depression\data\raw\Student Depression Dataset.csv')
df.head()

Unnamed: 0,id,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,2,Male,33.0,Visakhapatnam,Student,5.0,0.0,8.97,2.0,0.0,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1
1,8,Female,24.0,Bangalore,Student,2.0,0.0,5.9,5.0,0.0,5-6 hours,Moderate,BSc,No,3.0,2.0,Yes,0
2,26,Male,31.0,Srinagar,Student,3.0,0.0,7.03,5.0,0.0,Less than 5 hours,Healthy,BA,No,9.0,1.0,Yes,0
3,30,Female,28.0,Varanasi,Student,3.0,0.0,5.59,2.0,0.0,7-8 hours,Moderate,BCA,Yes,4.0,5.0,Yes,1
4,32,Female,25.0,Jaipur,Student,4.0,0.0,8.13,3.0,0.0,5-6 hours,Moderate,M.Tech,Yes,1.0,1.0,No,0


## Tratamiento de missings

In [13]:
print(df.isnull().sum())

id                                       0
Gender                                   0
Age                                      0
City                                     0
Profession                               0
Academic Pressure                        0
Work Pressure                            0
CGPA                                     0
Study Satisfaction                       0
Job Satisfaction                         0
Sleep Duration                           0
Dietary Habits                           0
Degree                                   0
Have you ever had suicidal thoughts ?    0
Work/Study Hours                         0
Financial Stress                         3
Family History of Mental Illness         0
Depression                               0
dtype: int64


In [14]:
# Imputar los valores nan a la mediana
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')
df['Financial Stress'] = imputer.fit_transform(df[['Financial Stress']])
df.head()

Unnamed: 0,id,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,2,Male,33.0,Visakhapatnam,Student,5.0,0.0,8.97,2.0,0.0,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1
1,8,Female,24.0,Bangalore,Student,2.0,0.0,5.9,5.0,0.0,5-6 hours,Moderate,BSc,No,3.0,2.0,Yes,0
2,26,Male,31.0,Srinagar,Student,3.0,0.0,7.03,5.0,0.0,Less than 5 hours,Healthy,BA,No,9.0,1.0,Yes,0
3,30,Female,28.0,Varanasi,Student,3.0,0.0,5.59,2.0,0.0,7-8 hours,Moderate,BCA,Yes,4.0,5.0,Yes,1
4,32,Female,25.0,Jaipur,Student,4.0,0.0,8.13,3.0,0.0,5-6 hours,Moderate,M.Tech,Yes,1.0,1.0,No,0


In [15]:
print(df.isnull().sum())

id                                       0
Gender                                   0
Age                                      0
City                                     0
Profession                               0
Academic Pressure                        0
Work Pressure                            0
CGPA                                     0
Study Satisfaction                       0
Job Satisfaction                         0
Sleep Duration                           0
Dietary Habits                           0
Degree                                   0
Have you ever had suicidal thoughts ?    0
Work/Study Hours                         0
Financial Stress                         0
Family History of Mental Illness         0
Depression                               0
dtype: int64


## Tratamiento de outliers

In [16]:
# Identificar los outliers usando el método IQR
Q1 = df['Age'].quantile(0.25)
Q3 = df['Age'].quantile(0.75)
IQR = Q3 - Q1

# Definir los límites para considerar un outlier
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

'''Imputación de outliers de la columna Age a la moda'''

# Calcular la moda de la columna Age
mode_age = df['Age'].mode()[0]  # Obtenemos el primer valor de la moda

# Imputar los outliers a la moda
df['Age'] = df['Age'].apply(lambda x: mode_age if x < lower_bound or x > upper_bound else x)

In [17]:
# Calcular la media de la columna CGPA
mean_cgpa = df['CGPA'].mean()

'''Imputación de outliers de la columna CGPA a la media'''

# Imputar los outliers a la media
df['CGPA'] = df['CGPA'].apply(lambda x: mean_cgpa if x < lower_bound or x > upper_bound else x)

## Feature engineering

In [18]:
df['Degree'].unique()

array(['B.Pharm', 'BSc', 'BA', 'BCA', 'M.Tech', 'PhD', 'Class 12', 'B.Ed',
       'LLB', 'BE', 'M.Ed', 'MSc', 'BHM', 'M.Pharm', 'MCA', 'MA', 'B.Com',
       'MD', 'MBA', 'MBBS', 'M.Com', 'B.Arch', 'LLM', 'B.Tech', 'BBA',
       'ME', 'MHM', 'Others'], dtype=object)

In [19]:
# One-Hot Encoding para crear columnas binarias a partir de la columna 'Degree'

df_one_hot = pd.get_dummies(df['Degree'], prefix='Degree')
df = pd.concat([df, df_one_hot], axis=1)
df.head()

Unnamed: 0,id,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,...,Degree_MA,Degree_MBA,Degree_MBBS,Degree_MCA,Degree_MD,Degree_ME,Degree_MHM,Degree_MSc,Degree_Others,Degree_PhD
0,2,Male,33.0,Visakhapatnam,Student,5.0,0.0,8.97,2.0,0.0,...,False,False,False,False,False,False,False,False,False,False
1,8,Female,24.0,Bangalore,Student,2.0,0.0,7.656104,5.0,0.0,...,False,False,False,False,False,False,False,False,False,False
2,26,Male,31.0,Srinagar,Student,3.0,0.0,7.656104,5.0,0.0,...,False,False,False,False,False,False,False,False,False,False
3,30,Female,28.0,Varanasi,Student,3.0,0.0,7.656104,2.0,0.0,...,False,False,False,False,False,False,False,False,False,False
4,32,Female,25.0,Jaipur,Student,4.0,0.0,8.13,3.0,0.0,...,False,False,False,False,False,False,False,False,False,False


In [20]:
df.columns

Index(['id', 'Gender', 'Age', 'City', 'Profession', 'Academic Pressure',
       'Work Pressure', 'CGPA', 'Study Satisfaction', 'Job Satisfaction',
       'Sleep Duration', 'Dietary Habits', 'Degree',
       'Have you ever had suicidal thoughts ?', 'Work/Study Hours',
       'Financial Stress', 'Family History of Mental Illness', 'Depression',
       'Degree_B.Arch', 'Degree_B.Com', 'Degree_B.Ed', 'Degree_B.Pharm',
       'Degree_B.Tech', 'Degree_BA', 'Degree_BBA', 'Degree_BCA', 'Degree_BE',
       'Degree_BHM', 'Degree_BSc', 'Degree_Class 12', 'Degree_LLB',
       'Degree_LLM', 'Degree_M.Com', 'Degree_M.Ed', 'Degree_M.Pharm',
       'Degree_M.Tech', 'Degree_MA', 'Degree_MBA', 'Degree_MBBS', 'Degree_MCA',
       'Degree_MD', 'Degree_ME', 'Degree_MHM', 'Degree_MSc', 'Degree_Others',
       'Degree_PhD'],
      dtype='object')

In [None]:
# Estandarización de los datos de la columna             dado que el rango de los datos es amplio

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df['Degree_Scaled'] = scaler.fit_transform(df[['Degree']])


## Eliminación de features

In [None]:
print(df.shape)
print('-----------------------------------------------------------------------------------')
print(df.columns)
print('-----------------------------------------------------------------------------------')
print(df.info())
print('-----------------------------------------------------------------------------------')

In [None]:
from sklearn.preprocessing import LabelEncoder
encode = LabelEncoder()
df_categorical =[]
for column in df.columns:
    if df[column].dtypes=="object":
        df_categorical.append(column)
for column in df_categorical: 
    df[column] = encode.fit_transform(df[column])

In [None]:
# Guarda el DataFrame modificado en un nuevo archivo CSV
df.to_csv("dataset_limpio_FE.csv", index=False)