In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Lendo o DataFrame
df = pd.read_csv('final_depression_dataset_1.csv')
df.columns

Index(['Name', 'Gender', 'Age', 'City', 'Working Professional or Student',
       'Profession', 'Academic Pressure', 'Work Pressure', 'CGPA',
       'Study Satisfaction', 'Job Satisfaction', 'Sleep Duration',
       'Dietary Habits', 'Degree', 'Have you ever had suicidal thoughts ?',
       'Work/Study Hours', 'Financial Stress',
       'Family History of Mental Illness', 'Depression'],
      dtype='object')

In [3]:
# Convertendo as colunas para o melhor tipo e arrumando os espaços nos nomes
for col in df.columns:
    df[col].convert_dtypes()
    
    df.rename(columns={
        col: col.lower().replace(" ", "_")
    }, inplace=True)

In [4]:
# Informações da coluna, assim como as informações por trás de cada tipagem
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2556 entries, 0 to 2555
Data columns (total 19 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   name                                   2556 non-null   object 
 1   gender                                 2556 non-null   object 
 2   age                                    2556 non-null   int64  
 3   city                                   2556 non-null   object 
 4   working_professional_or_student        2556 non-null   object 
 5   profession                             1883 non-null   object 
 6   academic_pressure                      502 non-null    float64
 7   work_pressure                          2054 non-null   float64
 8   cgpa                                   502 non-null    float64
 9   study_satisfaction                     502 non-null    float64
 10  job_satisfaction                       2054 non-null   float64
 11  slee

In [5]:
# Pegando as cinco primeiras linhas
df.head()

Unnamed: 0,name,gender,age,city,working_professional_or_student,profession,academic_pressure,work_pressure,cgpa,study_satisfaction,job_satisfaction,sleep_duration,dietary_habits,degree,have_you_ever_had_suicidal_thoughts_?,work/study_hours,financial_stress,family_history_of_mental_illness,depression
0,Pooja,Female,37,Ghaziabad,Working Professional,Teacher,,2.0,,,4.0,7-8 hours,Moderate,MA,No,6,2,No,No
1,Reyansh,Male,60,Kalyan,Working Professional,Financial Analyst,,4.0,,,3.0,5-6 hours,Unhealthy,B.Com,Yes,0,4,Yes,No
2,Manvi,Female,42,Bhopal,Working Professional,Teacher,,2.0,,,3.0,5-6 hours,Moderate,M.Com,No,0,2,No,No
3,Isha,Female,44,Thane,Working Professional,Teacher,,3.0,,,5.0,7-8 hours,Healthy,MD,Yes,1,2,Yes,No
4,Aarav,Male,48,Indore,Working Professional,UX/UI Designer,,4.0,,,3.0,7-8 hours,Moderate,BE,Yes,6,5,Yes,No


In [6]:
# Fazendo a contagem atual de quais valores temos na coluna de 
df['working_professional_or_student'].value_counts()

working_professional_or_student
Working Professional    2054
Student                  502
Name: count, dtype: int64

In [7]:
df['working_professional_or_student'].replace({
    "Working Professional": 1,
    "Student": 0
}, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['working_professional_or_student'].replace({
  df['working_professional_or_student'].replace({


In [8]:
df.rename(columns={
    "working_professional_or_student": "isWorking"
})

Unnamed: 0,name,gender,age,city,isWorking,profession,academic_pressure,work_pressure,cgpa,study_satisfaction,job_satisfaction,sleep_duration,dietary_habits,degree,have_you_ever_had_suicidal_thoughts_?,work/study_hours,financial_stress,family_history_of_mental_illness,depression
0,Pooja,Female,37,Ghaziabad,1,Teacher,,2.0,,,4.0,7-8 hours,Moderate,MA,No,6,2,No,No
1,Reyansh,Male,60,Kalyan,1,Financial Analyst,,4.0,,,3.0,5-6 hours,Unhealthy,B.Com,Yes,0,4,Yes,No
2,Manvi,Female,42,Bhopal,1,Teacher,,2.0,,,3.0,5-6 hours,Moderate,M.Com,No,0,2,No,No
3,Isha,Female,44,Thane,1,Teacher,,3.0,,,5.0,7-8 hours,Healthy,MD,Yes,1,2,Yes,No
4,Aarav,Male,48,Indore,1,UX/UI Designer,,4.0,,,3.0,7-8 hours,Moderate,BE,Yes,6,5,Yes,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2551,Raghavendra,Male,25,Bangalore,1,Consultant,,1.0,,,5.0,5-6 hours,Healthy,BBA,Yes,12,3,Yes,No
2552,Pihu,Female,23,Pune,1,Teacher,,3.0,,,1.0,Less than 5 hours,Moderate,MA,Yes,8,3,No,Yes
2553,Sara,Female,24,Srinagar,1,HR Manager,,1.0,,,4.0,Less than 5 hours,Moderate,BA,Yes,4,4,No,No
2554,Eshita,Female,56,Bangalore,1,Business Analyst,,2.0,,,3.0,7-8 hours,Healthy,BBA,No,4,5,Yes,No


In [9]:
df['academic_pressure'].fillna(0, inplace=True)

df['work_pressure'].fillna(0, inplace=True)

df['study_satisfaction'].fillna(0, inplace=True)

df['job_satisfaction'].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['academic_pressure'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['work_pressure'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values a

In [25]:
df['general_pressure'] = df['academic_pressure'] + df['work_pressure']
df['general_satisfaction'] = df['job_satisfaction'] + df['study_satisfaction']
toDrop = ['academic_pressure', 'work_pressure', 'job_satisfaction', 'study_satisfaction']

df.drop(columns=toDrop)

Unnamed: 0,name,gender,age,city,working_professional_or_student,profession,cgpa,sleep_duration,dietary_habits,degree,have_you_ever_had_suicidal_thoughts_?,work/study_hours,financial_stress,family_history_of_mental_illness,depression,general_pressure,general_satisfaction
0,Pooja,Female,37,Ghaziabad,1,Teacher,,4,0,1,0,6,2,No,No,2.0,4.0
1,Reyansh,Male,60,Kalyan,1,Financial Analyst,,2,-1,0,1,0,4,Yes,No,4.0,3.0
2,Manvi,Female,42,Bhopal,1,Teacher,,2,0,1,0,0,2,No,No,2.0,3.0
3,Isha,Female,44,Thane,1,Teacher,,4,1,1,1,1,2,Yes,No,3.0,5.0
4,Aarav,Male,48,Indore,1,UX/UI Designer,,4,0,0,1,6,5,Yes,No,4.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2551,Raghavendra,Male,25,Bangalore,1,Consultant,,2,1,0,1,12,3,Yes,No,1.0,5.0
2552,Pihu,Female,23,Pune,1,Teacher,,1,0,1,1,8,3,No,Yes,3.0,1.0
2553,Sara,Female,24,Srinagar,1,HR Manager,,1,0,0,1,4,4,No,No,1.0,4.0
2554,Eshita,Female,56,Bangalore,1,Business Analyst,,4,1,0,0,4,5,Yes,No,2.0,3.0


In [11]:
df.head()

Unnamed: 0,name,gender,age,city,working_professional_or_student,profession,academic_pressure,work_pressure,cgpa,study_satisfaction,...,sleep_duration,dietary_habits,degree,have_you_ever_had_suicidal_thoughts_?,work/study_hours,financial_stress,family_history_of_mental_illness,depression,general_pressure,general_satisfaction
0,Pooja,Female,37,Ghaziabad,1,Teacher,0.0,2.0,,0.0,...,7-8 hours,Moderate,MA,No,6,2,No,No,2.0,4.0
1,Reyansh,Male,60,Kalyan,1,Financial Analyst,0.0,4.0,,0.0,...,5-6 hours,Unhealthy,B.Com,Yes,0,4,Yes,No,4.0,3.0
2,Manvi,Female,42,Bhopal,1,Teacher,0.0,2.0,,0.0,...,5-6 hours,Moderate,M.Com,No,0,2,No,No,2.0,3.0
3,Isha,Female,44,Thane,1,Teacher,0.0,3.0,,0.0,...,7-8 hours,Healthy,MD,Yes,1,2,Yes,No,3.0,5.0
4,Aarav,Male,48,Indore,1,UX/UI Designer,0.0,4.0,,0.0,...,7-8 hours,Moderate,BE,Yes,6,5,Yes,No,4.0,3.0


In [12]:
df.isna().mean()

name                                     0.000000
gender                                   0.000000
age                                      0.000000
city                                     0.000000
working_professional_or_student          0.000000
profession                               0.263302
academic_pressure                        0.000000
work_pressure                            0.000000
cgpa                                     0.803599
study_satisfaction                       0.000000
job_satisfaction                         0.000000
sleep_duration                           0.000000
dietary_habits                           0.000000
degree                                   0.000000
have_you_ever_had_suicidal_thoughts_?    0.000000
work/study_hours                         0.000000
financial_stress                         0.000000
family_history_of_mental_illness         0.000000
depression                               0.000000
general_pressure                         0.000000


In [13]:
df['sleep_duration'].replace({
    "7-8 hours": 4,
    "Less than 5 hours": 1,
    "5-6 hours": 2,
    "More than 8 hours": 3
}, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['sleep_duration'].replace({
  df['sleep_duration'].replace({


In [14]:
df['sleep_duration'].value_counts()

sleep_duration
4    658
1    648
2    628
3    622
Name: count, dtype: int64

In [15]:
df['degree'].value_counts()

degree
Class 12    275
B.Com       115
B.Ed        112
MCA         108
BCA         103
MSc          95
MBA          95
BSc          94
BBA          92
BHM          90
BA           89
B.Arch       89
B.Pharm      88
M.Tech       85
M.Pharm      85
BE           84
ME           84
LLM          84
LLB          82
M.Ed         81
MHM          81
PhD          81
MA           79
MBBS         75
MD           74
B.Tech       71
M.Com        65
Name: count, dtype: int64

In [16]:
df['degree'].replace({
    "M.Com": "Master",
    "MD": 'Master',
    "MBBS": "Master",
    "MA": "Master",
    "MHM": "Master",
    "M.Ed": "Master",
    "ME": 'Master',
    "M.Pharm": "Master",
    "M.Tech": "Master",
    "MBA": "Master",
    "MSc": "Master",
    "MCA": "Master",
    "LLM": "Master",
    "PhD": "Master",
    
    # Masters above, Bacharel below
    
    "B.Tech": "Bacharel",
    "LLB": "Bacharel",
    "BE": "Bacharel",
    "B.Pharm": "Bacharel",
    "B.Arch": "Bacharel",
    "BA": "Bacharel",
    "BHM": "Bacharel",
    "BBA": "Bacharel",
    "BSc": "Bacharel",
    "BCA": "Bacharel",
    "B.Ed": "Bacharel",
    "B.Com": "Bacharel",
    "Class 12": "Bacharel"
}, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['degree'].replace({


In [17]:
df['degree'].replace({
    "Master": 1,
    "Bacharel": 0
}, inplace=True)

  df['degree'].replace({


In [18]:
df['degree'].value_counts()

degree
0    1384
1    1172
Name: count, dtype: int64

In [19]:
df['work/study_hours'].value_counts()

work/study_hours
10    218
9     210
2     206
11    205
0     204
5     197
6     195
3     192
1     191
12    191
7     186
4     186
8     175
Name: count, dtype: int64

In [20]:
df['have_you_ever_had_suicidal_thoughts_?'].replace({
    "Yes": True,
    "No": False
}, inplace=True)

df['have_you_ever_had_suicidal_thoughts_?'] = df['have_you_ever_had_suicidal_thoughts_?'].astype(int)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['have_you_ever_had_suicidal_thoughts_?'].replace({
  df['have_you_ever_had_suicidal_thoughts_?'].replace({


In [21]:
df['have_you_ever_had_suicidal_thoughts_?'].value_counts()

have_you_ever_had_suicidal_thoughts_?
0    1307
1    1249
Name: count, dtype: int64

In [22]:
df['dietary_habits'].value_counts()

dietary_habits
Unhealthy    882
Healthy      842
Moderate     832
Name: count, dtype: int64

In [23]:
df['dietary_habits'] = df['dietary_habits'].replace({
    'Unhealthy': -1,
    'Healthy': 1,
    'Moderate': 0
})

  df['dietary_habits'] = df['dietary_habits'].replace({


In [26]:
df['general_satisfaction'].value_counts()

general_satisfaction
2.0    531
5.0    528
4.0    508
3.0    507
1.0    482
Name: count, dtype: int64

In [27]:
df.to_csv('halfTreatedDepression.csv')