## Load Data

In [1]:
import pandas as pd

train_data = pd.read_csv("data/train.csv")
test_data = pd.read_csv("data/test.csv")

## Data Preprocessing

In [2]:
train_data

Unnamed: 0,id,Name,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,0,Aaradhya,Female,49.0,Ludhiana,Working Professional,Chef,,5.0,,,2.0,More than 8 hours,Healthy,BHM,No,1.0,2.0,No,0
1,1,Vivan,Male,26.0,Varanasi,Working Professional,Teacher,,4.0,,,3.0,Less than 5 hours,Unhealthy,LLB,Yes,7.0,3.0,No,1
2,2,Yuvraj,Male,33.0,Visakhapatnam,Student,,5.0,,8.97,2.0,,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1
3,3,Yuvraj,Male,22.0,Mumbai,Working Professional,Teacher,,5.0,,,1.0,Less than 5 hours,Moderate,BBA,Yes,10.0,1.0,Yes,1
4,4,Rhea,Female,30.0,Kanpur,Working Professional,Business Analyst,,1.0,,,1.0,5-6 hours,Unhealthy,BBA,Yes,9.0,4.0,Yes,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140695,140695,Vidya,Female,18.0,Ahmedabad,Working Professional,,,5.0,,,4.0,5-6 hours,Unhealthy,Class 12,No,2.0,4.0,Yes,1
140696,140696,Lata,Female,41.0,Hyderabad,Working Professional,Content Writer,,5.0,,,4.0,7-8 hours,Moderate,B.Tech,Yes,6.0,5.0,Yes,0
140697,140697,Aanchal,Female,24.0,Kolkata,Working Professional,Marketing Manager,,3.0,,,1.0,More than 8 hours,Moderate,B.Com,No,4.0,4.0,No,0
140698,140698,Prachi,Female,49.0,Srinagar,Working Professional,Plumber,,5.0,,,2.0,5-6 hours,Moderate,ME,Yes,10.0,1.0,No,0


In [3]:
train_data.columns

Index(['id', 'Name', 'Gender', 'Age', 'City',
       'Working Professional or Student', 'Profession', 'Academic Pressure',
       'Work Pressure', 'CGPA', 'Study Satisfaction', 'Job Satisfaction',
       'Sleep Duration', 'Dietary Habits', 'Degree',
       'Have you ever had suicidal thoughts ?', 'Work/Study Hours',
       'Financial Stress', 'Family History of Mental Illness', 'Depression'],
      dtype='object')

In [4]:
train_data.shape, test_data.shape

((140700, 20), (93800, 19))

In [5]:
train_y = train_data.pop("Depression")

### Combine both Train data & Test data into single dataframe for easier preprocessing & cleaning

In [6]:
df = pd.concat([train_data, test_data])
df.shape

(234500, 19)

In [7]:
df = df.rename(
    columns={
        "Have you ever had suicidal thoughts ?": "Suicidal Thoughts",
        "Family History of Mental Illness": "Family History"
    }
)

Working Professional or Student is redundant as we've the Profession column  
Updating Profession as Student for students.

In [8]:
df.loc[df["Working Professional or Student"] == "Student", "Profession"] = "Student"

In [9]:
df.isna()

Unnamed: 0,id,Name,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Suicidal Thoughts,Work/Study Hours,Financial Stress,Family History
0,False,False,False,False,False,False,False,True,False,True,True,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,True,False,True,True,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,True,False,True,True,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,True,False,True,True,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93795,False,False,False,False,False,False,False,True,False,True,True,False,False,False,False,False,False,False,False
93796,False,False,False,False,False,False,False,True,False,True,True,False,False,False,False,False,False,False,False
93797,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False
93798,False,False,False,False,False,False,False,True,False,True,True,False,False,False,False,False,False,False,False


In [10]:
df["CGPA"].fillna(-1, inplace=True)
df["Degree"].fillna("Other", inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["CGPA"].fillna(-1, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Degree"].fillna("Other", inplace=True)


Academic Pressure & Study Satisfaction are only available for Students, These can be update for Work Pressure & Job Satisfaction

In [11]:
df["Work Pressure"].fillna(df["Academic Pressure"], inplace=True)
df["Job Satisfaction"].fillna(df["Study Satisfaction"], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Work Pressure"].fillna(df["Academic Pressure"], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Job Satisfaction"].fillna(df["Study Satisfaction"], inplace=True)


Marking the features which have less than 5 examples to Other

In [42]:
def mark_value_as_other(feature: str, threshold: int, value_to_update: str = "Other"):
    print(f"Updating for Feature: {feature}")
    values = df[feature].value_counts()
    df[feature] = df[feature].replace(values[values < threshold].index, value_to_update)

for feature in ["Profession", "Degree", "City", "Dietary Habits"]:
    mark_value_as_other(feature=feature, threshold=5)

Updating for Feature: Profession
Updating for Feature: Degree
Updating for Feature: City
Updating for Feature: Dietary Habits


In [13]:
df.loc[df["Degree"] == "M"]

Unnamed: 0,id,Name,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Suicidal Thoughts,Work/Study Hours,Financial Stress,Family History
20659,20659,Nishant,Male,19.0,Kolkata,Student,Student,3.0,3.0,7.53,1.0,1.0,Less than 5 hours,Moderate,M,No,0.0,5.0,Yes
55299,55299,Shlok,Male,52.0,Meerut,Working Professional,Content Writer,,3.0,-1.0,,4.0,More than 8 hours,Healthy,M,No,7.0,5.0,No
111586,111586,Shrey,Male,43.0,Indore,Working Professional,,,2.0,-1.0,,5.0,7-8 hours,Healthy,M,Yes,11.0,2.0,Yes
44486,185186,Aishwarya,Female,26.0,Ludhiana,Student,Student,4.0,4.0,5.64,3.0,3.0,More than 8 hours,Healthy,M,No,6.0,2.0,No
46769,187469,Vivan,Male,28.0,Nashik,Student,Student,4.0,4.0,8.91,1.0,1.0,5-6 hours,Healthy,M,Yes,6.0,3.0,No


Convert to Numeric values

In [14]:
df["Work/Study Hours"] = pd.to_numeric(df["Work/Study Hours"], errors="coerce")

In [15]:
df["Suicidal Thoughts"] = df["Suicidal Thoughts"].map({"Yes": 1, "No": 0})

In [16]:
df["Family History"] = df["Family History"].map({"No": 0, "Yes": 1})

In [17]:
df["Gender"] = df["Gender"].map({"Female": 0, "Male": 1})

In [18]:
df["Sleep Duration"] = df["Sleep Duration"].map({
    "3-4 hours": 4,
    "4-5 hours": 4,
    "Less than 5 hours": 4,
    "5-6 hours": 5,
    "6-7 hours": 6,
    "7-8 hours": 7,
    "More than 8 hours": 8
})

Remove the columns which are not required

In [19]:
df = df.drop(["id", "Working Professional or Student", "Academic Pressure", "Study Satisfaction"], axis=1).reset_index(drop=True)

Mark whether Job is Desk type or Field Work

In [20]:
office_based = {
    'Chef': False,
    'Teacher': False,
    'Student': False,
    'Business Analyst': True,
    'Financial Analyst': True,
    'Chemist': False,
    'Electrician': False,
    'Software Engineer': True,
    'Data Scientist': True,
    'Plumber': False,
    'Marketing Manager': True,
    'Accountant': True,
    'Entrepreneur': True,  # can be True too
    'HR Manager': True,
    'UX/UI Designer': True,
    'Content Writer': True,
    'Educational Consultant': True,
    'Civil Engineer': False,
    'Manager': True,
    'Pharmacist': False,
    'Architect': True,
    'Mechanical Engineer': False,
    'Customer Support': True,
    'Consultant': True,
    'Judge': False,
    'Researcher': True,
    'Pilot': False,
    'Graphic Designer': True,
    'Travel Consultant': True,
    'Digital Marketer': True,
    'Lawyer': False, # can be True too
    'Research Analyst': True,
    'Sales Executive': True,
    'Doctor': False,
    'Unemployed': False,
    'Investment Banker': True,
    'Other': False,
    'Academic': True,
    'Working Professional': True
}
df["Desk Job"] = df["Profession"].map(office_based)

In [21]:
df.sample(10)

Unnamed: 0,Name,Gender,Age,City,Profession,Work Pressure,CGPA,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Suicidal Thoughts,Work/Study Hours,Financial Stress,Family History,Desk Job
141643,Kunal,1,21.0,Other,Student,5.0,9.96,3.0,8.0,Healthy,BCA,1,10.0,3.0,1,False
16537,Kavya,0,37.0,Faridabad,Teacher,5.0,-1.0,3.0,8.0,Unhealthy,LLB,0,6.0,2.0,0,False
66994,Hrithik,1,37.0,Kalyan,Judge,2.0,-1.0,2.0,4.0,Moderate,LLB,1,5.0,4.0,0,False
105677,Tanisha,0,58.0,Pune,Teacher,1.0,-1.0,3.0,4.0,Moderate,BA,1,9.0,5.0,0,False
41964,Kriti,0,24.0,Thane,Student,5.0,8.94,4.0,7.0,Healthy,Class 12,0,11.0,4.0,0,False
215520,Harshil,1,39.0,Thane,Travel Consultant,5.0,-1.0,4.0,8.0,Moderate,MHM,0,5.0,3.0,0,True
156683,Ayaan,1,31.0,Chennai,Pharmacist,4.0,-1.0,2.0,4.0,Healthy,B.Pharm,0,8.0,5.0,0,False
71738,Naina,0,56.0,Thane,Teacher,1.0,-1.0,4.0,5.0,Moderate,LLM,1,3.0,2.0,1,False
101640,Anvi,0,48.0,Vadodara,,1.0,-1.0,1.0,5.0,Healthy,Class 12,0,9.0,4.0,1,
127197,Tanisha,0,57.0,Mumbai,Architect,1.0,-1.0,3.0,7.0,Moderate,B.Arch,1,10.0,5.0,0,True


## Data Visualization

In [43]:
val = df["Dietary Habits"].value_counts()

val[val < 10].index

Index(['No'], dtype='object', name='Dietary Habits')

In [45]:
df.loc[df["Dietary Habits"] == "Other"]

Unnamed: 0,Name,Gender,Age,City,Profession,Work Pressure,CGPA,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Suicidal Thoughts,Work/Study Hours,Financial Stress,Family History,Desk Job
2943,Naina,0,24.0,Patna,Teacher,3.0,-1.0,2.0,4.0,Other,LLB,1,3.0,5.0,1,False
4495,Aarav,1,31.0,Vadodara,Researcher,4.0,-1.0,2.0,7.0,Other,MCA,1,5.0,4.0,0,True
20779,Nikita,0,39.0,Mumbai,Financial Analyst,3.0,-1.0,2.0,7.0,Other,MBA,0,6.0,1.0,1,True
21736,Divya,0,24.0,Hyderabad,Teacher,5.0,-1.0,2.0,8.0,Other,BCA,1,5.0,5.0,0,False
28610,Aakash,1,23.0,Meerut,Student,3.0,9.98,5.0,4.0,Other,B.Com,1,4.0,5.0,0,False
31489,Harshil,1,43.0,Srinagar,Lawyer,1.0,-1.0,3.0,8.0,Other,LLM,1,11.0,5.0,0,False
34623,Dev,1,18.0,Vadodara,Student,1.0,6.05,5.0,7.0,Other,Class 12,0,4.0,4.0,0,False
41636,Pallavi,0,21.0,Jaipur,Student,2.0,7.28,4.0,4.0,Other,B.Pharm,1,0.0,3.0,1,False
42616,Shreya,0,32.0,Ahmedabad,Student,4.0,9.54,4.0,5.0,Other,B.Ed,1,4.0,2.0,0,False
56866,Shaurya,1,56.0,Meerut,Pharmacist,2.0,-1.0,3.0,4.0,Other,B.Pharm,1,11.0,5.0,1,False
