In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

### Keep males aged 18 to 24 (18 ≤ Age < 25) as our core group

In [3]:
# Load the raw survey and keep only the core group: males with 18 <= Age < 25.
# The filter and the column drop are done in one non-mutating chain, because
# calling drop(..., inplace=True) on a filtered slice can raise
# SettingWithCopyWarning on pandas < 3.
raw_df = pd.read_csv('student_depression_dataset.csv')
is_core_group = (
    (raw_df['Age'] >= 18)
    & (raw_df['Age'] < 25)
    & (raw_df['Gender'] == 'Male')
)
# 'id' is removed here, so it is not available to any later cell.
df = raw_df.loc[is_core_group].drop(columns=['id'])
df

Unnamed: 0,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
10,Male,24.0,Nagpur,Student,3.0,0.0,6.10,3.0,0.0,'5-6 hours',Moderate,'Class 12',Yes,11.0,1.0,Yes,1
18,Male,19.0,Chennai,Student,2.0,0.0,7.83,2.0,0.0,'7-8 hours',Unhealthy,'Class 12',No,6.0,3.0,No,0
23,Male,23.0,Thane,Student,1.0,0.0,8.59,4.0,0.0,'7-8 hours',Healthy,BHM,No,11.0,3.0,No,0
24,Male,18.0,Bangalore,Student,4.0,0.0,7.10,3.0,0.0,'More than 8 hours',Unhealthy,'Class 12',Yes,11.0,5.0,Yes,1
30,Male,18.0,Surat,Student,4.0,0.0,6.70,5.0,0.0,'Less than 5 hours',Moderate,'Class 12',Yes,5.0,4.0,Yes,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27879,Male,21.0,Kalyan,Student,1.0,0.0,9.21,3.0,0.0,'More than 8 hours',Unhealthy,M.Com,No,10.0,3.0,No,0
27880,Male,24.0,Nagpur,Student,3.0,0.0,8.54,3.0,0.0,'5-6 hours',Moderate,'Class 12',Yes,11.0,2.0,No,0
27884,Male,18.0,Meerut,Student,5.0,0.0,6.25,3.0,0.0,'7-8 hours',Unhealthy,'Class 12',No,11.0,2.0,Yes,1
27888,Male,19.0,Kolkata,Student,4.0,0.0,7.13,1.0,0.0,'More than 8 hours',Moderate,'Class 12',No,10.0,2.0,No,1


In [4]:
# Per-column count of missing (NaN) entries in the filtered frame.
df.isnull().sum()

Gender                                   0
Age                                      0
City                                     0
Profession                               0
Academic Pressure                        0
Work Pressure                            0
CGPA                                     0
Study Satisfaction                       0
Job Satisfaction                         0
Sleep Duration                           0
Dietary Habits                           0
Degree                                   0
Have you ever had suicidal thoughts ?    0
Work/Study Hours                         0
Financial Stress                         0
Family History of Mental Illness         0
Depression                               0
dtype: int64

In [5]:
# Summary statistics; with default arguments only numeric columns are described.
# NOTE(review): 'Financial Stress' is a column of df but does not show up in this
# summary, which suggests it was parsed as a non-numeric (object) column.
# Confirm with df.dtypes before using it as a numeric feature.
df.describe()

Unnamed: 0,Age,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Work/Study Hours,Depression
count,6741.0,6741.0,6741.0,6741.0,6741.0,6741.0,6741.0,6741.0
mean,21.105771,3.167779,0.001038,7.689846,2.875093,0.00089,7.328141,0.685358
std,2.050316,1.391328,0.065587,1.419761,1.382963,0.05167,3.601684,0.464407
min,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,2.0,0.0,6.38,2.0,0.0,5.0,0.0
50%,21.0,3.0,0.0,7.83,3.0,0.0,8.0,1.0
75%,23.0,4.0,0.0,8.85,4.0,0.0,10.0,1.0
max,24.0,5.0,5.0,10.0,5.0,4.0,12.0,1.0


In [6]:
# Distinct values of the 'Degree' column (bracket access also works for
# column names that are not valid Python identifiers).
df['Degree'].unique()

array(["'Class 12'", 'BHM', 'MCA', 'MA', 'MSc', 'B.Ed', 'B.Com', 'M.Tech',
       'BSc', 'BA', 'MD', 'B.Pharm', 'B.Tech', 'M.Com', 'M.Pharm',
       'B.Arch', 'BCA', 'BE', 'MBA', 'LLM', 'M.Ed', 'BBA', 'MBBS', 'LLB',
       'MHM', 'Others', 'PhD', 'ME'], dtype=object)

In [7]:
# Distinct values of the 'Profession' column.
df['Profession'].unique()

array(['Student', "'Civil Engineer'", 'Teacher', 'Manager', 'Lawyer',
       'Architect'], dtype=object)

In [8]:
# Show the column labels of the filtered frame.
df.columns

Index(['Gender', 'Age', 'City', 'Profession', 'Academic Pressure',
       'Work Pressure', 'CGPA', 'Study Satisfaction', 'Job Satisfaction',
       'Sleep Duration', 'Dietary Habits', 'Degree',
       'Have you ever had suicidal thoughts ?', 'Work/Study Hours',
       'Financial Stress', 'Family History of Mental Illness', 'Depression'],
      dtype='object')

In [9]:
# Reduced feature set for a first experiment; the other columns are left out
# on purpose for now.
feature_cols = [
    'Age',
    'Profession',
    'Work Pressure',
    'Study Satisfaction',
    'Job Satisfaction',
    'Sleep Duration',
    'Financial Stress',
]
X = df[feature_cols].copy()

# 'Financial Stress' is absent from the df.describe() summary, so it is
# presumably stored as strings rather than numbers. Convert it explicitly
# instead of relying on the estimator's implicit coercion; values that cannot
# be parsed become NaN and those rows are dropped.
X['Financial Stress'] = pd.to_numeric(X['Financial Stress'], errors='coerce')
X = X.dropna(subset=['Financial Stress'])

# Take the target from the same rows so X and y stay aligned.
y = df.loc[X.index, 'Depression']

In [10]:
# Per-column count of missing (NaN) entries in the selected features.
X.isnull().sum()

Age                   0
Profession            0
Work Pressure         0
Study Satisfaction    0
Job Satisfaction      0
Sleep Duration        0
Financial Stress      0
dtype: int64

In [11]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the two categorical features. drop='first' omits the first
# category of each feature, so that category is represented by all-zero dummies.
categorical_cols = ['Profession', 'Sleep Duration']
ohe = OneHotEncoder(sparse_output=False, drop='first')
X_encoded = pd.DataFrame(
    ohe.fit_transform(X[categorical_cols]),
    columns=ohe.get_feature_names_out(categorical_cols),
    index=X.index,  # keep the original row labels so the concat below aligns
)

# Replace the raw categorical columns with their encoded counterparts.
X = pd.concat([X.drop(columns=categorical_cols), X_encoded], axis=1)
X

Unnamed: 0,Age,Work Pressure,Study Satisfaction,Job Satisfaction,Financial Stress,Profession_Architect,Profession_Lawyer,Profession_Manager,Profession_Student,Profession_Teacher,Sleep Duration_'7-8 hours',Sleep Duration_'Less than 5 hours',Sleep Duration_'More than 8 hours',Sleep Duration_Others
10,24.0,0.0,3.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
18,19.0,0.0,2.0,0.0,3.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
23,23.0,0.0,4.0,0.0,3.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
24,18.0,0.0,3.0,0.0,5.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
30,18.0,0.0,5.0,0.0,4.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27879,21.0,0.0,3.0,0.0,3.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
27880,24.0,0.0,3.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
27884,18.0,0.0,3.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
27888,19.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [12]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
from sklearn.metrics import mean_squared_error, r2_score

# Baseline: a linear regression fitted on the training split, then used to
# predict the held-out rows.
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)

# Score the held-out predictions against the true labels.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')

Mean Squared Error: 0.18756483393363121
R^2 Score: 0.14956056100225734


## Conclusion: linear regression is a poor model for this task (R² ≈ 0.15 on the test set)

In [20]:
# Export the encoded features together with the target. assign() builds a new
# frame, so X itself is left without the 'Depression' column; writing the
# target into X in place would leak the label into the features if the
# split/fit cells were re-run afterwards.
# NOTE(review): the file name says 18-25, while the age filter keeps Age < 25.
X.assign(Depression=y).to_csv('male_18-25_students_depression.csv', index=False)