In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from prettytable import PrettyTable
import scipy.stats as stats
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc, accuracy_score
import warnings  

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings and, e.g., scikit-learn
# convergence warnings — consider narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')

# Apply seaborn's "whitegrid" theme to all subsequent plots.
sns.set(style="whitegrid")
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

: 

In [None]:
# Load the dataset
import os

# Resolve the project root two levels above the kernel's current working
# directory. NOTE: os.getcwd() is the working directory, not the notebook file's
# location; the two normally coincide when the notebook is opened from its own
# folder, but this is not guaranteed.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))

# Build the dataset path with os.path.join so separators are platform-correct
csv_path = os.path.join(project_root, 'Data', 'raw', 'student_depression_dataset.csv')

# Fail early with the resolved path so a wrong working directory is easy to spot
if not os.path.isfile(csv_path):
    raise FileNotFoundError(
        f"Dataset not found at {csv_path!r} (working directory: {os.getcwd()!r})"
    )

# Load it (pandas is already imported as pd in the imports cell)
df = pd.read_csv(csv_path)


In [ ]:
# Preview the first five rows of the dataset
df.head(5)
    

Unnamed: 0,id,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,2,Male,33.0,Visakhapatnam,Student,5.0,0.0,8.97,2.0,0.0,'5-6 hours',Healthy,B.Pharm,Yes,3.0,1.0,No,1
1,8,Female,24.0,Bangalore,Student,2.0,0.0,5.9,5.0,0.0,'5-6 hours',Moderate,BSc,No,3.0,2.0,Yes,0
2,26,Male,31.0,Srinagar,Student,3.0,0.0,7.03,5.0,0.0,'Less than 5 hours',Healthy,BA,No,9.0,1.0,Yes,0
3,30,Female,28.0,Varanasi,Student,3.0,0.0,5.59,2.0,0.0,'7-8 hours',Moderate,BCA,Yes,4.0,5.0,Yes,1
4,32,Female,25.0,Jaipur,Student,4.0,0.0,8.13,3.0,0.0,'5-6 hours',Moderate,M.Tech,Yes,1.0,1.0,No,0


In [ ]:
# Print the frame's structure: row count plus each column's non-null count and dtype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27901 entries, 0 to 27900
Data columns (total 18 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   id                                     27901 non-null  int64  
 1   Gender                                 27901 non-null  object 
 2   Age                                    27901 non-null  float64
 3   City                                   27901 non-null  object 
 4   Profession                             27901 non-null  object 
 5   Academic Pressure                      27901 non-null  float64
 6   Work Pressure                          27901 non-null  float64
 7   CGPA                                   27901 non-null  float64
 8   Study Satisfaction                     27901 non-null  float64
 9   Job Satisfaction                       27901 non-null  float64
 10  Sleep Duration                         27901 non-null  object 
 11  Di

In [ ]:
# Count missing (NaN/None) entries in every column
df.isna().sum()

id                                       0
Gender                                   0
Age                                      0
City                                     0
Profession                               0
Academic Pressure                        0
Work Pressure                            0
CGPA                                     0
Study Satisfaction                       0
Job Satisfaction                         0
Sleep Duration                           0
Dietary Habits                           0
Degree                                   0
Have you ever had suicidal thoughts ?    0
Work/Study Hours                         0
Financial Stress                         0
Family History of Mental Illness         0
Depression                               0
dtype: int64

In [ ]:
# Summary statistics for every column: include='all' adds the object (categorical)
# columns, which get count/unique/top/freq, alongside the numeric columns.
df.describe(include='all')



Unnamed: 0,id,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
count,27901.0,27901,27901.0,27901,27901,27901.0,27901.0,27901.0,27901.0,27901.0,27901,27901,27901,27901,27901.0,27901.0,27901,27901.0
unique,,2,,52,14,,,,,,5,4,28,2,,6.0,2,
top,,Male,,Kalyan,Student,,,,,,'Less than 5 hours',Unhealthy,'Class 12',Yes,,5.0,No,
freq,,15547,,1570,27870,,,,,,8310,10317,6080,17656,,6715.0,14398,
mean,70442.149421,,25.8223,,,3.141214,0.00043,7.656104,2.943837,0.000681,,,,,7.156984,,,0.585499
std,40641.175216,,4.905687,,,1.381465,0.043992,1.470707,1.361148,0.044394,,,,,3.707642,,,0.492645
min,2.0,,18.0,,,0.0,0.0,0.0,0.0,0.0,,,,,0.0,,,0.0
25%,35039.0,,21.0,,,2.0,0.0,6.29,2.0,0.0,,,,,4.0,,,0.0
50%,70684.0,,25.0,,,3.0,0.0,7.77,3.0,0.0,,,,,8.0,,,1.0
75%,105818.0,,30.0,,,4.0,0.0,8.92,4.0,0.0,,,,,10.0,,,1.0


In [ ]:
# Count the entries that are exactly 0 in each column.
# NOTE(review): for the rating-style columns a 0 may be a placeholder rather
# than a real answer — confirm against the data dictionary.
df.eq(0).sum()

id                                           0
Gender                                       0
Age                                          0
City                                         0
Profession                                   0
Academic Pressure                            9
Work Pressure                            27898
CGPA                                         9
Study Satisfaction                          10
Job Satisfaction                         27893
Sleep Duration                               0
Dietary Habits                               0
Degree                                       0
Have you ever had suicidal thoughts ?        0
Work/Study Hours                          1700
Financial Stress                             0
Family History of Mental Illness             0
Depression                               11565
dtype: int64

In [ ]:


def value_counts_pretty(series, column_name="Value"):
    """Build a PrettyTable of value frequencies for a pandas Series.

    Parameters
    ----------
    series : pandas.Series
        Values to tabulate. Missing values are counted (``dropna=False``)
        and displayed as ``"NaN/Missing"``.
    column_name : str, default "Value"
        Header used for the first (value) column.

    Returns
    -------
    prettytable.PrettyTable
        One row per distinct value, in ``value_counts`` order (most frequent
        first), with columns ``[column_name, "Count", "Percentage"]``.
        An empty series yields a table with headers and no rows.
    """
    value_counts = series.value_counts(dropna=False)
    # Derive each percentage from its own count rather than running a second
    # value_counts(normalize=True) pass and looking the value up by label:
    # this does the counting once and avoids label lookups on NaN keys.
    total = value_counts.sum()

    table = PrettyTable()
    table.field_names = [column_name, "Count", "Percentage"]
    table.align[column_name] = "c"
    table.align["Count"] = "r"
    table.align["Percentage"] = "r"

    for value, count in value_counts.items():
        display_value = "NaN/Missing" if pd.isna(value) else str(value)
        # total > 0 whenever this loop body runs, so the division is safe.
        pct = count / total * 100
        table.add_row([display_value, count, f"{pct:.2f}%"])
    return table

# Work Pressure and Job Satisfaction are almost entirely zero in the per-column
# zero counts above, so print their full value distributions.
colsJob = ['Work Pressure', 'Job Satisfaction']
for col in colsJob:
    # One print call: heading, newline, then the rendered table
    print(f"\n Value counts for '{col}':", value_counts_pretty(df[col], col), sep="\n")


 Value counts for 'Work Pressure':
+---------------+-------+------------+
| Work Pressure | Count | Percentage |
+---------------+-------+------------+
|      0.0      | 27898 |     99.99% |
|      5.0      |     2 |      0.01% |
|      2.0      |     1 |      0.00% |
+---------------+-------+------------+

 Value counts for 'Job Satisfaction':
+------------------+-------+------------+
| Job Satisfaction | Count | Percentage |
+------------------+-------+------------+
|       0.0        | 27893 |     99.97% |
|       2.0        |     3 |      0.01% |
|       4.0        |     2 |      0.01% |
|       1.0        |     2 |      0.01% |
|       3.0        |     1 |      0.00% |
+------------------+-------+------------+
