In [2]:
import pandas as pd
import seaborn as sb
import numpy as np
import matplotlib.pyplot as mlt

In [3]:
df = pd.read_csv('StudentDepressionDataset.csv')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27901 entries, 0 to 27900
Data columns (total 18 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   id                                     27901 non-null  int64  
 1   Gender                                 27901 non-null  object 
 2   Age                                    27901 non-null  float64
 3   City                                   27901 non-null  object 
 4   Profession                             27901 non-null  object 
 5   Academic Pressure                      27901 non-null  float64
 6   Work Pressure                          27901 non-null  float64
 7   CGPA                                   27901 non-null  float64
 8   Study Satisfaction                     27901 non-null  float64
 9   Job Satisfaction                       27901 non-null  float64
 10  Sleep Duration                         27901 non-null  object 
 11  Di

## Missing data replacement


In [6]:
for col in df.columns:
    missing_data = df[col].isna().sum()
    missing_percent = missing_data/len(df)*100
    print(f"Column {col}: has {missing_percent}%")

Column id: has 0.0%
Column Gender: has 0.0%
Column Age: has 0.0%
Column City: has 0.0%
Column Profession: has 0.0%
Column Academic Pressure: has 0.0%
Column Work Pressure: has 0.0%
Column CGPA: has 0.0%
Column Study Satisfaction: has 0.0%
Column Job Satisfaction: has 0.0%
Column Sleep Duration: has 0.0%
Column Dietary Habits: has 0.0%
Column Degree: has 0.0%
Column Have you ever had suicidal thoughts ?: has 0.0%
Column Work/Study Hours: has 0.0%
Column Financial Stress: has 0.01075230278484642%
Column Family History of Mental Illness: has 0.0%
Column Depression: has 0.0%


In [7]:
df.head()

Unnamed: 0,id,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,2,Male,33.0,Visakhapatnam,Student,5.0,0.0,8.97,2.0,0.0,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1
1,8,Female,24.0,Bangalore,Student,2.0,0.0,5.9,5.0,0.0,5-6 hours,Moderate,BSc,No,3.0,2.0,Yes,0
2,26,Male,31.0,Srinagar,Student,3.0,0.0,7.03,5.0,0.0,Less than 5 hours,Healthy,BA,No,9.0,1.0,Yes,0
3,30,Female,28.0,Varanasi,Student,3.0,0.0,5.59,2.0,0.0,7-8 hours,Moderate,BCA,Yes,4.0,5.0,Yes,1
4,32,Female,25.0,Jaipur,Student,4.0,0.0,8.13,3.0,0.0,5-6 hours,Moderate,M.Tech,Yes,1.0,1.0,No,0


In [8]:
df = df.rename(columns={
    'Have you ever had suicidal thoughts ?' : 'suicidal_thoughts',
    'Family History of Mental Illness': 'family_mental_illness'
})
df.columns = [col.lower().replace(' ', '_').replace('/', '_') for col in df.columns]
df.columns

Index(['id', 'gender', 'age', 'city', 'profession', 'academic_pressure',
       'work_pressure', 'cgpa', 'study_satisfaction', 'job_satisfaction',
       'sleep_duration', 'dietary_habits', 'degree', 'suicidal_thoughts',
       'work_study_hours', 'financial_stress', 'family_mental_illness',
       'depression'],
      dtype='object')

## NaN value make up 0.011% so we can drop it

In [16]:
df.dropna(inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 27898 entries, 0 to 27900
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     27898 non-null  int64  
 1   gender                 27898 non-null  object 
 2   age                    27898 non-null  float64
 3   city                   27898 non-null  object 
 4   profession             27898 non-null  object 
 5   academic_pressure      27898 non-null  float64
 6   work_pressure          27898 non-null  float64
 7   cgpa                   27898 non-null  float64
 8   study_satisfaction     27898 non-null  float64
 9   job_satisfaction       27898 non-null  float64
 10  sleep_duration         27898 non-null  object 
 11  dietary_habits         27898 non-null  object 
 12  degree                 27898 non-null  object 
 13  suicidal_thoughts      27898 non-null  object 
 14  work_study_hours       27898 non-null  float64
 15  financi

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Encode categorical values

In [21]:
df.drop(columns=["id", "profession"], inplace=True)

In [23]:
label_encoders = {}
for col in df.select_dtypes(include = "object").columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [25]:
X = df.drop(columns="depression")
y = df["depression"]

In [27]:
print(X)

       gender   age  city  academic_pressure  work_pressure  cgpa  \
0           1  33.0    51                5.0            0.0  8.97   
1           0  24.0     3                2.0            0.0  5.90   
2           1  31.0    44                3.0            0.0  7.03   
3           0  28.0    49                3.0            0.0  5.59   
4           0  25.0    16                4.0            0.0  8.13   
...       ...   ...   ...                ...            ...   ...   
27896       0  27.0    45                5.0            0.0  5.75   
27897       1  27.0    25                2.0            0.0  9.40   
27898       1  31.0     9                3.0            0.0  6.61   
27899       0  18.0    25                5.0            0.0  6.88   
27900       1  27.0    38                4.0            0.0  9.24   

       study_satisfaction  job_satisfaction  sleep_duration  dietary_habits  \
0                     2.0               0.0               0               0   
1            

In [29]:
print(y)

0        1
1        0
2        0
3        1
4        0
        ..
27896    0
27897    0
27898    0
27899    1
27900    1
Name: depression, Length: 27898, dtype: int64


In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)

In [33]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [35]:
print(X_train_scaled)


[[ 0.89147812  1.05821607 -0.56879329 ...  0.7668122   1.29532108
  -0.97149067]
 [ 0.89147812 -1.39655973  1.0806509  ... -1.11739351 -1.48793981
   1.02934596]
 [-1.12173253 -0.57830113 -0.93533644 ... -0.30987678  1.29532108
  -0.97149067]
 ...
 [ 0.89147812 -1.60112438 -0.32443119 ... -0.30987678  1.29532108
   1.02934596]
 [ 0.89147812  0.64908677  0.65301722 ...  0.7668122   1.29532108
  -0.97149067]
 [-1.12173253 -0.57830113  1.0806509  ...  0.22846771  1.29532108
   1.02934596]]


### Feature scaling is the process of transforming your input variables into the similar range because these differences can cause problems for some machine learning models rely on distance like KNN and Logistic Regression

# USING LIBRARY

In [39]:
knn = KNeighborsClassifier(n_neighbors=5)
log_reg = LogisticRegression()
dt = DecisionTreeClassifier(random_state=42)

In [41]:
ensemble = VotingClassifier(estimators=[
    ('knn', knn),
    ('lr', log_reg),
    ('dt', dt)
], voting='hard')

In [43]:
ensemble.fit(X_train_scaled, y_train)

In [45]:
y_pred = ensemble.predict(X_test_scaled)

In [46]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report: \n", classification_report(y_test, y_pred))
      

Accuracy: 0.8365591397849462

Classification Report: 
               precision    recall  f1-score   support

           0       0.83      0.77      0.80      2904
           1       0.84      0.89      0.86      4071

    accuracy                           0.84      6975
   macro avg       0.84      0.83      0.83      6975
weighted avg       0.84      0.84      0.84      6975



# Self_write

## Decision Tree

In [49]:
from collections import Counter

In [50]:
class Node:
    def __init__(self, feature=None, threshold=None, left=None, right=None, *, value=None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value

    def is_leaf_node(self):
        return self.value is not None

In [51]:
class DecisionTree:
    def __init__(self, min_samples_split=2, max_depth=4, number_features=None):
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.number_features = number_features
        self.root = None

    def fit(self, X, y):
        self.number_features = X.shape[1] if not self.number_features else min(X.shape[1], self.number_features)
        self.root = self._grow_tree(X, y)

    def _grow_tree(self, X, y, depth=0):
        num_samples, num_features = X.shape
        num_labels = len(np.unique(y))

        if depth >= self.max_depth or num_labels == 1 or num_samples < self.min_samples_split:
            leaf_value = self._most_common_label(y)
            return Node(value=leaf_value)

        feat_idxs = np.random.choice(num_features, self.number_features, replace=False)
        best_feature, best_thresh = self._best_split(X, y, feat_idxs)

        left_idxs, right_idxs = self._split(X[:, best_feature], best_thresh)
        left = self._grow_tree(X[left_idxs, :], y[left_idxs], depth + 1)
        right = self._grow_tree(X[right_idxs, :], y[right_idxs], depth + 1)
        return Node(best_feature, best_thresh, left, right)

    def _best_split(self, X, y, feat_idxs):
        best_gain = -1
        split_idx, split_thresh = None, None

        for feat_idx in feat_idxs:
            X_column = X[:, feat_idx]
            thresholds = np.unique(X_column)
            for threshold in thresholds:
                gain = self._information_gain(y, X_column, threshold)
                if gain > best_gain:
                    best_gain = gain
                    split_idx = feat_idx
                    split_thresh = threshold
        return split_idx, split_thresh

    def _information_gain(self, y, X_column, threshold):
        parent_entropy = self._entropy(y)
        left_idxs, right_idxs = self._split(X_column, threshold)

        if len(left_idxs) == 0 or len(right_idxs) == 0:
            return 0

        n = len(y)
        n_l, n_r = len(left_idxs), len(right_idxs)
        e_l, e_r = self._entropy(y[left_idxs]), self._entropy(y[right_idxs])
        child_entropy = (n_l / n) * e_l + (n_r / n) * e_r

        return parent_entropy - child_entropy

    def _split(self, X_column, split_thresh):
        left_idxs = np.argwhere(X_column <= split_thresh).flatten()
        right_idxs = np.argwhere(X_column > split_thresh).flatten()
        return left_idxs, right_idxs

    def _entropy(self, y):
        hist = np.bincount(y)
        ps = hist / len(y)
        return -np.sum([p * np.log(p) for p in ps if p > 0])

    def _most_common_label(self, y):
        counter = Counter(y)
        return counter.most_common(1)[0][0]

    def predict(self, X):
        return np.array([self._traverse_tree(x, self.root) for x in X])

    def _traverse_tree(self, x, node):
        if node.is_leaf_node():
            return node.value
        if x[node.feature] <= node.threshold:
            return self._traverse_tree(x, node.left)
        return self._traverse_tree(x, node.right)
        

# Logistic Regression

In [53]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

class LogisticRegression:
    def __init__(self, lr=0.001, n_iters=1000):
        self.lr = lr
        self.n_iters = n_iters
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0

        for _ in range(self.n_iters):
            linear_model = np.dot(X, self.weights) + self.bias
            predictions = sigmoid(linear_model)

            dw = (1 / n_samples) * np.dot(X.T, (predictions - y))
            db = (1 / n_samples) * np.sum(predictions - y)

            self.weights -= self.lr * dw
            self.bias -= self.lr * db

    def predict(self, X):
        linear_model = np.dot(X, self.weights) + self.bias
        y_pred = sigmoid(linear_model)
        return [1 if i > 0.5 else 0 for i in y_pred]

# KNN

In [61]:
class KNN:
    def __init__(self, k=3):
        self.k = k

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        return [self._predict(x) for x in X]

    def _predict(self, x):
        distances = np.linalg.norm(self.X_train - x, axis=1)
        k_indices = np.argsort(distances)[:self.k]
        k_nearest = [self.y_train[i] for i in k_indices]
        most_common = Counter(k_nearest).most_common(1)
        return most_common[0][0]

# Emsemble Classifiers

In [64]:
class EnsembleClassifier:
    def __init__(self, models):
        self.models = models

    def fit(self, X, y):
        for model in self.models:
            model.fit(X, y)

    def predict(self, X):
        predictions = np.array([model.predict(X) for model in self.models])
        majority_votes = np.apply_along_axis(lambda x: Counter(x).most_common(1)[0][0], axis=0, arr=predictions)
        return majority_votes


## Initialize the model



In [67]:
dt = DecisionTree()
lr = LogisticRegression()
knn = KNN(k=5)

X_train = np.array(X_train_scaled)
y_train = np.array(y_train)
X_test = np.array(X_test_scaled)
y_test = np.array(y_test)

ensemble_self_build = EnsembleClassifier([dt,lr,knn])
ensemble_self_build.fit(X_train, y_train)
y_pred_1 = ensemble_self_build.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred_1))
print("\nClassification Report: \n", classification_report(y_test, y_pred_1))

Accuracy: 0.8410035842293907

Classification Report: 
               precision    recall  f1-score   support

           0       0.84      0.77      0.80      2904
           1       0.84      0.89      0.87      4071

    accuracy                           0.84      6975
   macro avg       0.84      0.83      0.83      6975
weighted avg       0.84      0.84      0.84      6975

