In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

In [8]:
df = pd.read_csv("student_depression_dataset.csv")

In [9]:
df.info(), df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27901 entries, 0 to 27900
Data columns (total 18 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   id                                     27901 non-null  int64  
 1   Gender                                 27901 non-null  object 
 2   Age                                    27901 non-null  float64
 3   City                                   27901 non-null  object 
 4   Profession                             27901 non-null  object 
 5   Academic Pressure                      27901 non-null  float64
 6   Work Pressure                          27901 non-null  float64
 7   CGPA                                   27901 non-null  float64
 8   Study Satisfaction                     27901 non-null  float64
 9   Job Satisfaction                       27901 non-null  float64
 10  Sleep Duration                         27901 non-null  object 
 11  Di

(None,
    id  Gender   Age           City Profession  Academic Pressure  \
 0   2    Male  33.0  Visakhapatnam    Student                5.0   
 1   8  Female  24.0      Bangalore    Student                2.0   
 2  26    Male  31.0       Srinagar    Student                3.0   
 3  30  Female  28.0       Varanasi    Student                3.0   
 4  32  Female  25.0         Jaipur    Student                4.0   
 
    Work Pressure  CGPA  Study Satisfaction  Job Satisfaction  \
 0            0.0  8.97                 2.0               0.0   
 1            0.0  5.90                 5.0               0.0   
 2            0.0  7.03                 5.0               0.0   
 3            0.0  5.59                 2.0               0.0   
 4            0.0  8.13                 3.0               0.0   
 
         Sleep Duration Dietary Habits   Degree  \
 0          '5-6 hours'        Healthy  B.Pharm   
 1          '5-6 hours'       Moderate      BSc   
 2  'Less than 5 hours'        H

In [10]:
# Предобработка
df.drop(columns=['id'], inplace=True)  # Удаление лишнего
df['Sleep Duration'] = df['Sleep Duration'].str.replace("'", "").str.strip()
df['Financial Stress'] = df['Financial Stress'].replace('?', np.nan)
df['Financial Stress'] = df['Financial Stress'].astype(float)
df['Financial Stress'] = df['Financial Stress'].fillna(df['Financial Stress'].median())

In [None]:
valid_cities = df['City'].value_counts().loc[lambda x: x > 50].index
df = df[df['City'].isin(valid_cities)]
# Удаление мусорных значений из "City"

In [None]:
categorical_cols = df.select_dtypes(include='object').columns
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
# One-hot кодирование категориальных признаков

In [13]:
X = df_encoded.drop("Depression", axis=1)
y = df_encoded["Depression"]

In [22]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(X_train.shape, X_test.shape)

(22300, 87) (5575, 87)


In [15]:
knn_initial = KNeighborsClassifier(n_neighbors=5)
knn_initial.fit(X_train, y_train)
y_pred_initial = knn_initial.predict(X_test)

In [16]:
# Оценка базовой модели
print("Базовая модель (K=5)")
print("Accuracy:", accuracy_score(y_test, y_pred_initial))
print("Classification report:\n", classification_report(y_test, y_pred_initial))

Базовая модель (K=5)
Accuracy: 0.8052017937219731
Classification report:
               precision    recall  f1-score   support

           0       0.79      0.73      0.76      2311
           1       0.82      0.86      0.84      3264

    accuracy                           0.81      5575
   macro avg       0.80      0.79      0.80      5575
weighted avg       0.80      0.81      0.80      5575



In [17]:
param_grid = {'n_neighbors': list(range(1, 21))}
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [18]:
grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=cv, scoring='accuracy')
grid_search.fit(X_train, y_train)

random_search = RandomizedSearchCV(KNeighborsClassifier(), param_grid, n_iter=10, cv=cv, scoring='accuracy', random_state=42)
random_search.fit(X_train, y_train)

In [19]:
print("GridSearchCV: Лучшее K =", grid_search.best_params_['n_neighbors'], 
      "Средняя точность:", grid_search.best_score_)
print("RandomizedSearchCV: Лучшее K =", random_search.best_params_['n_neighbors'], 
      "Средняя точность:", random_search.best_score_)

GridSearchCV: Лучшее K = 20 Средняя точность: 0.8140807174887893
RandomizedSearchCV: Лучшее K = 18 Средняя точность: 0.8125112107623318


In [20]:
best_k = grid_search.best_params_['n_neighbors']
knn_optimized = KNeighborsClassifier(n_neighbors=best_k)
knn_optimized.fit(X_train, y_train)
y_pred_optimized = knn_optimized.predict(X_test)

In [21]:
print(f"\nОптимизированная модель (K={best_k})")
print("Accuracy:", accuracy_score(y_test, y_pred_optimized))
print("Classification report:\n", classification_report(y_test, y_pred_optimized))


Оптимизированная модель (K=20)
Accuracy: 0.8224215246636771
Classification report:
               precision    recall  f1-score   support

           0       0.80      0.76      0.78      2311
           1       0.84      0.87      0.85      3264

    accuracy                           0.82      5575
   macro avg       0.82      0.81      0.82      5575
weighted avg       0.82      0.82      0.82      5575

