<a href="https://colab.research.google.com/github/ReVuz/Machine_Learning/blob/main/Depression_detection_with_neural_nets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested from the HTTP response per read while downloading.
CHUNK_SIZE = 40960
# Comma-separated '<directory>:<URL-encoded download URL>' entries; the
# download loop below splits on ',' and ':' and URL-decodes the second part.
# NOTE(review): this is a time-limited signed URL (X-Goog-Date=20240420T143854Z,
# X-Goog-Expires=259200 seconds), so it has presumably expired and needs to be
# regenerated before the download can succeed.
DATA_SOURCE_MAPPING = 'depression:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F459013%2F864483%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240420%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240420T143854Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3Db988d33af4eb676fe68e119db1f569b5f7b8e2bd301fd20695ddd52bb409e9c65bcc1871189fb85ce020f189d259164adb138dbd6b27c2c68df30e089ee8746cbd8f260e1647c1d98db4b8ea186d1e41a0df79ba2a6faa0a0b1554f219a3840a54c4e8088ed450be4da4807c4d0b4ed8978743651547fdb63c496f49ae1db76b230ca5e137f387ec90c7448173b9c77ad6abbecde3c19d3585a0ee234f8c735db6252b3ad1600736d8f568d85752648c6591210163099aa169864689e397032b9cce9398daf1c3dfe1b43b0ca3ff650e31ccc82bbbe8e7eb4a13d48f124e97d28b2afd36c31a291d47c99b25e33b57bfe443840471cea1a7c9d65ff38d4a66cb'

# Directories this cell creates so that '/kaggle/...' paths resolve.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced anywhere in the visible part of this notebook.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every archive listed in DATA_SOURCE_MAPPING into a temporary file
# and unpack it under KAGGLE_INPUT_PATH/<directory>.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Each entry is '<directory>:<URL-encoded URL>'. Splitting on the first
    # ':' only means a stray colon in the URL part cannot break unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            # The header can be missing; treat that as "size unknown" rather
            # than crashing on int(None) inside the progress calculation.
            total_bytes = int(total_length) if total_length else 0
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            # read() returns b'' at end of stream, which stops the iteration.
            for data in iter(lambda: fileres.read(CHUNK_SIZE), b''):
                dl += len(data)
                tfile.write(data)
                # 50-character progress bar; stays empty when the size is unknown.
                done = min(50, int(50 * dl / total_bytes)) if total_bytes > 0 else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
            # tarfile reopens the file by name, so buffered bytes must be on
            # disk before extraction starts.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Use a distinct name: binding the archive to `tarfile` would
                # replace the module and break every later tar download.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is caught first because it is a subclass of OSError.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere beneath /kaggle/input.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, plot_confusion_matrix, precision_score, recall_score, f1_score
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
import matplotlib.pyplot as plt

# Seed NumPy's global random number generator.
# NOTE(review): presumably this is what the later calls that pass no
# random_state rely on for repeatability — confirm, and note it only holds
# when the cells are run in order.
np.random.seed(1)

In [None]:
def print_results(y_true, y_pred):
    """Print the confusion matrix and the F1-score of a set of predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, in the same order as ``y_true``.
    """
    matrix = confusion_matrix(y_true, y_pred)
    print(matrix)
    score = f1_score(y_true, y_pred)
    print('F1-score:', score)

In [None]:
def plot_validation_curve(model_grid, param_name, params=None):
    """Plot the mean cross-validated score against one searched hyperparameter.

    Args:
        model_grid: Fitted search object exposing ``cv_results_``.
        param_name: Name of the hyperparameter as written in the parameter
            grid; its values are read from the ``'param_' + param_name``
            column of ``cv_results_``.
        params: Optional list or array of x-axis values, one per searched
            candidate and in the same order as ``cv_results_``. When omitted,
            the values stored in ``cv_results_`` are plotted instead.
    """
    results_df = pd.DataFrame(model_grid.cv_results_)

    # Identity test on purpose: ``params == None`` is evaluated element-wise
    # for NumPy arrays, and using that result in ``if`` raises ValueError.
    if params is None:
        x_values = results_df['param_' + param_name]
    else:
        x_values = params

    plt.plot(x_values, results_df['mean_test_score'])
    plt.xlabel(param_name)
    plt.ylabel('Test F1 score')
    plt.title('Validation curve')
    plt.show()

In [None]:
# Load the survey CSV from the dataset directory under /kaggle/input.
df = pd.read_csv('/kaggle/input/depression/b_depressed.csv')
# Preview the first rows.
df.head()

In [None]:
# Summary statistics of the loaded columns.
df.describe()

In [None]:
#droping missing values
df_1 = df.dropna()
#removing columns
df_2 = df_1.drop(['Survey_id', 'depressed'], axis=1)
#adding dummy values to categorical columns
df_3 = pd.get_dummies(df_2, columns=['Ville_id', 'education_level'])
df_3.head()

In [None]:
#independent variable
X = df_3
#dependent variable
y = df_1['depressed']
#dividing into train-test data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.25, random_state=1)

In [None]:
# Standardising removes each feature's mean and scales it to unit variance,
# so all features end up on a comparable scale.

# Learn the per-feature mean and standard deviation from the training rows only.
scaler = StandardScaler().fit(X_train)

# Apply those training-set statistics to both splits.
X_train, X_valid = scaler.transform(X_train), scaler.transform(X_valid)

In [None]:
# A multi-layer perceptron (MLP) is a neural network with an input layer,
# hidden layers and an output layer; it can learn complex patterns, which
# makes it suitable for classification and regression tasks.

# Train with the limited-memory BFGS ('lbfgs') solver.
mlp = MLPClassifier(solver='lbfgs').fit(X_train, y_train)

y_pred = mlp.predict(X_valid)
# 'log_reg: ~0.2' is presumably the F1 of a logistic-regression baseline
# that is not part of this notebook — TODO confirm.
print_results(y_valid, y_pred) # log_reg: ~0.2

In [None]:
# Variant of the previous experiment: standardise the whole feature matrix
# first, then split the scaled data into training and validation sets.
# NOTE(review): fitting the scaler on all of X lets the validation rows
# influence the scaling statistics (data leakage), and it refits the shared
# `scaler` object used by the earlier cell — confirm this is intended.
X_sc = scaler.fit_transform(X)
X_train1, X_valid1, y_train1, y_valid1 = train_test_split(X_sc, y, test_size=0.25, random_state=1)

In [None]:
#Training an MLP on the scaled-then-split data; this rebinds `mlp`

mlp = MLPClassifier(solver='lbfgs')
mlp.fit(X_train1, y_train1)

y_pred1 = mlp.predict(X_valid1)
print_results(y_valid1, y_pred1) # log_reg: ~0.2

In [None]:
# One hidden layer with 200 neurons.
# alpha=0.1 is the L2 penalty (regularization term) parameter.

mlp_2 = MLPClassifier(hidden_layer_sizes=(200,), solver='lbfgs', max_iter=400, alpha=0.1)
mlp_2.fit(X_train, y_train)

# Evaluate the model trained in this cell. The original predicted with the
# earlier `mlp`, so this cell reported another model's score.
y_pred = mlp_2.predict(X_valid)
# print_results expects (y_true, y_pred); swapping them transposes the
# confusion matrix.
print_results(y_valid, y_pred) # log_reg: ~0.2

In [None]:
# Iteration count reported by the fitted mlp_2 (it was created with max_iter=400).
mlp_2.n_iter_

In [None]:
# Class imbalance occurs when the class distribution of a dataset is uneven,
# with one class significantly outnumbering the others.
# RandomOverSampler randomly replicates minority-class samples to balance the
# class distribution of the data it is given.
ros = RandomOverSampler()

# fit_resample is the current imbalanced-learn method; the older fit_sample
# alias has been removed and raises AttributeError on recent versions.
# Only the training split is resampled, so the validation data keeps its
# original class distribution.
X_ros, y_ros = ros.fit_resample(X_train, y_train)

In [None]:
# Train the 200-neuron MLP on the oversampled training data.

mlp_3 = MLPClassifier(hidden_layer_sizes=(200,), solver='lbfgs', max_iter=400, alpha=0.1)
mlp_3.fit(X_ros, y_ros)

y_pred = mlp_3.predict(X_valid)
# print_results expects (y_true, y_pred); swapping them transposes the
# confusion matrix.
print_results(y_valid, y_pred) # log_reg: ~0.2

In [None]:
# Three hidden layers (100, 50 and 20 neurons) with a small L2 penalty,
# trained on the oversampled training data.
mlp_4 = MLPClassifier(hidden_layer_sizes=(100, 50, 20), solver='lbfgs', alpha=0.001)
mlp_4.fit(X_ros, y_ros)

y_pred = mlp_4.predict(X_valid)
# The shared helper prints the same confusion matrix and 'F1-score:' line.
print_results(y_valid, y_pred) # log_reg: ~0.2

In [None]:
# A Pipeline chains the preprocessing and the estimator: here the features
# are standardised and then passed to the MLP, which automates the repeated
# scale-then-fit steps.
scaler = StandardScaler()
mlp = MLPClassifier(solver='lbfgs')
model = Pipeline(steps=[('scaler', scaler), ('mlp', mlp)])

# New 75/25 split of the unscaled features (the pipeline does the scaling).
# This rebinds X_train and y_train from the earlier cells.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)
# The shared helper prints the same confusion matrix and 'F1-score:' line.
print_results(y_test, y_pred)

In [None]:
#Hyperparameter Tuning
#generates 10 values for alpha logarithmically spaced between 10^-4 and 10^4.
param_grid = {'mlp__alpha': np.logspace(-4, 4, 10)}
model_grid = GridSearchCV(model, param_grid, cv=5, scoring='f1',
                          n_jobs=-1) #n_jobs = -1 : Utilizes all available processors for parallelizing the grid search computation.
model_grid.fit(X_train, y_train)

In [None]:
plot_validation_curve(model_grid, 'mlp__alpha')

In [None]:
print('Best (hyper)parameters:', model_grid.best_params_)
print('Best score:', model_grid.best_score_)

In [None]:
y_pred = model_grid.best_estimator_.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print('F1-score:', f1_score(y_test, y_pred))

In [None]:
X_ros, y_ros = ros.fit_sample(X_train, y_train)
model_grid.fit(X_ros, y_ros)

In [None]:
print('Best (hyper)parameters:', model_grid.best_params_)
print('Best score:', model_grid.best_score_)

In [None]:
y_pred = model_grid.best_estimator_.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print('F1-score:', f1_score(y_test, y_pred))

In [None]:
# Hyperparameter tuning of the MLP step's activation function.

param_grid = {'mlp__activation': ['logistic', 'tanh', 'relu']}
# GridSearchCV is a scikit-learn tool that performs an exhaustive search over
# a specified parameter grid to find the best combination of hyperparameters
# for a given estimator (model). Here every candidate is scored with F1 using
# 5-fold cross-validation.

model_grid = GridSearchCV(model, param_grid, cv=5, scoring='f1',
                          n_jobs=-1)
model_grid.fit(X_train, y_train)

In [None]:
# Mean cross-validated score for each activation function.
plot_validation_curve(model_grid, 'mlp__activation')

In [None]:
print('Best (hyper)parameters:', model_grid.best_params_)
print('Best score:', model_grid.best_score_)

In [None]:
# Score the best pipeline found by the search on the held-out test split.
y_pred = model_grid.best_estimator_.predict(X_test)
print_results(y_test, y_pred)

In [None]:
#grid of hyperparameters to search over
param_grid = {'mlp__hidden_layer_sizes': [(i, ) for i in range(20, 500, 20)]}
model_grid = GridSearchCV(model, param_grid, cv=5, scoring='f1',
                          n_jobs=-1) # n_jobs=-1 all available processors
model_grid.fit(X_train, y_train)

In [None]:
plot_validation_curve(model_grid, 'mlp__hidden_layer_sizes',
                      [i for i in range(20, 500, 20)])

In [None]:
print('Best (hyper)parameters:', model_grid.best_params_)
print('Best score:', model_grid.best_score_)

In [None]:
y_pred = model_grid.best_estimator_.predict(X_test)
print_results(y_test, y_pred)

In [None]:
# Grid search over the MLP step's warm_start flag.
# NOTE(review): the search presumably fits a fresh copy of the pipeline for
# each candidate, in which case warm_start would have nothing to reuse and
# any score difference would be noise — confirm this search is meaningful.
param_grid = {'mlp__warm_start': [True, False]}
model_grid = GridSearchCV(model, param_grid, cv=5, scoring='f1',
                          n_jobs=-1)
model_grid.fit(X_train, y_train)

In [None]:
print('Best (hyper)parameters:', model_grid.best_params_)
print('Best score:', model_grid.best_score_)

In [None]:
# Score the best pipeline found by the search on the held-out test split.
y_pred = model_grid.best_estimator_.predict(X_test)
print_results(y_test, y_pred)

In [None]:
# Hyperparameter tuning over several hidden-layer architectures.

# Each tuple lists the number of neurons per hidden layer (one to four layers).
hidden = [(100,), (100, 50), (100, 50, 20), (50, 50), (50, 50, 50), (50, 30, 30, 20)]
# Grid of hyperparameters to search over.
param_grid = {'mlp__hidden_layer_sizes': hidden}
model_grid = GridSearchCV(model, param_grid, cv=5, scoring='f1',
                          n_jobs=-1)
model_grid.fit(X_train, y_train)

In [None]:
# The tuples are passed as strings to serve as the x-axis values.
plot_validation_curve(model_grid, 'mlp__hidden_layer_sizes',
                      [str(x) for x in hidden])

In [None]:
print('Best (hyper)parameters:', model_grid.best_params_)
print('Best score:', model_grid.best_score_)

In [None]:
# Score the best pipeline found by the search on the held-out test split.
y_pred = model_grid.best_estimator_.predict(X_test)
print_results(y_test, y_pred)

In [None]:
# Class counts of the raw label, followed by the total number of rows.
depressed_column = df['depressed']
print(depressed_column.value_counts())
print(len(depressed_column))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Explore the relationship between depression and age: histogram of the ages
# of the respondents labelled as depressed. (Overlaying the non-depressed
# group was disabled in the original cell and is still left out here.)
plt.figure(figsize=(10, 6))
# histplot replaces the deprecated distplot; with kde=False it draws a plain
# count histogram.
sns.histplot(df.loc[df['depressed'] == 1, 'Age'], kde=False, label='Depressed')
plt.title('Age Distribution by Depression Status')
plt.xlabel('Age')
# The bars are counts, not densities, so the axis is labelled accordingly.
plt.ylabel('Count')
plt.legend()
plt.show()

In [None]:
# Analyze the relationship between 'depressed' and a categorical feature:
# number of rows per value of 'sex', split by the 'depressed' label.
plt.figure(figsize=(6, 4))
sns.countplot(data=df, x='sex', hue='depressed')
plt.show()

In [None]:
# Explore pairwise relationships between several features, coloured by the
# 'depressed' label.
# pairplot creates its own figure, so no plt.figure() call precedes it; one
# would only leave an empty extra figure in the output.
sns.pairplot(df, vars=['Age', 'education_level', 'total_members', 'gained_asset'], hue='depressed')
plt.show()

In [None]:
# Check for multicollinearity: annotated heatmap of the pairwise correlations
# between the columns of df.
plt.figure(figsize=(20,20))
sns.heatmap(df.corr(), annot=True, cmap='coolwarm')
plt.show()

In [None]:
# import matplotlib.pyplot as plt
# import seaborn as sns

# # Explore the relationship between depression and age
# plt.figure(figsize=(10, 6))
# sns.distplot(df['income_diversity'][df['depressed'] == 1], kde=False, hist=True, label='Depressed')
# # sns.distplot(df['Age'][df['depressed'] == 0], kde=True, hist=True, label='Not Depressed')
# plt.title('Income Distribution by Depression Status')
# plt.xlabel('Income')
# plt.ylabel('Density')
# plt.legend()
# plt.show()

In [None]:
# Summary statistics of the raw data (same call as earlier in the notebook).
df.describe()

In [None]:
# Explore the relationship between depression and marital status: count the
# rows for each (Married, depressed) pair and pivot 'depressed' into columns.
marital_counts = df.groupby(['Married', 'depressed'])['Survey_id'].count().unstack('depressed')
print(marital_counts)

In [None]:
# Explore the relationship between depression and education level: number of
# rows per education level, split by the 'depressed' label.
plt.figure(figsize=(10, 6))
sns.countplot(data=df, x='education_level', hue='depressed')
plt.title('Education Level Distribution by Depression Status')
# Rotate the tick labels by 45 degrees.
plt.xticks(rotation=45)
plt.show()

In [None]:
# Explore the relationship between depression and income sources: total of
# each income column within each 'depressed' group.
income_sources = [
    'incoming_salary',
    'incoming_own_farm',
    'incoming_business',
    'incoming_no_business',
    'incoming_agricultural',
]
rows_by_status = df.groupby(['depressed'])
income_counts = rows_by_status[income_sources].sum()
print(income_counts)