<a href="https://colab.research.google.com/github/TheGooseGuy/Machine-Learning-on-Suicide-and-Depression-Detection/blob/main/Data_Cleaning%26Resampling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Keep DataFrame previews compact: cap the number of columns and rows shown,
# and truncate long cell contents.
pd.set_option("display.max_columns", 20)
pd.set_option("display.max_rows", 20)
pd.set_option("display.max_colwidth", 80)

# Print NumPy floats with 4 digits of precision in fixed-point notation
# (suppress=True turns off scientific notation for small values).
np.set_printoptions(precision=4, suppress=True)

In [2]:
import kagglehub

# Download a pinned version (13) of the dataset so that re-runs fetch the same
# files. `path` is the local directory containing the downloaded files.
dataset_handle = "nikhileswarkomati/suicide-watch/versions/13"
path = kagglehub.dataset_download(dataset_handle)

print(f"Path to dataset files: {path}")

Downloading from https://www.kaggle.com/api/v1/datasets/download/nikhileswarkomati/suicide-watch?dataset_version_number=13...


100%|██████████| 115M/115M [00:01<00:00, 68.0MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/nikhileswarkomati/suicide-watch/versions/13


In [3]:
# `path` is a directory, so list the files that the download produced.
dataset_files = os.listdir(path)
print(dataset_files)

['SuicideAndDepression_Detection.csv']


In [4]:
# Load the CSV found in the download directory and preview the first rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, presumably a saved
# index from the source file -- confirm whether it should be dropped.
csv_name = "SuicideAndDepression_Detection.csv"
file_path = os.path.join(path, csv_name)
data = pd.read_csv(file_path)
data.head(20)

Unnamed: 0,text,class
0,Does life actually work for most / non-depressed people?It doesn't seem poss...,depression
1,"I found my friend's bodyIt was almost nine years ago now, but I still think ...",depression
2,Ex Wife Threatening SuicideRecently I left my wife for good because she has ...,SuicideWatch
3,Am I weird I don't get affected by compliments if it's coming from someone I...,teenagers
4,"Finally 2020 is almost over... So I can never hear ""2020 has been a bad year...",teenagers
5,"Reddit, I've never opened up to anyone with my life problems as much i am no...",depression
6,Somebody help me.I just had a terrible episode tonight. I feel hollow inside...,depression
7,I can't do this anymoreI've hidden away all summer in my room and I can't ev...,depression
8,i need helpjust help me im crying so hard,SuicideWatch
9,"I’m so lostHello, my name is Adam (16) and I’ve been struggling for years an...",SuicideWatch


# Data Cleaning & Preparation

In [5]:
# Count the rows per label. There are three classes in total and they are
# essentially balanced (the counts differ by at most one row).
class_counts = data["class"].value_counts()
class_counts

Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
SuicideWatch,116037
teenagers,116037
depression,116036


In [9]:
# Missing data accounts for only an extremely small part of the whole dataset.
# Report, per column, the share of rows whose value is missing.
data_len = len(data)

text_missing = data['text'].isna().sum() / data_len
class_missing = data['class'].isna().sum() / data_len

for label, missing in (("text", text_missing), ("class", class_missing)):
    # Shares below 1% are summarised coarsely. Otherwise print the actual
    # share, formatted as a percentage so that it matches the "Percentage"
    # label (the value itself is a fraction in [0, 1]).
    shown = "less than 1%" if missing < 0.01 else "{:.2%}".format(missing)
    print("Percentage of missing {} data: ".format(label), shown)

Percentage of missing text data:  less than 1%
Percentage of missing class data:  less than 1%


In [11]:
# Discard every row that has a missing value in any column, then renumber the
# index so that it is contiguous again.
data = data.dropna().reset_index(drop=True)

# Confirm that neither column of interest still contains missing values.
for column in ('text', 'class'):
    print(data[column].isna().sum())

0
0


In [14]:
# Count the rows whose text repeats an earlier row. A result of 0 means there
# is no duplication in the data.
duplicate_texts = data['text'].duplicated()
print(duplicate_texts.sum())

0


# Random Sampling

In [20]:
classes = data['class'].unique()
class_size = 200  # rows drawn per class -> 600 total for the 3 classes

# Draw the same number of random rows from every class. random_state is fixed
# so that re-running the notebook reproduces the same sample. Classes are
# processed in their order of first appearance (the order `unique()` returns).
class_samples = [
    data[data['class'] == cls].sample(n=class_size, random_state=64)
    for cls in classes
]

# Concatenate once instead of growing a frame inside the loop: this avoids
# re-copying the accumulated rows on every iteration and avoids concatenating
# with an empty DataFrame. ignore_index=True renumbers the rows 0..n-1, which
# takes the place of a separate reset_index(drop=True).
sampled_data = pd.concat(class_samples, ignore_index=True)

In [21]:
# Write the sampled rows to a CSV in the working directory; index=False keeps
# the row index out of the file.
output_csv = 'sampled_data_600.csv'
sampled_data.to_csv(output_csv, index=False)