## Preprocessing Based on EDA

### Importing libraries

In [102]:
import numpy as np
import pandas as pd
import os
from zipfile import ZipFile
import warnings
warnings.filterwarnings('ignore')
# sklearn
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score

from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer


### Loading Data

In [103]:
# Location of the zipped dataset, relative to this notebook.
path = "../Data/Early_Suicide_Prediction.zip"

# List the archive's members (name, modified time, size) to find the CSV to load.
with ZipFile(path) as archive:
    archive.printdir()

File Name                                             Modified             Size
Final_SP_dataSet.csv                           2025-06-30 17:54:52       100040


In [104]:
# Read the CSV member straight out of the zip archive into a DataFrame,
# without extracting it to disk first.
with ZipFile(path) as archive, archive.open("Final_SP_dataSet.csv") as csv_member:
    data_raw = pd.read_csv(csv_member)

# Preview the first rows of the raw data.
data_raw.head()

Unnamed: 0,Age,Gender,Stress Level,Academic Performance,Health Condition,Relationship Condition,Family Problem,Depression Level,Anxiety Level,Mental Support,Self Harm Story,Suicide Attempt
0,22,Female,Low,Good,Normal,In a relationship,,Always,Always,Family,No,Never Thought
1,23,Male,High,Good,Fair,Single,,Always,Often,loneliness,Yes,Attempted
2,19,Female,Moderate,Poor,Normal,Breakup,Parental conflict,Often,Often,Friends,Yes,Thought
3,18,Female,High,Average,Abnormal,Single,Financial,Always,Often,loneliness,Yes,Thought
4,19,Female,Low,Good,Fair,Single,,Sometimes,Sometimes,Family,No,Never Thought


In [105]:
# Work on an independent (deep) copy so the cleaning steps below never
# modify `data_raw`.
df = data_raw.copy(deep=True)

### Data Cleaning and Transformation

In [106]:
# Fill missing values in the 'Family Problem' column with 'No', as suggested
# by the EDA.
#
# The result is assigned back to the column rather than calling
# `df['Family Problem'].fillna(..., inplace=True)`: that form is chained
# assignment through an in-place method, which pandas 2.x deprecates (the
# FutureWarning is hidden in this notebook by warnings.filterwarnings('ignore'))
# and which leaves `df` unchanged once Copy-on-Write is in effect.
df['Family Problem'] = df['Family Problem'].fillna('No')
df.head()

Unnamed: 0,Age,Gender,Stress Level,Academic Performance,Health Condition,Relationship Condition,Family Problem,Depression Level,Anxiety Level,Mental Support,Self Harm Story,Suicide Attempt
0,22,Female,Low,Good,Normal,In a relationship,No,Always,Always,Family,No,Never Thought
1,23,Male,High,Good,Fair,Single,No,Always,Often,loneliness,Yes,Attempted
2,19,Female,Moderate,Poor,Normal,Breakup,Parental conflict,Often,Often,Friends,Yes,Thought
3,18,Female,High,Average,Abnormal,Single,Financial,Always,Often,loneliness,Yes,Thought
4,19,Female,Low,Good,Fair,Single,No,Sometimes,Sometimes,Family,No,Never Thought


In [107]:
# Drop fully duplicated rows (flagged in the EDA), reporting the duplicate
# count before and after so the effect is visible in the cell output.
n_dups_before = df.duplicated().sum()
print(f"Numbers of duplicates:{n_dups_before}")

df.drop_duplicates(inplace=True)

n_dups_after = df.duplicated().sum()
print(f"Numbers of duplicates after remove:{n_dups_after}")


Numbers of duplicates:45
Numbers of duplicates after remove:0


In [108]:
# Remove rows whose 'Anxiety Level' is the mis-entered lowercase 'sometimes'
# (flagged in the EDA), printing the category counts before and after.
counts_before = df['Anxiety Level'].value_counts()
print(f"Before Remove:{counts_before}\n")

keep_mask = df['Anxiety Level'] != 'sometimes'
df = df.loc[keep_mask]

counts_after = df['Anxiety Level'].value_counts()
print(f"After Remove:{counts_after}")


Before Remove:Anxiety Level
Often        456
Sometimes    435
Always       162
sometimes      1
Name: count, dtype: int64

After Remove:Anxiety Level
Often        456
Sometimes    435
Always       162
Name: count, dtype: int64


In [109]:
# Remove rows whose 'Depression Level' is the invalid entry 'pf' (flagged in
# the EDA), printing the category counts before and after.
counts_before = df['Depression Level'].value_counts()
print(f"Before Remove:{counts_before}\n")

keep_mask = df['Depression Level'] != 'pf'
df = df.loc[keep_mask]

counts_after = df['Depression Level'].value_counts()
print(f"After Remove:{counts_after}")


Before Remove:Depression Level
Sometimes    501
Often        364
Always       187
pf             1
Name: count, dtype: int64

After Remove:Depression Level
Sometimes    501
Often        364
Always       187
Name: count, dtype: int64


### Let's split our data

In [110]:
# Separate the target column from every other column of the cleaned frame.
y = df["Suicide Attempt"]
X = df.drop("Suicide Attempt", axis=1)

### Encoding Categorical Variables

In [112]:
# Partition the columns of X by dtype: numeric columns vs. object-dtype columns.
num_col = X.select_dtypes(include=['number']).columns
cat_col = X.select_dtypes(include=['object']).columns

In [None]:

# Columns earmarked for label encoding.
# NOTE(review): scikit-learn documents LabelEncoder for 1-D target labels; it
# handles one column at a time and does not accept the (X, y) call signature
# used by ColumnTransformer/Pipeline steps. How `label_encoder` is applied is
# not visible in this cell -- confirm the downstream usage.
label_enc_col = ["Gender", "Self Harm Story"]

# Ordered categorical columns mapped to their levels, lowest to highest.
# OrdinalEncoder's default (categories='auto') sorts levels alphabetically
# (e.g. 'High' < 'Low' < 'Moderate', 'Average' < 'Good' < 'Poor'), which throws
# away the ranking, so the order is spelled out explicitly here.
# NOTE(review): the level lists are taken from the values visible in the
# preview / value_counts output above -- confirm no other levels exist in the
# full data. With the default handle_unknown='error', fitting raises on a
# level that is not listed.
ordinal_levels = {
    "Stress Level": ["Low", "Moderate", "High"],
    "Depression Level": ["Sometimes", "Often", "Always"],
    "Anxiety Level": ["Sometimes", "Often", "Always"],
    "Academic Performance": ["Poor", "Average", "Good"],
    "Health Condition": ["Abnormal", "Fair", "Normal"],
}
# Same columns, in the same order, as the encoder's `categories` list below.
ordinal_enc_col = list(ordinal_levels)

# One-hot encoding: every remaining categorical column, i.e. those in neither
# list above.
onehot_enc_col = [
    col for col in cat_col
    if col not in ordinal_enc_col and col not in label_enc_col
]

# Initializing the encoders.
label_encoder = LabelEncoder()
onehot_encoder = OneHotEncoder()
# One category list per ordinal column, aligned with `ordinal_enc_col`.
ordinal_encoder = OrdinalEncoder(
    categories=[ordinal_levels[col] for col in ordinal_enc_col]
)

print("Label columns:", label_enc_col)
print("Ordinal columns:", ordinal_enc_col)
print("One-hot categorical columns:", onehot_enc_col)
print("Numeric columns:", num_col)
