## Preprocessing Based on EDA

### Importing libraries

In [None]:
import os
import warnings
from zipfile import ZipFile

warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
# sklearn
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# imbalanced-learn
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline


### Loading Data

In [3]:
# Read the raw dataset from a relative path into a DataFrame.
path = "../Data/Early_Suicide_Prediction.csv"
data_raw = pd.read_csv(filepath_or_buffer=path)

In [4]:
# path="../Data/Early_Suicide_Prediction.zip"
# with ZipFile(path) as zipread:
#     zipread.printdir()

In [5]:
# with ZipFile(path) as zipread:
#     with zipread.open("Final_SP_dataSet.csv") as file:
#         data_raw=pd.read_csv(file)

# view head of the data

In [6]:
# Preview the first five rows of the raw data.
data_raw.head(n=5)

Unnamed: 0,Age,Gender,Stress Level,Academic Performance,Health Condition,Relationship Condition,Family Problem,Depression Level,Anxiety Level,Mental Support,Self Harm Story,Suicide Attempt
0,22,Female,Low,Good,Normal,In a relationship,,Always,Always,Family,No,Never Thought
1,23,Male,High,Good,Fair,Single,,Always,Often,loneliness,Yes,Attempted
2,19,Female,Moderate,Poor,Normal,Breakup,Parental conflict,Often,Often,Friends,Yes,Thought
3,18,Female,High,Average,Abnormal,Single,Financial,Always,Often,loneliness,Yes,Thought
4,19,Female,Low,Good,Fair,Single,,Sometimes,Sometimes,Family,No,Never Thought


In [7]:
# Work on an independent (deep) copy so that data_raw itself is never modified.
df = data_raw.copy(deep=True)

### Data Cleaning and Transformation

In [8]:
# Fill NaN in the 'Family Problem' column with 'No', as suggested in EDA.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the selected Series: that chained in-place form only
# modifies a temporary object under pandas Copy-on-Write (and raises a
# FutureWarning in pandas 2.x), which would leave the NaNs in df.
df['Family Problem'] = df['Family Problem'].fillna('No')
df.head()

Unnamed: 0,Age,Gender,Stress Level,Academic Performance,Health Condition,Relationship Condition,Family Problem,Depression Level,Anxiety Level,Mental Support,Self Harm Story,Suicide Attempt
0,22,Female,Low,Good,Normal,In a relationship,No,Always,Always,Family,No,Never Thought
1,23,Male,High,Good,Fair,Single,No,Always,Often,loneliness,Yes,Attempted
2,19,Female,Moderate,Poor,Normal,Breakup,Parental conflict,Often,Often,Friends,Yes,Thought
3,18,Female,High,Average,Abnormal,Single,Financial,Always,Often,loneliness,Yes,Thought
4,19,Female,Low,Good,Fair,Single,No,Sometimes,Sometimes,Family,No,Never Thought


In [9]:
# Remove exact duplicate rows (as suggested in EDA), reporting the count before and after.
print("Numbers of duplicates:", df.duplicated().sum(), sep="")
df.drop_duplicates(inplace=True)
print("Numbers of duplicates after remove:", df.duplicated().sum(), sep="")


Numbers of duplicates:45
Numbers of duplicates after remove:0


In [10]:
# Normalise the inconsistent label 'sometimes' to 'Sometimes', as suggested in EDA.
# The value is corrected; no row is dropped.
print("Before Remove:", df['Anxiety Level'].value_counts(), "\n", sep="")
df['Anxiety Level'] = df['Anxiety Level'].replace(to_replace='sometimes', value='Sometimes')
print("After Remove:", df['Anxiety Level'].value_counts(), sep="")


Before Remove:Anxiety Level
Often        456
Sometimes    435
Always       162
sometimes      1
Name: count, dtype: int64

After Remove:Anxiety Level
Often        456
Sometimes    436
Always       162
Name: count, dtype: int64


In [11]:
# Replace the invalid entry 'pf' with 'Sometimes', as suggested in EDA.
print("Before Remove:", df['Depression Level'].value_counts(), "\n", sep="")
df['Depression Level'] = df['Depression Level'].replace(to_replace='pf', value='Sometimes')
print("After Remove:", df['Depression Level'].value_counts(), sep="")


Before Remove:Depression Level
Sometimes    501
Often        364
Always       188
pf             1
Name: count, dtype: int64

After Remove:Depression Level
Sometimes    502
Often        364
Always       188
Name: count, dtype: int64


### Let's split our data into features and target

In [12]:
# Binary target: 0 for "Never Thought", 1 for both "Thought" and "Attempted".
# Labels missing from the mapping become NaN in the 'target' column.
mapping = {
    "Never Thought": 0,
    "Thought": 1,
    "Attempted": 1,
}
df["target"] = df["Suicide Attempt"].map(mapping)

In [18]:
# Separate the target from the predictors. The raw 'Suicide Attempt' label is dropped
# together with 'target' so that neither ends up among the features.
y = df["target"]
X = df.drop(["Suicide Attempt", "target"], axis=1)

### Encoding Categorical Variables

In [15]:
# Column definition: feature names grouped by dtype (numeric vs. object).
num_col = X.select_dtypes(include=[np.number]).columns
cat_col = X.select_dtypes(include=["object"]).columns

In [None]:

# Ordinal encoding
ordinal_enc_col=["Stress Level","Depression Level","Anxiety Level","Academic Performance","Health Condition"]

#one_hot_encoding
onehot_enc_col=["Gender","Self Harm Story","Relationship Condition","Family Problem","Mental Support"]

# setting order for the categories
ordinal_categories = [
    ["Low", "Moderate", "High"],                 # Stress Level
    ["Sometimes", "Often", "Always"],            # Depression Level
    ["Sometimes", "Often", "Always"],            # Anxiety Level
    ["Poor", "Average", "Good", "Excellent"],    # Academic Performance
    ["Abnormal", "Fair", "Normal"]               # Health Condition
]

# initilizind the encoders
onehot_encoder=OneHotEncoder(drop='first',handle_unknown='ignore',sparse_output=False)
ordinal_encoder= OrdinalEncoder(categories=ordinal_categories, handle_unknown='use_encoded_value', unknown_value=-1)

preprocessor= ColumnTransformer(
    transformers=[
        ("onehot",onehot_encoder,onehot_enc_col),
        ("ordinal",ordinal_encoder, ordinal_enc_col),
        ("num", SimpleImputer(strategy='mean'), num_col)
    ]
) 


Label columns: ['Gender', 'Self Harm Story']
Ordinal columns: ['Stress Level', 'Depression Level', 'Anxiety Level', 'Academic Performance', 'Health Condition']
One-hot categorical columns: ['Relationship Condition', 'Family Problem', 'Mental Support']
Numeric columns: Index(['Age', 'target'], dtype='object')


### Building the Pipeline

In [None]:
# Preprocess the features, then oversample with SMOTE.
# NOTE(review): no estimator step is attached yet — presumably a classifier is
# meant to follow 'smote'; confirm before fitting.
pipeline=ImbPipeline(steps=[
    ('preproc', preprocessor),
    ('smote', SMOTE(random_state=42)),  # fixed seed keeps the resampling reproducible
])