In [27]:
import os
import warnings

import numpy as np
import pandas as pd

In [28]:
# Load the formatted primary-survey workbook into `df`.
# The default location is specific to the original author's machine; set the
# PRIMARY_SURVEY_XLSX environment variable to read the workbook from elsewhere
# without editing this cell. An unset or empty variable falls back to the default.
DEFAULT_PRIMARY_SURVEY_XLSX = r"C:\Users\divya\OneDrive\Documents\SEM4\Capstone Project\Capstone Project\Data\Processed\Primary_Survey\PrimaryData_fmt.xlsx"
primary_survey_path = os.environ.get("PRIMARY_SURVEY_XLSX") or DEFAULT_PRIMARY_SURVEY_XLSX
df = pd.read_excel(primary_survey_path)

In [29]:
# Tally every answer to the depression question, keeping missing answers in the
# count, to see which categories exist before deriving a target from them.
df.loc[:, 'Problem_Depression'].value_counts(dropna=False)


Problem_Depression
Sometimes        68
Rarely           40
Not at all       23
Often            21
Almost always     5
Name: count, dtype: int64

In [30]:
# Binary target derived from the 'Problem_Depression' answers:
#   0 -> "Not at all", "Rarely"
#   1 -> "Sometimes", "Often", "Almost always"
DEPRESSION_TO_TARGET = {
    'Not at all': 0,
    'Rarely': 0,
    'Sometimes': 1,
    'Often': 1,
    'Almost always': 1,
}

# `Series.map` converts any label that is not a key of the dict into NaN, and
# `value_counts(normalize=True)` below drops NaN by default, so a misspelled or
# whitespace-padded answer would silently disappear from the target.
# Fail loudly on unexpected labels instead (missing answers are left as NaN).
unexpected_labels = set(df['Problem_Depression'].dropna().unique()) - set(DEPRESSION_TO_TARGET)
assert not unexpected_labels, f"Unmapped Problem_Depression responses: {unexpected_labels}"

df['Mental_Health_Issue'] = df['Problem_Depression'].map(DEPRESSION_TO_TARGET)

# Share of each class in the new target variable
target_distribution = df['Mental_Health_Issue'].value_counts(normalize=True)
target_distribution


Mental_Health_Issue
1    0.598726
0    0.401274
Name: proportion, dtype: float64

The Problem_Depression column contains responses categorized as "Not at all", "Rarely", "Sometimes", "Often", and "Almost always". Based on these responses, we can define the target variable for mental health issues as follows:

- **0 (No Significant Mental Health Issue):** responses "Not at all" and "Rarely".
- **1 (Potential Mental Health Issue):** responses "Sometimes", "Often", and "Almost always".


The target variable Mental_Health_Issue has been created: approximately 60% of respondents (94 of 157) are coded 1, indicating a potential mental health issue, and approximately 40% (63 of 157) are coded 0, indicating no significant mental health issue.

In [31]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Survey columns used as model inputs
feature_columns = ['Age', 'Gender', 'Current_Status', 'Problem_Interest', 'Problem_Sleep',
                   'Problem_Energy', 'Problem_Appetite', 'Problem_Self_Doubt', 'Problem_Concentration',
                   'Problem_Restlessness', 'Problem_Suicidal_Thoughts', 'Complete_Isolation',
                   'Connection_Loss_Society', 'Friendship_Quality', 'Engagement_Group',
                   'Prioritize_Well_Being', 'Current_Diet', 'Physical_Activity_Frequency', 'Sleep_Hours']

# Define the features and the target
X = df[feature_columns]
y = df['Mental_Health_Issue']

# Route every feature to a transformer by dtype.
# 'number' selects all numeric widths (not only int64/float64), and 'category'
# is accepted next to object/bool, so a column is not lost just because it was
# read with a different dtype.
categorical_features = X.select_dtypes(include=['object', 'bool', 'category']).columns.tolist()
numerical_features = X.select_dtypes(include='number').columns.tolist()

# ColumnTransformer is built below with its default remainder='drop', so a
# feature that appears in neither list would be discarded without any message.
# Surface that case explicitly.
unassigned_features = [
    column for column in feature_columns
    if column not in numerical_features and column not in categorical_features
]
if unassigned_features:
    warnings.warn(
        f"Features with an unhandled dtype will be dropped by the preprocessor: {unassigned_features}"
    )

# Categorical columns: fill gaps with the literal 'missing', then one-hot encode.
# handle_unknown='ignore' keeps transform() from failing on categories that
# only occur in the test split.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Numerical columns: fill gaps with the column mean, then standardize.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Hold out 20% of the rows for testing; the seed is fixed for reproducibility.
# NOTE(review): the split is not stratified. With a dataset this small,
# stratify=y may be worth considering, but it would change the recorded metrics.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the preprocessing on the training rows only, then apply the fitted
# transformation to the test rows so no test information leaks into the fit.
X_train_prepared = preprocessor.fit_transform(X_train)
X_test_prepared = preprocessor.transform(X_test)

X_train_prepared.shape, X_test_prepared.shape


((125, 95), (32, 95))

In [32]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.tree import DecisionTreeClassifier

# Baseline model: a decision tree with a fixed seed, fitted on the
# preprocessed training matrix.
dt_clf = DecisionTreeClassifier(random_state=42).fit(X_train_prepared, y_train)

# Predictions for the held-out rows
y_pred = dt_clf.predict(X_test_prepared)

# Score the predictions against the true test labels with each metric in turn
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

accuracy, precision, recall, f1

(0.6875, 0.6666666666666666, 0.75, 0.7058823529411765)

In [33]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Candidate classifiers, all seeded for reproducibility
log_reg = LogisticRegression(max_iter=1000, random_state=42)
rf_clf = RandomForestClassifier(random_state=42)
gb_clf = GradientBoostingClassifier(random_state=42)

# Display name -> estimator, in the order they are evaluated
models = dict([
    ("Logistic Regression", log_reg),
    ("Random Forest", rf_clf),
    ("Gradient Boosting", gb_clf),
])

# Display name -> metric name -> score on the test split
performance = {}

# NOTE: the loop rebinds y_pred / accuracy / precision / recall / f1, so after
# this cell those names hold the values of the last model evaluated.
for name, model in models.items():
    # Fit on the training matrix and predict the held-out rows
    y_pred = model.fit(X_train_prepared, y_train).predict(X_test_prepared)

    # Score the predictions
    accuracy, precision, recall, f1 = (
        accuracy_score(y_test, y_pred),
        precision_score(y_test, y_pred),
        recall_score(y_test, y_pred),
        f1_score(y_test, y_pred),
    )

    # Record the scores under the model's display name
    performance[name] = dict(zip(
        ('Accuracy', 'Precision', 'Recall', 'F1 Score'),
        (accuracy, precision, recall, f1),
    ))

performance


{'Logistic Regression': {'Accuracy': 0.625,
  'Precision': 0.5909090909090909,
  'Recall': 0.8125,
  'F1 Score': 0.6842105263157895},
 'Random Forest': {'Accuracy': 0.71875,
  'Precision': 0.6666666666666666,
  'Recall': 0.875,
  'F1 Score': 0.7567567567567568},
 'Gradient Boosting': {'Accuracy': 0.6875,
  'Precision': 0.65,
  'Recall': 0.8125,
  'F1 Score': 0.7222222222222222}}