**Data Cleaning & Preprocessing**

In [1]:
import pandas as pd

# Load dataset
df = pd.read_csv("survey.csv")  # Change to your dataset filename

# Display basic info
print(df.info())
print(df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1259 entries, 0 to 1258
Data columns (total 27 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Timestamp                  1259 non-null   object
 1   Age                        1259 non-null   int64 
 2   Gender                     1259 non-null   object
 3   Country                    1259 non-null   object
 4   state                      744 non-null    object
 5   self_employed              1241 non-null   object
 6   family_history             1259 non-null   object
 7   treatment                  1259 non-null   object
 8   work_interfere             995 non-null    object
 9   no_employees               1259 non-null   object
 10  remote_work                1259 non-null   object
 11  tech_company               1259 non-null   object
 12  benefits                   1259 non-null   object
 13  care_options               1259 non-null   object
 14  wellness

Check for null values

In [2]:
print(df.isnull().sum())

Timestamp                       0
Age                             0
Gender                          0
Country                         0
state                         515
self_employed                  18
family_history                  0
treatment                       0
work_interfere                264
no_employees                    0
remote_work                     0
tech_company                    0
benefits                        0
care_options                    0
wellness_program                0
seek_help                       0
anonymity                       0
leave                           0
mental_health_consequence       0
phys_health_consequence         0
coworkers                       0
supervisor                      0
mental_health_interview         0
phys_health_interview           0
mental_vs_physical              0
obs_consequence                 0
comments                     1095
dtype: int64


 Drop irrelevant columns

In [3]:
df = df.drop(columns=['Timestamp', 'comments'], errors='ignore')

**Handling Missing Values**

In [4]:
df['self_employed'].fillna('No', inplace=True)  # Assume 'No' for missing values
df['work_interfere'].fillna('Unknown', inplace=True)  # Replace missing with 'Unknown'

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['self_employed'].fillna('No', inplace=True)  # Assume 'No' for missing values
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['work_interfere'].fillna('Unknown', inplace=True)  # Replace missing with 'Unknown'


In [5]:
# Dropping 'state' column as it has too many missing values
df.drop(columns=['state'], inplace=True)

Fixing Age Column (Removing Outliers)

In [6]:
df = df[(df['Age'] >= 18) & (df['Age'] <= 100)]


**Handling Categorical Variables**

In [7]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder


In [8]:
# Label Encoding for Target Variable ('treatment' - Yes/No to 1/0)
label_encoder = LabelEncoder()
df['treatment'] = label_encoder.fit_transform(df['treatment'])

In [9]:
# One-Hot Encoding for Categorical Features
categorical_columns = ['Gender', 'Country', 'self_employed', 'family_history', 'work_interfere',
                       'no_employees', 'remote_work', 'tech_company', 'benefits', 'care_options',
                       'wellness_program', 'seek_help', 'anonymity', 'leave',
                       'mental_health_consequence', 'phys_health_consequence', 'coworkers',
                       'supervisor', 'mental_health_interview', 'phys_health_interview',
                       'mental_vs_physical', 'obs_consequence']

df_encoded = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

**Final Data Summary**

In [10]:
print("Cleaned Dataset Shape:", df_encoded.shape)
print(df_encoded.head())

Cleaned Dataset Shape: (1251, 134)
   Age  treatment  Gender_Androgyne  Gender_Cis Female  Gender_Cis Male  \
0   37          1             False              False            False   
1   44          0             False              False            False   
2   32          0             False              False            False   
3   31          1             False              False            False   
4   31          0             False              False            False   

   Gender_Cis Man  Gender_Enby  Gender_F  Gender_Femake  Gender_Female  ...  \
0           False        False     False          False           True  ...   
1           False        False     False          False          False  ...   
2           False        False     False          False          False  ...   
3           False        False     False          False          False  ...   
4           False        False     False          False          False  ...   

   coworkers_Yes  supervisor_Some of th

# **Step 2: Model Development 🚀**

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import joblib

In [14]:
X = df.drop(columns=['treatment'])  # Features
y = df['treatment']  # Target Variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

Model Training & Evaluation

In [26]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
models = {
        "Random Forest": RandomForestClassifier(random_state=42),
        "XGBoost": XGBClassifier(random_state=42),
        "Logistic Regression": LogisticRegression(random_state=42)
    }

trained_models = {}
for model_name, model in models.items():
  model.fit(X_train, y_train)
  trained_models[model_name] = model

**Evaluation Metrics**

In [28]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

for model_name, model in trained_models.items():
  y_pred = model.predict(X_test)
  accuracy = accuracy_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred, average='weighted')  # Use 'weighted' for multi-class
  recall = recall_score(y_test, y_pred, average='weighted')
  f1 = f1_score(y_test, y_pred, average='weighted')
  print(f"Model: {model_name}")
  print(f"Accuracy: {accuracy:.4f}")
  print(f"Precision: {precision:.4f}")
  print(f"Recall: {recall:.4f}")
  print(f"F1-score: {f1:.4f}")
  print("-" * 20)

Model: Random Forest
Accuracy: 0.8486
Precision: 0.8557
Recall: 0.8486
F1-score: 0.8477
--------------------
Model: XGBoost
Accuracy: 0.8406
Precision: 0.8436
Recall: 0.8406
F1-score: 0.8402
--------------------
Model: Logistic Regression
Accuracy: 0.8526
Precision: 0.8536
Recall: 0.8526
F1-score: 0.8524
--------------------


# ** Developing a Basic UI (Streamlit) and CLI Interface**

In [29]:
pip install streamlit

Collecting streamlit
  Downloading streamlit-1.42.0-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.42.0-py2.py3-none-any.whl (9.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m46.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m50.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[

In [31]:
import streamlit as st
import pandas as pd
import joblib  # For loading the trained model

# Save the desired model before loading
# Assuming you want to save the Logistic Regression Model
joblib.dump(trained_models["Logistic Regression"], 'trained_model.pkl')

# Load the trained model
model = joblib.load('trained_model.pkl')  # Replace 'trained_model.pkl' with the actual model filename

# ... (rest of your Streamlit app code) ...