### Import Libraries

In [None]:
import numpy as np
import pandas as pd

### Load Dataset

In [None]:
# Read the survey responses from the CSV file in the working directory into
# the notebook-wide DataFrame `df`.
import pandas as pd

df = pd.read_csv(filepath_or_buffer='survey.csv')


In [None]:
# First look at the freshly loaded data, then discard the columns that the
# exploratory analysis below does not use.

# Preview the first rows
print(df.head())

# Check dimensions
print("Shape:", df.shape)

# Data types and non-null counts. DataFrame.info() writes its own report and
# returns None, so it is called bare: wrapping it in print() emitted a stray
# "None" line after the report.
df.info()

# Summary statistics for numerical columns
print(df.describe())

# Drop irrelevant columns for EDA.
# Reassigning instead of using inplace=True, together with errors="ignore",
# keeps this cell re-runnable: a second execution no longer raises KeyError
# because the columns are already gone.
# NOTE: 'self_employed' is removed here, so it is not available to later cells.
df = df.drop(
    columns=["Timestamp", "state", "comments", "self_employed"],
    errors="ignore",
)



## EDA

### Summary Statistics (mean, median, mode, etc.)

In [None]:
# Descriptive statistics over every column (include='all' covers numeric and
# non-numeric columns alike).
summary_all = df.describe(include='all')
print(summary_all)

# DataFrame.mode() may return several rows when a column has ties; only the
# first row is reported here.
first_modes = df.mode().iloc[0]
print("Mode values:\n", first_modes)


### Missing Value Analysis

In [None]:
# Plotting libraries are imported here and reused by the cells that follow.
import matplotlib.pyplot as plt
import seaborn as sns

# Boolean mask of missing entries; summing it gives the per-column count.
null_mask = df.isnull()
print(null_mask.sum())

# Draw the mask itself as a heatmap (one row per record, one column per
# feature) so gaps show up as contrasting cells.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(null_mask, cbar=False, ax=ax)
ax.set_title("Missing Values Heatmap")
plt.show()


### Data Types & Unique Value Counts

In [None]:
# Storage type of each column.
column_types = df.dtypes
print(column_types)

# Number of distinct values in each column (nunique() skips missing values by default).
distinct_counts = df.nunique()
print("\nUnique values per column:\n", distinct_counts)


### Outlier Detection (e.g., for Age)

In [None]:
# Box plot of Age to make extreme values visible.
fig, ax = plt.subplots()
sns.boxplot(x=df['Age'], ax=ax)
ax.set_title("Age Outlier Detection")
plt.show()

# List the rows whose Age falls outside the 18-80 range used as a
# plausibility check. Rows with a missing Age match neither condition.
too_young = df['Age'] < 18
too_old = df['Age'] > 80
print("Ages under 18 or over 80:\n", df[too_young | too_old])


### Feature Distribution (Histogram)

In [None]:

# Histogram of Age using 30 bins; labels are set on the Axes that seaborn returns.
ax = sns.histplot(df['Age'], bins=30)
ax.set_title("Age Distribution")
ax.set_xlabel("Age")
ax.set_ylabel("Frequency")
plt.show()



### Gender Distribution

In [None]:

# Number of respondents per Gender value. Tick labels are tilted 45 degrees.
ax = sns.countplot(data=df, x='Gender')
ax.set_title("Gender Distribution")
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

### Target Variable (Treatment) Distribution

In [None]:

# Class balance of the target column `treatment`.
ax = sns.countplot(data=df, x='treatment')
ax.set_title("Treatment Distribution")
plt.show()

### Work Interfere vs Treatment


In [None]:
# Counts per `work_interfere` value, split by `treatment`.
ax = sns.countplot(data=df, x='work_interfere', hue='treatment')
ax.set_title("Work Interfere vs Treatment")
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

### Remote Work vs Treatment


In [None]:
# Counts per `remote_work` value, split by `treatment`.
ax = sns.countplot(data=df, x='remote_work', hue='treatment')
ax.set_title("Remote Work vs Treatment")
plt.show()

### Country-wise Treatment Rate


In [None]:
# Share of each `treatment` value within every country: value_counts with
# normalize=True yields per-country proportions, and unstack() pivots the
# treatment values into columns (combinations that never occur become NaN).
treatment_share = df.groupby('Country')['treatment'].value_counts(normalize=True)
country_treatment = treatment_share.unstack()

# One stacked bar per country.
ax = country_treatment.plot(kind='bar', stacked=True, figsize=(12, 6))
ax.set_title("Treatment by Country")
ax.set_ylabel("Proportion")
plt.show()

### Anonymity vs Seek Help


In [None]:
# Counts per `anonymity` value, split by `seek_help`.
ax = sns.countplot(data=df, x='anonymity', hue='seek_help')
ax.set_title("Anonymity vs Seek Help")
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()


### Grouped Aggregation - Family History vs Treatment

In [None]:

# Proportion of each `treatment` value within each `family_history` group.
print("\nFamily History vs Treatment:\n")
treatment_by_history = df.groupby('family_history')['treatment'].value_counts(normalize=True)
print(treatment_by_history)


### Care Options vs Treatment

In [None]:
# Counts per `care_options` value, split by `treatment`.
ax = sns.countplot(data=df, x='care_options', hue='treatment')
ax.set_title("Care Options vs Treatment")
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

### Mental vs Physical Health Perception vs Treatment


In [None]:
# Counts per `mental_vs_physical` value, split by `treatment`.
ax = sns.countplot(data=df, x='mental_vs_physical', hue='treatment')
ax.set_title("Mental vs Physical Health Perception vs Treatment")
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

### Correlation Heatmap (on encoded categorical data for analysis only)

In [None]:

# Work on a copy so the integer codes used for this plot never leak into `df`.
df_encoded = df.copy()

# Replace every non-numeric column by integer codes from pd.factorize.
# The check is "not numeric" rather than "dtype == 'object'" so that string
# and categorical dtypes are encoded as well; otherwise text columns would
# survive and DataFrame.corr() would fail on them.
# Caveats: the codes follow order of first appearance (they carry no ordinal
# meaning), and factorize assigns -1 to missing values, so read the
# coefficients as a rough exploratory signal only.
for column in df_encoded.columns:
    if not pd.api.types.is_numeric_dtype(df_encoded[column]):
        df_encoded[column] = pd.factorize(df_encoded[column])[0]

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(df_encoded.corr(), annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title("Feature Correlation Heatmap")
plt.show()


### Pairwise Feature Relationships


In [None]:
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt

# Pair plot on a small, integer-encoded copy of four columns; `df` itself is
# left untouched.
pairplot_df = df[['Age', 'Gender', 'remote_work', 'treatment']].copy()

# A single encoder is re-fitted for each column in turn; that is sufficient
# here because the codes are only needed for plotting and are never inverted.
le = LabelEncoder()
for column in ['Gender', 'remote_work', 'treatment']:
    pairplot_df[column] = le.fit_transform(pairplot_df[column])

# Plot at most 300 rows to keep rendering fast. Capping the sample size at the
# frame length avoids the ValueError that sample(n=300) raises when fewer than
# 300 rows are available; with 300+ rows the same sample as before is drawn.
sample_size = min(300, len(pairplot_df))
sample_df = pairplot_df.sample(n=sample_size, random_state=42)

sns.pairplot(sample_df)
plt.suptitle("Pairwise Feature Relationships", y=1.02)
plt.show()


# Data Preprocessing

### Check for Missing Values

In [None]:

# Report the number of missing entries per column before imputing.
print("\nMissing values in each column:")
print(df.isnull().sum())
print()

# Impute the listed columns with their most frequent value.
# - A column is skipped when it is not present in `df` ('self_employed' is
#   removed by the earlier column-drop cell, and indexing it raised KeyError).
# - The result is assigned back to the column instead of calling
#   fillna(inplace=True) on `df[col]`: the chained in-place form is deprecated
#   and does not update `df` when pandas copy-on-write is active.
for column in ['self_employed', 'work_interfere']:
    if column not in df.columns:
        continue
    modes = df[column].mode()
    if modes.empty:
        # Column holds no non-missing value, so there is nothing to impute with.
        continue
    df[column] = df[column].fillna(modes.iloc[0])

### Clean and Encode Categorical Variables

In [None]:
def clean_gender(g):
    """Collapse a free-text gender entry into 'Male', 'Female' or 'Other'.

    The value is converted to a string, stripped of surrounding whitespace
    and lower-cased before lookup. Only the exact spellings listed below are
    recognised; every other input (including missing values, which become
    the string 'nan' or 'none') maps to 'Other'.
    """
    aliases = {
        'male': 'Male', 'm': 'Male', 'man': 'Male',
        'female': 'Female', 'f': 'Female', 'woman': 'Female',
    }
    token = str(g).strip().lower()
    return aliases.get(token, 'Other')

# Normalise every Gender entry element-wise with clean_gender.
df['Gender'] = df['Gender'].map(clean_gender)


### Label Encode all categorical features

In [None]:
from sklearn.preprocessing import LabelEncoder
import joblib

# Columns whose values are replaced by integer codes.
categorical_columns = [
    'Gender', 'remote_work', 'family_history', 'work_interfere',
    'benefits', 'anonymity', 'leave',
]

# One fitted encoder per column, keyed by column name, so that the same
# value-to-code mapping can be re-applied after loading 'encoders.pkl'.
encoders = {}

for col in categorical_columns:
    le = LabelEncoder().fit(df[col])
    df[col] = le.transform(df[col])
    encoders[col] = le

# NOTE: this cell overwrites the columns of `df` in place, so running it a
# second time would encode the integer codes rather than the original values.
joblib.dump(encoders, 'encoders.pkl')


### Normalize/Scale Numerical Features


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale Age to the [0, 1] interval (MinMaxScaler's default feature_range).
# The double brackets pass a one-column DataFrame, the 2-D input the scaler
# expects.
# NOTE(review): the scaler is fitted on the full frame, i.e. before the
# train/test split performed in a later cell.
scaler = MinMaxScaler()
scaler.fit(df[['Age']])
df['Age'] = scaler.transform(df[['Age']])


### Split Into Training and Testing Sets

In [None]:
from sklearn.model_selection import train_test_split

# Build the feature matrix from the model-ready columns only: the scaled Age
# plus the columns that were label-encoded (`categorical_columns`).
# Dropping just 'treatment' kept the remaining text columns (e.g. Country,
# care_options, seek_help) in X, which the classifier cannot convert to
# numbers. Column order follows `df` so the saved column list is stable.
feature_columns = [
    col for col in df.columns
    if col == 'Age' or col in categorical_columns
]
X = df[feature_columns]
y = df['treatment']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Confirm shapes
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

# Persist the exact feature names and order the model is trained on.
joblib.dump(X.columns.tolist(), 'model_columns.pkl')



# Train Model

# Random Forest Classification








In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report, mean_squared_error

In [None]:
# Build a random forest with a fixed seed (so repeated runs grow the same
# trees) and fit it on the training split.
model = RandomForestClassifier(random_state=42)
model.fit(X=X_train, y=y_train)

In [None]:
# Predicted class label for every row of the held-out split.
y_pred = model.predict(X=X_test)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error, classification_report, confusion_matrix
import numpy as np

# `treatment` is used as the target without being label-encoded, so the class
# labels may be strings. f1_score defaults to pos_label=1, which is rejected
# when 1 is not one of the labels; the last entry of the sorted
# `model.classes_` is used as the positive class instead (this is 1 for 0/1
# labels, so numeric targets behave as before).
positive_label = model.classes_[-1]

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, pos_label=positive_label)

# mean_squared_error cannot subtract string labels. For binary 0/1 labels the
# squared error of a prediction is 1 when it is wrong and 0 otherwise, so the
# RMSE equals the square root of the misclassification rate; computing it this
# way gives the same number and also works for non-numeric labels.
misclassified = np.asarray(y_test) != np.asarray(y_pred)
rmse = np.sqrt(np.mean(misclassified))

print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("RMSE:", rmse)
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


In [None]:
import joblib

# Persist the trained classifier.
joblib.dump(value=model, filename='model.pkl')

# Persist the Age scaler so the same rescaling can be applied to new inputs.
joblib.dump(value=scaler, filename='scaler.pkl')

# NOTE(review): `le` is whatever encoder was assigned last; when the notebook
# is run top to bottom that is the one fitted for the final entry of
# `categorical_columns`. The per-column encoders are stored separately in
# 'encoders.pkl' - confirm which file downstream code is meant to load.
joblib.dump(value=le, filename='label_encoder.pkl')

