In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, f1_score
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import warnings
warnings.filterwarnings("ignore")


In [2]:
df=pd.read_csv(r"C:\Users\Rahul\Desktop\hackathon\employee_train.csv")

In [3]:
df.head(5)

Unnamed: 0,S.No,Timestamp,Age,Gender,Country,state,self_employed,family_history,treatment,work_interfere,...,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence,comments
0,1,8/27/2014 11:29,37,Female,United States,IL,,No,Yes,Often,...,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes,No,
1,2,8/27/2014 11:29,44,M,United States,IN,,No,No,Rarely,...,Don't know,Maybe,No,No,No,No,No,Don't know,No,
2,3,8/27/2014 11:29,32,Male,Canada,,,No,No,Rarely,...,Somewhat difficult,No,No,Yes,Yes,Yes,Yes,No,No,
3,4,8/27/2014 11:29,31,Male,United Kingdom,,,Yes,Yes,Often,...,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe,No,Yes,
4,5,8/27/2014 11:30,31,Male,United States,TX,,No,No,Never,...,Don't know,No,No,Some of them,Yes,Yes,Yes,Don't know,No,


In [4]:
df.shape

(1048, 28)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048 entries, 0 to 1047
Data columns (total 28 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   S.No                       1048 non-null   int64 
 1   Timestamp                  1048 non-null   object
 2   Age                        1048 non-null   int64 
 3   Gender                     1048 non-null   object
 4   Country                    1048 non-null   object
 5   state                      636 non-null    object
 6   self_employed              1030 non-null   object
 7   family_history             1048 non-null   object
 8   treatment                  1048 non-null   object
 9   work_interfere             812 non-null    object
 10  no_employees               1048 non-null   object
 11  remote_work                1048 non-null   object
 12  tech_company               1048 non-null   object
 13  benefits                   1035 non-null   object
 14  care_opt

In [6]:
missing_value=round(df.isnull().sum()/len(df.index),2)*100
missing_columns =missing_value[missing_value>0]
missing_columns

state             39.0
self_employed      2.0
work_interfere    23.0
benefits           1.0
comments          87.0
dtype: float64

In [7]:
# Drop unnecessary columns
df.drop(["comments", "Timestamp", "S.No"], axis=1, inplace=True)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048 entries, 0 to 1047
Data columns (total 25 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Age                        1048 non-null   int64 
 1   Gender                     1048 non-null   object
 2   Country                    1048 non-null   object
 3   state                      636 non-null    object
 4   self_employed              1030 non-null   object
 5   family_history             1048 non-null   object
 6   treatment                  1048 non-null   object
 7   work_interfere             812 non-null    object
 8   no_employees               1048 non-null   object
 9   remote_work                1048 non-null   object
 10  tech_company               1048 non-null   object
 11  benefits                   1035 non-null   object
 12  care_options               1048 non-null   object
 13  wellness_program           1044 non-null   object
 14  seek_hel

In [9]:
df.shape

(1048, 25)

In [10]:
# Handling missing values
df["state"].fillna("Unknown", inplace=True)
df["self_employed"].fillna(df["self_employed"].mode()[0], inplace=True)
df["work_interfere"].fillna(df["work_interfere"].mode()[0], inplace=True)
df["benefits"].fillna(df["benefits"].mode()[0], inplace=True)


In [11]:
round(df.isnull().sum()/len(df.index),2)*100

Age                          0.0
Gender                       0.0
Country                      0.0
state                        0.0
self_employed                0.0
family_history               0.0
treatment                    0.0
work_interfere               0.0
no_employees                 0.0
remote_work                  0.0
tech_company                 0.0
benefits                     0.0
care_options                 0.0
wellness_program             0.0
seek_help                    0.0
anonymity                    0.0
leave                        0.0
mental_health_consequence    0.0
phys_health_consequence      0.0
coworkers                    0.0
supervisor                   0.0
mental_health_interview      0.0
phys_health_interview        0.0
mental_vs_physical           0.0
obs_consequence              0.0
dtype: float64

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048 entries, 0 to 1047
Data columns (total 25 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Age                        1048 non-null   int64 
 1   Gender                     1048 non-null   object
 2   Country                    1048 non-null   object
 3   state                      1048 non-null   object
 4   self_employed              1048 non-null   object
 5   family_history             1048 non-null   object
 6   treatment                  1048 non-null   object
 7   work_interfere             1048 non-null   object
 8   no_employees               1048 non-null   object
 9   remote_work                1048 non-null   object
 10  tech_company               1048 non-null   object
 11  benefits                   1048 non-null   object
 12  care_options               1048 non-null   object
 13  wellness_program           1044 non-null   object
 14  seek_hel

In [13]:
# Encode categorical features using One-Hot Encoding
categorical_cols = df.select_dtypes(include=["object"]).columns.tolist()
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [14]:
df

Unnamed: 0,Age,Gender_Agender,Gender_All,Gender_Androgyne,Gender_Cis Female,Gender_Cis Male,Gender_Enby,Gender_F,Gender_Femake,Gender_Female,...,coworkers_Yes,supervisor_Some of them,supervisor_Yes,mental_health_interview_No,mental_health_interview_Yes,phys_health_interview_No,phys_health_interview_Yes,mental_vs_physical_No,mental_vs_physical_Yes,obs_consequence_Yes
0,37,False,False,False,False,False,False,False,False,True,...,False,False,True,True,False,False,False,False,True,False
1,44,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,True,False,False,False,False
2,32,False,False,False,False,False,False,False,False,False,...,True,False,True,False,True,False,True,True,False,False
3,31,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,True
4,31,False,False,False,False,False,False,False,False,False,...,False,False,True,False,True,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1043,26,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
1044,29,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,True,True,False,True
1045,26,False,False,False,False,False,False,False,False,True,...,False,False,False,True,False,False,False,False,False,False
1046,33,False,False,False,False,False,False,False,False,False,...,False,False,True,True,False,True,False,False,False,False


In [15]:
# Feature engineering
df["Mental_Health_Awareness_Score"] = df["benefits_Yes"] + df["care_options_Yes"] + df["seek_help_Yes"] + df["wellness_program_Yes"]

In [16]:
# Define target variable
X = df.drop("treatment_Yes", axis=1)
y = df["treatment_Yes"]

In [17]:
# Handle class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [18]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42, stratify=y_resampled)


In [19]:
# Rename columns to remove special characters (for LightGBM compatibility)
X_train.columns = X_train.columns.str.replace("[^a-zA-Z0-9_]", "_", regex=True)
X_test.columns = X_test.columns.str.replace("[^a-zA-Z0-9_]", "_", regex=True)


In [20]:
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)


In [21]:
from sklearn.preprocessing import MinMaxScaler


In [22]:
scaler = MinMaxScaler()


In [23]:
X_train = scaler.fit_transform(X_train)


In [24]:
X_test = scaler.fit_transform(X_test)


In [25]:
#y_train = scaler.fit_transform(y_train)
y_train_scaled = scaler.fit_transform(y_train.values.reshape(-1, 1))

In [26]:
y_test = scaler.fit_transform(y_test.values.reshape(-1, 1))

In [27]:
# Model training and hyperparameter tuning
models = {
    "RandomForest": RandomForestClassifier(),
    "XGBoost": XGBClassifier(),
    "LightGBM": LGBMClassifier()
}

param_grid = {
    "RandomForest": {"n_estimators": [100, 200], "max_depth": [10, 20]},
    "XGBoost": {"learning_rate": [0.01, 0.1], "n_estimators": [100, 200]},
    "LightGBM": {"learning_rate": [0.01, 0.1], "n_estimators": [100, 200]}
}

best_models = {}

for model_name, model in models.items():
    grid_search = GridSearchCV(model, param_grid[model_name], scoring="f1", cv=5, verbose=1, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_models[model_name] = grid_search.best_estimator_

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Fitting 5 folds for each of 4 candidates, totalling 20 fits
[LightGBM] [Info] Number of positive: 375, number of negative: 375
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000490 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 155
[LightGBM] [Info] Number of data points in the train set: 750, number of used features: 60
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [28]:
# Evaluate models
#for name, model in best_models.items():
    #y_pred = model.predict(X_test)
    #f1 = f1_score(y_test, y_pred)
    #print(f"{name} F1 Score: {f1:.4f}")

In [62]:

# Choose the best model and adjust the decision threshold
#best_model = best_models["XGBoost"]  # Assuming XGBoost performs best
#y_probs = best_model.predict_proba(X_test)[:, 1]


In [65]:
# Start with logistic regression
from sklearn.linear_model import LogisticRegression

In [67]:
lg=LogisticRegression()

In [69]:
lg.fit(X_train,y_train)

In [70]:
y_predlg=lg.predict(X_train)
from sklearn.metrics import confusion_matrix,classification_report
print(classification_report(y_train,y_predlg))

              precision    recall  f1-score   support

       False       0.80      0.81      0.81       375
        True       0.81      0.80      0.81       375

    accuracy                           0.81       750
   macro avg       0.81      0.81      0.81       750
weighted avg       0.81      0.81      0.81       750



In [72]:
# modelling with support vector machines(classifier)
#from sklearn.svm import SVC

In [74]:
#svc=SVC()

In [76]:
#svc.fit(X_train,y_train)

In [79]:
#y_pred_svc=svc.predict(X_train)
#print(classification_report(y_train,y_pred_svc))

In [81]:
# modelling with decision tree classifier
#from sklearn.tree import DecisionTreeClassifier

In [83]:
#dt=DecisionTreeClassifier()

In [85]:
#dt.fit(X_train,y_train)

In [87]:
#y_pred_dt=dt.predict(X_train)

In [89]:
#print(classification_report(y_train,y_pred_dt))

In [91]:
# Load test data
df_test = pd.read_csv(r"C:\Users\Rahul\Desktop\hackathon\employee_test.csv")  # Change filename if needed



In [93]:
df_test

Unnamed: 0,S.No,Timestamp,Age,Gender,Country,state,self_employed,family_history,work_interfere,no_employees,...,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence,comments
0,1,8/29/2014 11:32,39,Male,United Kingdom,,Yes,Yes,Sometimes,5-Jan,...,Somewhat difficult,No,No,Yes,Yes,No,Maybe,Yes,Yes,These result may be a tad confusing so a summa...
1,2,8/29/2014 11:32,26,female,United States,WA,No,Yes,Sometimes,More than 1000,...,Don't know,No,No,Some of them,Yes,No,Maybe,No,Yes,I should note one of the places my employer fa...
2,3,8/29/2014 11:33,23,Female,United States,IL,No,Yes,Sometimes,26-100,...,Somewhat difficult,Yes,No,No,Some of them,No,Maybe,No,No,
3,4,8/29/2014 11:34,35,Male,Switzerland,,No,Yes,Often,More than 1000,...,Very easy,No,No,Some of them,Some of them,No,Maybe,No,No,
4,5,8/29/2014 11:36,36,Male,United States,FL,No,No,Never,5-Jan,...,Very easy,No,No,Some of them,Some of them,No,No,Don't know,No,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
205,206,9/12/2015 11:17,26,male,United Kingdom,,No,No,,26-100,...,Somewhat easy,No,No,Some of them,Some of them,No,No,Don't know,No,
206,207,9/26/2015 1:07,32,Male,United States,IL,No,Yes,Often,26-100,...,Somewhat difficult,No,No,Some of them,Yes,No,No,Yes,No,
207,208,11/7/2015 12:36,34,male,United States,CA,No,Yes,Sometimes,More than 1000,...,Somewhat difficult,Yes,Yes,No,No,No,No,No,No,
208,209,11/30/2015 21:25,46,f,United States,NC,No,No,,100-500,...,Don't know,Yes,No,No,No,No,No,No,No,


In [95]:
# Drop unnecessary columns
df_test.drop(["comments", "Timestamp", "S.No"], axis=1, inplace=True)



In [97]:
df_test

Unnamed: 0,Age,Gender,Country,state,self_employed,family_history,work_interfere,no_employees,remote_work,tech_company,...,anonymity,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence
0,39,Male,United Kingdom,,Yes,Yes,Sometimes,5-Jan,Yes,Yes,...,Yes,Somewhat difficult,No,No,Yes,Yes,No,Maybe,Yes,Yes
1,26,female,United States,WA,No,Yes,Sometimes,More than 1000,No,Yes,...,Don't know,Don't know,No,No,Some of them,Yes,No,Maybe,No,Yes
2,23,Female,United States,IL,No,Yes,Sometimes,26-100,No,No,...,Don't know,Somewhat difficult,Yes,No,No,Some of them,No,Maybe,No,No
3,35,Male,Switzerland,,No,Yes,Often,More than 1000,No,Yes,...,Yes,Very easy,No,No,Some of them,Some of them,No,Maybe,No,No
4,36,Male,United States,FL,No,No,Never,5-Jan,Yes,Yes,...,Don't know,Very easy,No,No,Some of them,Some of them,No,No,Don't know,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
205,26,male,United Kingdom,,No,No,,26-100,No,Yes,...,Don't know,Somewhat easy,No,No,Some of them,Some of them,No,No,Don't know,No
206,32,Male,United States,IL,No,Yes,Often,26-100,Yes,Yes,...,Yes,Somewhat difficult,No,No,Some of them,Yes,No,No,Yes,No
207,34,male,United States,CA,No,Yes,Sometimes,More than 1000,No,Yes,...,Don't know,Somewhat difficult,Yes,Yes,No,No,No,No,No,No
208,46,f,United States,NC,No,No,,100-500,Yes,Yes,...,Don't know,Don't know,Yes,No,No,No,No,No,No,No


In [99]:
df_test.isnull().sum()

Age                            0
Gender                         0
Country                        0
state                        103
self_employed                  0
family_history                 0
work_interfere                28
no_employees                   0
remote_work                    0
tech_company                   0
benefits                       3
care_options                   0
wellness_program               2
seek_help                      0
anonymity                      0
leave                          5
mental_health_consequence      0
phys_health_consequence        0
coworkers                      0
supervisor                     0
mental_health_interview        0
phys_health_interview          0
mental_vs_physical             0
obs_consequence                0
dtype: int64

In [101]:
# Handle missing values (same as training data)
df_test["state"].fillna("Unknown", inplace=True)
df_test["self_employed"].fillna(df_test["self_employed"].mode()[0], inplace=True)
df_test["work_interfere"].fillna(df_test["work_interfere"].mode()[0], inplace=True)
df_test["benefits"].fillna(df_test["benefits"].mode()[0], inplace=True)



In [102]:
# Ensure 'treatment' is not included in categorical_cols
categorical_cols = [col for col in categorical_cols if col != "treatment"]

# Apply One-Hot Encoding to test data
df_test = pd.get_dummies(df_test, columns=categorical_cols, drop_first=True)


In [105]:
# Convert X_train back to DataFrame using original feature names
feature_names = X.columns  # Get original feature names before train-test split
X_train_df = pd.DataFrame(X_train, columns=feature_names)

# Ensure df_test columns match training features
missing_cols = set(X_train_df.columns) - set(df_test.columns)

# Add missing columns with default value 0
for col in missing_cols:
    df_test[col] = 0

# Ensure df_test has the same column order as X_train
df_test = df_test[X_train_df.columns]

print("df_test processed successfully!")


df_test processed successfully!


In [106]:
# Convert X_train back to a DataFrame
feature_names = X.columns  # Assuming X was the original DataFrame before train-test split
X_train_df = pd.DataFrame(X_train, columns=feature_names)

# Step 1: Get categorical columns from X_train (excluding target)
categorical_cols = list(set(X_train_df.select_dtypes(include=['object']).columns) - {"treatment"})

# Step 2: Apply One-Hot Encoding on df_test
df_test = pd.get_dummies(df_test, columns=categorical_cols, drop_first=True)

# Step 3: Add any missing columns (if some categories are missing in df_test)
missing_cols = set(X_train_df.columns) - set(df_test.columns)

for col in missing_cols:
    df_test[col] = 0  # Add missing columns with default value 0

# Step 4: Ensure column order matches training data
df_test = df_test[X_train_df.columns]

print("df_test processed successfully!")


df_test processed successfully!


In [108]:
# Apply the same encoding as training data
df_test = pd.get_dummies(df_test, columns=categorical_cols, drop_first=True)



In [111]:
from sklearn.metrics import precision_recall_curve
best_model = best_models["XGBoost"]  # or whichever model performed best

# Get probabilities on validation/test set
y_probs = best_model.predict_proba(X_test)[:, 1]

# Compute Precision-Recall Curve
precisions, recalls, thresholds = precision_recall_curve(y_test, y_probs)

# Find best threshold using F1-score optimization
best_threshold = thresholds[np.argmax(2 * (precisions * recalls) / (precisions + recalls))]

print(f"Best Threshold: {best_threshold:.4f}")


Best Threshold: 0.2243


In [114]:
# Make predictions using the best model
y_test_probs = best_model.predict_proba(df_test)[:, 1]

# Apply best threshold from Step 4
y_test_pred = (y_test_probs >= best_threshold).astype(int)

# Load sample submission file
sample_submission = pd.read_csv(r"C:\Users\Rahul\Desktop\hackathon\sample_submission.csv")





# Create submission DataFrame
submission = pd.DataFrame({
    "S.No": sample_submission["S.No"],  # Keep original IDs
    "treatment": y_test_pred  # Add predictions
})
# Convert 0 and 1 to "Yes" and "No"
submission["treatment"] = submission["treatment"].map({1: "Yes", 0: "No"})

# Save to CSV
submission.to_csv("submission_3.csv", index=False)
print(" Submission file saved successfully!")


 Submission file saved successfully!
