In [3]:
# Cell 1 — Imports & config (edit DATA_PATH if needed)
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import joblib

# PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# Config — the dataset location can be overridden without editing the notebook
# by setting the HR_SURVEY_DATA_PATH environment variable; when it is unset the
# original default path below is used.
DATA_PATH = os.environ.get(
    "HR_SURVEY_DATA_PATH",
    r"C:\Users\sreek\Downloads\HR_Engagement_Survey_Data_with_Question_Details.csv",
)
DROP_THRESHOLD = 30       # minutes; durations strictly below this satisfy the dropout rule
RANDOM_STATE = 42         # seed shared by NumPy, PyTorch and the sklearn split/model
BATCH_SIZE = 256          # mini-batch size for training and batched inference
EPOCHS = 20               # number of passes over the training set
LR = 1e-3                 # Adam learning rate

# Reproducibility: seed every RNG the notebook relies on.
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(RANDOM_STATE)


In [4]:
# Cell 2 — Read the CSV into `df` and print a quick overview of its contents
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"{DATA_PATH} not found. Upload the file to the notebook or change DATA_PATH.")

# low_memory=False avoids the mixed-type warnings that chunked reading can emit
df = pd.read_csv(DATA_PATH, low_memory=False)
print("Loaded:", DATA_PATH)
print("Shape:", df.shape)
display(df.head())
print("Columns:", df.columns.tolist())

# Report, column by column, whether the inputs of the dropout rule are present
required_cols = [
    'Mood Before Workout',
    'Mood After Workout',
    'Workout Intensity',
    'Workout Duration (mins)',
]
present = set(df.columns)
for c in required_cols:
    status = 'FOUND' if c in present else 'MISSING'
    print(f"{c}: {status}")


Loaded: C:\Users\sreek\Downloads\HR_Engagement_Survey_Data_with_Question_Details.csv
Shape: (132549, 12)


Unnamed: 0,Year,Status,Role,Department,Director,Manager,Supervisor,Lead,Staff,Question,Answer_Numeric,Answer_Text
0,2020 Jun,Complete,Staff,Human Services,False,False,False,False,True,"07. At work, my opinions seem to count.",3,Agree
1,2020 Jun,Complete,Staff,District Court,False,False,False,False,True,"07. At work, my opinions seem to count.",3,Agree
2,2020 Jun,Complete,Staff,Superior Court,False,False,False,False,True,"07. At work, my opinions seem to count.",3,Agree
3,2021 May,Complete,Lead,Human Services,False,False,False,True,False,"07. At work, my opinions seem to count.",3,Agree
4,2019 May,Partial,,,False,False,False,False,False,01. I know what is expected of me at work.,0,


Columns: ['Year', 'Status', 'Role', 'Department', 'Director', 'Manager', 'Supervisor', 'Lead', 'Staff', 'Question', 'Answer_Numeric', 'Answer_Text']
Mood Before Workout: MISSING
Mood After Workout: MISSING
Workout Intensity: MISSING
Workout Duration (mins): MISSING


In [5]:
# Cell 3 — Create the rule-based dropout label (safe to run even if some columns are missing)
def generate_dropout_label(row, threshold=DROP_THRESHOLD):
    """Return the rule-based dropout label (1 or 0) for one record.

    A record is labelled 1 only when all of the following hold:
      * 'Mood Before Workout' is 'Tired' or 'Stressed',
      * 'Mood After Workout' is 'Fatigued' or 'Neutral',
      * 'Workout Intensity' is 'Low',
      * 'Workout Duration (mins)' parses to a number strictly below ``threshold``.

    Args:
        row: Record exposing ``.get(key, default)`` (for example the pandas
            Series passed by ``DataFrame.apply(..., axis=1)``).
        threshold: Exclusive upper bound on the workout duration, in minutes.

    Returns:
        int: 1 if the rule matches, otherwise 0. Missing keys and unparseable
        or NaN durations never match, so they yield 0.
    """
    before = str(row.get('Mood Before Workout', '')).strip()
    after = str(row.get('Mood After Workout', '')).strip()
    intensity = str(row.get('Workout Intensity', '')).strip()
    dur_raw = row.get('Workout Duration (mins)', '')

    try:
        duration_text = str(dur_raw).strip()
        duration = float(duration_text) if duration_text else np.nan
    except (TypeError, ValueError):
        # Only parse failures mean "no usable duration". A bare `except:` would
        # also swallow KeyboardInterrupt/SystemExit during a long `apply`.
        duration = np.nan

    # Guard clauses: every condition must hold for a positive label.
    if before not in ('Tired', 'Stressed'):
        return 0
    if after not in ('Fatigued', 'Neutral'):
        return 0
    if intensity != 'Low':
        return 0
    if np.isnan(duration):
        return 0
    return 1 if duration < threshold else 0

# Apply the rule row by row and inspect the resulting label distribution.
# The rule yields 0 whenever one of its input columns is absent, so report
# missing inputs explicitly instead of letting a constant target go unnoticed.
rule_cols = ['Mood Before Workout', 'Mood After Workout', 'Workout Intensity', 'Workout Duration (mins)']
missing_rule_cols = [c for c in rule_cols if c not in df.columns]
if missing_rule_cols:
    print("WARNING: rule input columns missing from the data:", missing_rule_cols)
    print("         Every row will be labelled 0 — check that DATA_PATH points to the intended dataset.")

df['dropout'] = df.apply(generate_dropout_label, axis=1)
print("Dropout label counts:")
print(df['dropout'].value_counts(dropna=False))

if df['dropout'].nunique() < 2:
    # With a single class there is nothing for a classifier to learn or be scored on.
    print("WARNING: the dropout label has a single class; downstream training and metrics will be trivial.")


Dropout label counts:
dropout
0    132549
Name: count, dtype: int64


In [6]:
# Cell 4 — Preprocessing: separate features & target, handle mixed types (robust)
# NOTE: categorical columns are converted to strings so OneHotEncoder never sees
# mixed types within one column.

# Features / target
X = df.drop(columns=['dropout']).copy()
y = df['dropout'].astype(int).copy()

# Object, category and bool columns are treated as categorical; every other
# column is treated as numeric-ish and coerced below.
categorical_cols = X.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()
numeric_cols = [c for c in X.columns if c not in categorical_cols]

print("Categorical cols (converted to str):", categorical_cols)
print("Numeric-ish cols (will be coerced):", numeric_cols[:10], "... (total", len(numeric_cols), ")")

# Replace missing categorical values with '' BEFORE casting to str. Casting
# first turns NaN into the literal string 'nan', after which fillna('') has
# nothing left to fill. Going through object dtype also allows '' to be written
# into 'category' columns whose categories do not include it.
if len(categorical_cols) > 0:
    cat_values = X[categorical_cols].astype(object)
    X[categorical_cols] = cat_values.where(cat_values.notna(), '').astype(str)

# Coerce numeric-ish columns to numeric (non-convertible -> NaN), then fill NaNs
for col in numeric_cols:
    X[col] = pd.to_numeric(X[col], errors='coerce')
# Replace numeric NaNs with 0 (or median if you prefer)
X[numeric_cols] = X[numeric_cols].fillna(0.0)

print("After coercion: any NaNs left (numeric)?", X[numeric_cols].isna().any().any())


Categorical cols (converted to str): ['Year', 'Status', 'Role', 'Department', 'Director', 'Manager', 'Supervisor', 'Lead', 'Staff', 'Question', 'Answer_Text']
Numeric-ish cols (will be coerced): ['Answer_Numeric'] ... (total 1 )
After coercion: any NaNs left (numeric)? False


In [7]:
# Cell 5 — Standardise the numeric columns and one-hot encode the categorical ones
# NOTE(review): both transformers are fitted on every row of X, i.e. before the
# train/test split performed later in the notebook, so test rows influence the
# fitted statistics. Consider fitting on the training rows only.
n_rows = len(X)

# Numeric part: zero-width placeholder when there are no numeric columns
if numeric_cols:
    scaler = StandardScaler()
    X_num = scaler.fit_transform(X[numeric_cols])
else:
    scaler = None
    X_num = np.empty((n_rows, 0))

# Categorical part: the dense-output flag is `sparse_output` in newer
# scikit-learn and `sparse` in older releases, which raise TypeError for the
# new keyword, so fall back on that error.
if categorical_cols:
    try:
        encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    except TypeError:
        encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
    X_cat = encoder.fit_transform(X[categorical_cols])
else:
    encoder = None
    X_cat = np.empty((n_rows, 0))

# Final matrix layout: numeric columns first, then the one-hot columns
X_processed = np.concatenate((X_num, X_cat), axis=1)
print("Processed feature matrix shape:", X_processed.shape)

# Remember which input columns fed each transformer so the fitted
# encoder/scaler can be reused later
feature_info = dict(numeric_cols=numeric_cols, categorical_cols=categorical_cols)


Processed feature matrix shape: (132549, 77)


In [8]:
# Cell 6 — Hold out 20% of the rows for testing; stratify only when the target has more than one class
if y.nunique() > 1:
    stratify = y
else:
    stratify = None

split_kwargs = dict(test_size=0.2, random_state=RANDOM_STATE, stratify=stratify)
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, **split_kwargs)
print("Train shape:", X_train.shape, "Test shape:", X_test.shape)


Train shape: (106039, 77) Test shape: (26510, 77)


In [9]:
# Cell 7 — Move the split arrays onto the compute device and wrap the training set in a DataLoader
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print("Using device:", device)


def _as_float_tensor(array, as_column=False):
    """Copy `array` into a float32 tensor on `device`, optionally reshaped to one column."""
    tensor = torch.tensor(array, dtype=torch.float32)
    if as_column:
        tensor = tensor.view(-1, 1)
    return tensor.to(device)


# Features keep their 2-D layout; targets become (n, 1) to match the model output
X_train_tensor = _as_float_tensor(X_train)
y_train_tensor = _as_float_tensor(y_train.values, as_column=True)
X_test_tensor = _as_float_tensor(X_test)
y_test_tensor = _as_float_tensor(y_test.values, as_column=True)

train_ds = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_ds, shuffle=True, batch_size=BATCH_SIZE)


Using device: cpu


In [10]:
# Cell 8 — Define the MLP (ReLU hidden layers, Sigmoid output as requested)
class MLP(nn.Module):
    """Feed-forward binary classifier: input_dim -> 64 -> 32 -> 1.

    Both hidden layers use ReLU. The final Sigmoid maps the single output to
    the [0, 1] range so it can be read as a probability.
    """

    def __init__(self, input_dim):
        """Build the layer stack for inputs with `input_dim` features."""
        super().__init__()
        first_hidden, second_hidden = 64, 32
        layers = [
            nn.Linear(input_dim, first_hidden),
            nn.ReLU(),
            nn.Linear(first_hidden, second_hidden),
            nn.ReLU(),
            nn.Linear(second_hidden, 1),
            nn.Sigmoid(),  # turns the final activation into a probability
        ]
        # Positional (unnamed) layers keep the state_dict keys as net.<index>.*
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the network output for the batch `x`."""
        probabilities = self.net(x)
        return probabilities

# Build the network for the processed feature width, together with its loss and optimiser
n_features = X_train.shape[1]
model = MLP(n_features).to(device)
criterion = nn.BCELoss()   # paired with the Sigmoid that ends the model
optimizer = optim.Adam(params=model.parameters(), lr=LR)

print(model)


MLP(
  (net): Sequential(
    (0): Linear(in_features=77, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=32, bias=True)
    (3): ReLU()
    (4): Linear(in_features=32, out_features=1, bias=True)
    (5): Sigmoid()
  )
)


In [11]:
# Cell 9 — Mini-batch training; reports the mean per-sample loss on the first, every fifth and the last epoch
n_train = len(train_loader.dataset)
for epoch in range(1, EPOCHS + 1):
    model.train()
    running_loss = 0.0
    for xb, yb in train_loader:
        optimizer.zero_grad()
        preds = model(xb)
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()
        # weight each batch loss by its batch size so the epoch figure is per sample
        running_loss += loss.item() * xb.size(0)
    epoch_loss = running_loss / n_train
    should_report = epoch in (1, EPOCHS) or epoch % 5 == 0
    if should_report:
        print(f"Epoch {epoch}/{EPOCHS} — loss: {epoch_loss:.4f}")


Epoch 1/20 — loss: 0.0489
Epoch 5/20 — loss: 0.0000
Epoch 10/20 — loss: 0.0000
Epoch 15/20 — loss: 0.0000
Epoch 20/20 — loss: 0.0000


In [None]:
# Cell 10 — Score the held-out set (Accuracy, Precision, Recall, F1) at a 0.5 probability threshold
model.eval()
batch_probs = []
with torch.no_grad():
    # run inference in BATCH_SIZE slices to avoid memory issues
    for start in range(0, X_test.shape[0], BATCH_SIZE):
        xb = torch.tensor(X_test[start:start + BATCH_SIZE], dtype=torch.float32).to(device)
        batch_probs.append(model(xb).cpu().numpy().flatten())
# an empty test set yields an empty array instead of failing in concatenate
probs = np.concatenate(batch_probs) if batch_probs else np.array([])

preds = (probs >= 0.5).astype(int)
y_true = y_test.values.astype(int)

# zero_division=0 reports 0 instead of warning when a metric's denominator is zero
acc = accuracy_score(y_true, preds)
prec = precision_score(y_true, preds, zero_division=0)
rec = recall_score(y_true, preds, zero_division=0)
f1 = f1_score(y_true, preds, zero_division=0)

print("\nTest performance:")
print(f"Accuracy : {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall   : {rec:.4f}")
print(f"F1 score : {f1:.4f}")


In [None]:
# Cell 11 — Persist the trained weights and the fitted preprocessing objects (optional)
torch.save(model.state_dict(), "dropout_mlp_state.pth")

# Bundle the encoder, scaler and column lists into a single joblib file
preprocessing_artifacts = dict(encoder=encoder, scaler=scaler, feature_info=feature_info)
joblib.dump(preprocessing_artifacts, "preprocessing_artifacts.joblib")
print("Saved model and preprocessing_artifacts.joblib")


In [None]:
# OPTIONAL Cell 12 — scikit-learn alternative to the PyTorch model
# Trains an MLPClassifier with hidden layers (64, 32) on the same split and prints the same four metrics.
from sklearn.neural_network import MLPClassifier

clf = MLPClassifier(
    hidden_layer_sizes=(64, 32),
    activation='relu',
    solver='adam',
    max_iter=300,
    random_state=RANDOM_STATE,
)
clf.fit(X_train, y_train)
y_skl = clf.predict(X_test)

skl_acc = accuracy_score(y_test, y_skl)
skl_prec = precision_score(y_test, y_skl, zero_division=0)
skl_rec = recall_score(y_test, y_skl, zero_division=0)
skl_f1 = f1_score(y_test, y_skl, zero_division=0)
print("sklearn MLP — Accuracy:", skl_acc,
      "Precision:", skl_prec,
      "Recall:", skl_rec,
      "F1:", skl_f1)
