<a href="https://colab.research.google.com/github/Syeda-Eman/Machine_Learning/blob/main/Developer_Stress_Level_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
# Authenticates this session with Kaggle so that the data download in the
# following cell is allowed to access data sources that are not public.
import kagglehub
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Fetch the competition files and keep whatever the call returns.
# NOTE(review): presumably the return value is the local directory holding the
# downloaded files — confirm against the kagglehub documentation.
developer_stress_level_prediction_path = kagglehub.competition_download('developer-stress-level-prediction')

print('Data source import complete.')


# Importing libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# Loading data

In [None]:
# Read the CSVs from the location kagglehub reported for the download rather
# than a hardcoded /kaggle/input path, which is not guaranteed to exist outside
# Kaggle's own runtime. Requires the download cell above to have been run in
# this session.
data_dir = developer_stress_level_prediction_path
train = pd.read_csv(f'{data_dir}/train.csv')
test = pd.read_csv(f'{data_dir}/test.csv')

# Exploring data

In [None]:
# Preview the first rows of the training set (column names and sample values).
train.head()

Unnamed: 0,ID,Hours_Worked,Sleep_Hours,Bugs,Deadline_Days,Coffee_Cups,Meetings,Interruptions,Experience_Years,Code_Complexity,Remote_Work,Stress_Level
0,249,9,6,36,2,6,15,9,Junior,High,No,100.0
1,433,13,3,39,25,5,5,2,Senior,Medium,Yes,100.0
2,19,5,4,50,58,1,1,2,Senior,High,No,100.0
3,322,5,5,39,4,8,1,8,Junior,Medium,Yes,99.504878
4,332,11,4,48,2,2,17,2,Junior,Low,No,100.0


In [None]:
# Preview the first rows of the test set; it has the same columns minus the target.
test.head()

Unnamed: 0,ID,Hours_Worked,Sleep_Hours,Bugs,Deadline_Days,Coffee_Cups,Meetings,Interruptions,Experience_Years,Code_Complexity,Remote_Work
0,361,4,4,7,59,9,1,3,Junior,Medium,Yes
1,73,9,4,9,53,0,11,9,Senior,Medium,No
2,374,6,3,13,42,1,18,1,Senior,Low,No
3,155,7,7,41,34,3,17,2,Mid,Low,Yes
4,104,4,6,39,50,3,5,8,Junior,Medium,No


In [None]:
# (rows, columns) of the training set.
train.shape

(400, 12)

In [None]:
# (rows, columns) of the test set.
test.shape

(100, 11)

# Preprocessing data and feature engineering

In [None]:

def transform_data(df: pd.DataFrame) -> pd.DataFrame:
    """Encode the categorical columns and add two engineered features.

    The input frame is copied first, so the caller's object is not modified.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing ``Hours_Worked``, ``Code_Complexity``, ``Meetings``
        and ``Sleep_Hours`` (all four feed the engineered features).
        ``Experience_Years``, ``Code_Complexity`` and ``Remote_Work`` are
        label-encoded when they are present.

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` in which the categorical columns hold numeric codes
        and two columns have been added: ``Work_Load`` and ``Burnout_Risk``.

    Raises
    ------
    KeyError
        If a column needed for the engineered features is missing.

    Warns
    -----
    UserWarning
        If a categorical column contains a non-null value that has no entry in
        its mapping. Such values become NaN (same result as before); the
        warning only makes that visible. This also fires when the function is
        applied to a frame that has already been encoded.
    """
    import warnings  # local import keeps the function self-contained

    df = df.copy()

    # mapping strings to numbers so the model can do math
    category_maps = {
        'Experience_Years': {'Junior': 0, 'Mid': 1, 'Senior': 2},
        'Code_Complexity': {'Low': 0, 'Medium': 1, 'High': 2},
        'Remote_Work': {'Yes': 1, 'No': 0},
    }

    for column, mapping in category_maps.items():
        if column not in df.columns:
            continue
        raw = df[column]
        encoded = raw.map(mapping)
        # Values that were present in the input but have no code assigned.
        unmapped = raw[encoded.isna() & raw.notna()].unique().tolist()
        if unmapped:
            warnings.warn(
                f"{column}: values {unmapped} are not in {list(mapping)} "
                "and were encoded as NaN",
                stacklevel=2,
            )
        df[column] = encoded

    # FEATURE ENGINEERING
    # we create new features to help lower the RMSE
    # hours scaled by complexity level (+1 so 'Low' does not zero it out)
    df['Work_Load'] = df['Hours_Worked'] * (df['Code_Complexity'] + 1)
    # burnout indicator: many meetings combined with little sleep
    # (+1 avoids dividing by zero when Sleep_Hours is 0)
    df['Burnout_Risk'] = df['Meetings'] / (df['Sleep_Hours'] + 1)

    return df

# transform_data maps string labels to integers, so passing it a frame that
# has already been transformed would turn those columns into NaN. The
# engineered 'Work_Load' column marks a frame as processed, which makes this
# cell safe to re-run without reloading the data.
if 'Work_Load' not in train.columns:
    train = transform_data(train)
if 'Work_Load' not in test.columns:
    test = transform_data(test)

# Aligning columns and detecting target

In [None]:
# Dynamically find the target column: the one present in train but not in test.
# Walking train.columns (instead of indexing into a set difference) keeps the
# result deterministic, and the check fails loudly if the "exactly one extra
# column" assumption does not hold.
extra_cols = [col for col in train.columns if col not in test.columns]
if len(extra_cols) != 1:
    raise ValueError(
        f"Expected exactly one column in train that is absent from test, found: {extra_cols}"
    )
target = extra_cols[0]
X = train.drop(columns=['ID', target])
y = train[target]

# ensure test features match training features exactly (same columns, same order)
test_features = test[X.columns]
test_ids = test['ID']


# Scaling

In [None]:
# Standardize the features: learn the statistics from X, then apply the same
# transformation to both X and the test features.
# NOTE(review): the scaler is fit on all of X before the train/validation split
# in the next cell, so validation rows contribute to the scaling statistics.
# Tree-based boosting is generally insensitive to feature scaling, so this step
# is likely optional — confirm before relying on it.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
test_scaled = scaler.transform(test_features)


# Splitting data

In [None]:
# Hold out 20% of the labelled rows for local validation; the fixed seed keeps
# the split reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42,
)


# Model Training

In [None]:
# Histogram-based gradient boosting regressor; hyperparameters are collected
# in one place so they are easy to tune.
hgb_params = dict(
    max_iter=400,           # number of boosting iterations (trees)
    learning_rate=0.04,     # small step size: slower but more precise learning
    max_depth=5,            # cap tree depth to limit overfitting
    l2_regularization=1.5,  # L2 penalty on model complexity
    random_state=42,        # fixed seed for reproducibility
)
model = HistGradientBoostingRegressor(**hgb_params)

model.fit(X_train, y_train)

# Checking RMSE

In [None]:
# Predict on the held-out rows and floor the predictions at zero so the
# predicted target is never negative.
val_preds = np.maximum(model.predict(X_val), 0)

# RMSE = square root of the mean squared error on the validation split.
val_mse = mean_squared_error(y_val, val_preds)
val_rmse = np.sqrt(val_mse)

print(f"Target Column: {target}")
print(f"Local Validation RMSE: {val_rmse:.4f}")

--- SUCCESS ---
Target Column: Stress_Level
Local Validation RMSE: 6.9836


# Final: Predicting on test data and saving submission

In [None]:
# Predict on the scaled test features, flooring at zero so the predicted
# target is never negative (same post-processing as on the validation split).
test_preds = np.maximum(model.predict(test_scaled), 0)

# Two-column submission: the test IDs alongside the predicted target.
submission = pd.DataFrame({'ID': test_ids, target: test_preds})

submission.to_csv('submission.csv', index=False)
print("submission.csv has been saved and is ready for upload!")

submission.csv has been saved and is ready for upload!
