In [3]:
import pandas as pd

df = pd.read_csv("task_turtles_vs_sprint_hares.csv")  # change name if needed

print("Shape:", df.shape)
print("\nColumns:")
print(df.columns)

print("\nFirst 5 rows:")
display(df.head())

print("\nMissing Values:")
display(df.isnull().sum())


Shape: (299, 14)

Columns:
Index(['task_id', 'developer_id', 'task_complexity', 'task_start_time',
       'task_deadline_time', 'actual_finish_time', 'start_delay_min',
       'last_minute_rush', 'focus_rating', 'distractions_count',
       'coffee_intake_mg', 'task_quality_score', 'stress_level',
       'finished_on_time'],
      dtype='str')

First 5 rows:


Unnamed: 0,task_id,developer_id,task_complexity,task_start_time,task_deadline_time,actual_finish_time,start_delay_min,last_minute_rush,focus_rating,distractions_count,coffee_intake_mg,task_quality_score,stress_level,finished_on_time
0,TASK_9935,DEV021,low,2025-06-02 05:08:22,2025-06-02 07:57:22,2025-06-02 08:43:22,140,1,7,2,104,7,8,0
1,TASK_1434,DEV003,high,2025-06-21 11:34:28,2025-06-21 18:54:28,2025-06-21 18:41:28,16,1,6,2,517,9,8,1
2,TASK_3615,DEV018,low,2025-06-27 10:54:10,2025-06-27 13:51:10,2025-06-27 15:42:10,214,1,6,4,6,5,6,0
3,TASK_7224,DEV023,medium,2025-06-17 12:25:14,2025-06-17 15:33:14,2025-06-17 17:42:14,142,1,7,0,94,2,3,0
4,TASK_7201,DEV004,medium,2025-06-17 16:44:14,2025-06-17 20:03:14,2025-06-17 21:10:14,135,1,4,3,127,4,4,0



Missing Values:


task_id               0
developer_id          0
task_complexity       0
task_start_time       0
task_deadline_time    0
actual_finish_time    0
start_delay_min       0
last_minute_rush      0
focus_rating          0
distractions_count    0
coffee_intake_mg      0
task_quality_score    0
stress_level          0
finished_on_time      0
dtype: int64

In [4]:
import pandas as pd
import numpy as np

# Reload clean copy
df = pd.read_csv("task_turtles_vs_sprint_hares.csv")

# Create procrastination label
df["procrastinated"] = df["finished_on_time"].apply(lambda x: 1 if x == 0 else 0)

# Drop leakage & useless columns
df = df.drop(columns=[
    "task_id",
    "developer_id",
    "task_start_time",
    "task_deadline_time",
    "actual_finish_time",
    "finished_on_time"
])

print("New Shape:", df.shape)
display(df.head())


New Shape: (299, 9)


Unnamed: 0,task_complexity,start_delay_min,last_minute_rush,focus_rating,distractions_count,coffee_intake_mg,task_quality_score,stress_level,procrastinated
0,low,140,1,7,2,104,7,8,1
1,high,16,1,6,2,517,9,8,0
2,low,214,1,6,4,6,5,6,1
3,medium,142,1,7,0,94,2,3,1
4,medium,135,1,4,3,127,4,4,1


In [5]:
df = pd.get_dummies(df, columns=["task_complexity"], drop_first=True)

print("After Encoding:")
display(df.head())


After Encoding:


Unnamed: 0,start_delay_min,last_minute_rush,focus_rating,distractions_count,coffee_intake_mg,task_quality_score,stress_level,procrastinated,task_complexity_low,task_complexity_medium
0,140,1,7,2,104,7,8,1,True,False
1,16,1,6,2,517,9,8,0,False,False
2,214,1,6,4,6,5,6,1,True,False
3,142,1,7,0,94,2,3,1,False,True
4,135,1,4,3,127,4,4,1,False,True


In [6]:
print(df["procrastinated"].value_counts())


procrastinated
1    234
0     65
Name: count, dtype: int64


In [7]:
from sklearn.model_selection import train_test_split

X = df.drop("procrastinated", axis=1)
y = df["procrastinated"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    random_state=42,
    stratify=y
)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)


Train shape: (224, 9)
Test shape: (75, 9)


In [8]:
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=42)
X_train_bal, y_train_bal = sm.fit_resample(X_train, y_train)

print("After SMOTE:")
print(y_train_bal.value_counts())


After SMOTE:
procrastinated
0    175
1    175
Name: count, dtype: int64


In [9]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_bal)
X_test_scaled = scaler.transform(X_test)


In [10]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

model = XGBClassifier(
    n_estimators=300,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    scale_pos_weight=1,
    random_state=42,
    eval_metric='logloss'
)

model.fit(X_train_scaled, y_train_bal)

y_pred = model.predict(X_test_scaled)

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.71      0.94      0.81        16
           1       0.98      0.90      0.94        59

    accuracy                           0.91        75
   macro avg       0.85      0.92      0.87        75
weighted avg       0.92      0.91      0.91        75



In [11]:
# Train on FULL dataset (balanced first)

from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
import joblib

X = df.drop("procrastinated", axis=1)
y = df["procrastinated"]

sm = SMOTE(random_state=42)
X_bal, y_bal = sm.fit_resample(X, y)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_bal)

final_model = XGBClassifier(
    n_estimators=300,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42,
    eval_metric='logloss'
)

final_model.fit(X_scaled, y_bal)

joblib.dump(final_model, "dozy_model.pkl")
joblib.dump(scaler, "dozy_scaler.pkl")

print("Model and scaler saved successfully!")


Model and scaler saved successfully!


In [12]:
joblib.dump(X.columns.tolist(), "feature_columns.pkl")


['feature_columns.pkl']