<a href="https://colab.research.google.com/github/Srisaivarsha27/cat-safety-sim/blob/main/dynamic_task_operator_match.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Problem Statement
In large-scale industrial operations, matching the right operator to the right task is critical for efficiency, safety, and timely completion. This project builds a machine learning model to:

1) Classify whether a given task-operator pair is a good match (✅ or ❌).

2) Predict the time it would take for the operator to complete the task.

# Step 1: Upload Dataset
Upload the dataset from your local system into the Colab environment.

In [None]:
from google.colab import files
# Ask for a file upload in the Colab runtime and keep the call's result in `uploaded`.
# NOTE(review): the recorded output shows the file being saved under a suffixed name
# ("operator_task_dataset (4).csv") - confirm that later cells read the intended copy.
uploaded = files.upload()  # Upload your CSV dataset


Saving operator_task_dataset.csv to operator_task_dataset (4).csv


# Step 2: Import Required Libraries
Import essential libraries for data handling, model building, and evaluation.


In [None]:
# 📌 Step 2: Import Libraries
import io

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# 📌 Load Dataset
# Colab stores a re-uploaded file under a suffixed name (the upload output shows
# "operator_task_dataset (4).csv"), so reading the fixed file name can silently load
# a stale copy from an earlier upload. Prefer the bytes returned by files.upload()
# and only fall back to the file on disk when no upload is available (for example
# when the notebook runs outside Colab).
DATASET_FILENAME = "operator_task_dataset.csv"

_uploaded_files = globals().get("uploaded") or {}
if _uploaded_files:
    # files.upload() maps each uploaded file name to its raw bytes; use the first one.
    _uploaded_name, _uploaded_bytes = next(iter(_uploaded_files.items()))
    df = pd.read_csv(io.BytesIO(_uploaded_bytes))
else:
    df = pd.read_csv(DATASET_FILENAME)

df.head()

Unnamed: 0,Operator_ID,Experience_Yrs,Familiarity_Score,Safety_Score,Fatigue_Score,Previous_Accident_Count,Tasks_Completed_Count,Task_Type,Terrain,Task_Complexity_Level,Max_Allowed_Time,Weather_Condition,Machine_Type,Time_Taken,Is_Good_Match,Shift_Hours_Worked,Operator_Preference_Level,Seatbelt_Compliance_Rate
0,OP1163,5.0,0.82,0.65,0.59,0,119,Load,Rocky,Medium,18.87,Rainy,DZR002,22.85,0,6,Low,0.99
1,OP1026,3.9,0.77,0.07,0.64,0,85,Lift,Rocky,Medium,40.91,Sunny,EXC001,39.9,1,9,Low,0.95
2,OP1059,4.4,0.75,0.72,0.46,2,106,Lift,Rocky,Medium,55.31,Windy,DZR002,62.47,0,6,Medium,0.6
3,OP1071,3.8,0.93,0.63,0.47,1,85,Load,Muddy,Medium,62.94,Cloudy,LDR003,66.01,0,10,Low,0.9
4,OP1195,3.0,0.67,0.46,0.21,2,48,Carry,Rocky,Low,48.46,Sunny,LDR003,44.22,1,10,High,0.9


# Step 3: Display Column Names
Quick check to view all column names in the dataset for understanding structure.


In [None]:
# Bare last expression: the notebook displays the DataFrame's column labels.
df.columns

Index(['Operator_ID', 'Experience_Yrs', 'Familiarity_Score', 'Safety_Score',
       'Fatigue_Score', 'Previous_Accident_Count', 'Tasks_Completed_Count',
       'Task_Type', 'Terrain', 'Task_Complexity_Level', 'Max_Allowed_Time',
       'Weather_Condition', 'Machine_Type', 'Time_Taken', 'Is_Good_Match',
       'Shift_Hours_Worked', 'Operator_Preference_Level',
       'Seatbelt_Compliance_Rate'],
      dtype='object')

# Step 4: View Unique Categories
Print unique values in task type and machine type to understand categorical features.


In [None]:
# Print the distinct values of the two categorical columns of interest
for heading, column_name in (
    ("✅ Unique task types:", 'Task_Type'),
    ("✅ Unique machine types:", 'Machine_Type'),
):
    print(heading, df[column_name].unique())

✅ Unique task types: ['Load' 'Lift' 'Carry' 'Dump' 'Dig']
✅ Unique machine types: ['DZR002' 'EXC001' 'LDR003' 'BLD004']


# Step 5: Label Encoding
Convert categorical columns to numeric using label encoding for ML compatibility.


In [None]:
# 📌 Label-encode the categorical columns
categorical_cols = [
    'Operator_ID',
    'Task_Type',
    'Terrain',
    'Task_Complexity_Level',
    'Weather_Condition',
    'Machine_Type',
    'Operator_Preference_Level',
]

# One dedicated encoder per column, kept so the same mapping can be re-applied later
label_encoders = {column_name: LabelEncoder() for column_name in categorical_cols}
for column_name, encoder in label_encoders.items():
    # Replace the column in place with its integer codes
    df[column_name] = encoder.fit_transform(df[column_name])

# Step 6: Feature/Target Split

In [None]:
# 📌 Step 6: Feature/Target Split
# Every column except the two prediction targets is used as a model input
target_cols = ('Is_Good_Match', 'Time_Taken')
feature_cols = [name for name in df.columns if name not in target_cols]
X = df[feature_cols]

# Define target column for both models

In [None]:
# Targets: Is_Good_Match for the classifier, Time_Taken for the regressor
y_class, y_reg = df['Is_Good_Match'], df['Time_Taken']

# Train-Test Split

In [None]:

# Both tasks are split with the same hold-out fraction and the same seed
split_options = {"test_size": 0.2, "random_state": 42}
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X, y_class, **split_options)
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(X, y_reg, **split_options)


# Feature Scaling

In [None]:
# 📌 Feature Scaling
# Fit the scaler exactly once, on training rows only, and reuse it everywhere.
# Re-fitting the same scaler object on the regression split would overwrite the
# statistics used for the classifier inputs; that is only harmless while both splits
# contain identical rows (same X, same test_size, same random_state). A single fit keeps
# the scaler that is saved later consistent with what both models were trained on.
# NOTE: this cell rebinds its own inputs, so re-run the split cell before re-running it.
scaler = StandardScaler().fit(X_train_c)

X_train_c = scaler.transform(X_train_c)
X_test_c = scaler.transform(X_test_c)
X_train_r = scaler.transform(X_train_r)
X_test_r = scaler.transform(X_test_r)

# Train Classification Model

In [None]:
# 📌 Step 7: Train Classification Model
# Fit a seeded random forest, then report accuracy on the held-out split
clf = RandomForestClassifier(random_state=42).fit(X_train_c, y_train_c)
y_pred_c = clf.predict(X_test_c)
print(
    "🎯 Classification Accuracy:",
    accuracy_score(y_test_c, y_pred_c),
)


# Train Regression Model

In [None]:
# 📌 Step 8: Train Regression Model
# Fit a seeded random forest, then report mean absolute error on the held-out split
reg = RandomForestRegressor(random_state=42).fit(X_train_r, y_train_r)
y_pred_r = reg.predict(X_test_r)
print(
    "🕒 Regression MAE:",
    mean_absolute_error(y_test_r, y_pred_r),
)


# Save ordered feature names

In [None]:
# Keep the training-time column order so inference can rebuild rows identically
ordered_features = X.columns.tolist()

# Persist each artifact under its own file name
artifacts_to_save = {
    'classifier_model.pkl': clf,
    'regression_model.pkl': reg,
    'scaler.pkl': scaler,
    'label_encoders.pkl': label_encoders,
    'ordered_features.pkl': ordered_features,
}
for artifact_path, artifact in artifacts_to_save.items():
    joblib.dump(artifact, artifact_path)

print("✅ Models saved successfully!")


🎯 Classification Accuracy: 0.911
🕒 Regression MAE: 2.7734616
✅ Models saved successfully!


# Load Ordered Features

In [None]:
# Reload the feature-name list that was persisted to "ordered_features.pkl".
ordered_features = joblib.load("ordered_features.pkl")


# Define Prediction Function

In [None]:
def predict_operator_fit_and_time(input_data):
    """Predict match quality and completion time for one task/operator request.

    Parameters
    ----------
    input_data : dict
        Must contain the keys "task_id", "operator_id", "task_type",
        "skill_level", "machine_type" and "success_rate".

    Returns
    -------
    dict
        {"task_id": <echoed from the input>,
         "is_good_match": bool,
         "estimated_time": float rounded to 2 decimals}

    Raises
    ------
    KeyError
        If any of the required keys is missing from ``input_data``.

    Notes
    -----
    Relies on the notebook-level ``preprocess_input``, ``clf`` and ``reg`` being
    defined before the function is called.
    """
    X_input = preprocess_input(
        operator_id=input_data["operator_id"],
        task_type=input_data["task_type"],
        skill_level=input_data["skill_level"],
        machine_type=input_data["machine_type"],
        success_rate=input_data["success_rate"]
    )
    is_good_match = clf.predict(X_input)[0]
    estimated_time = reg.predict(X_input)[0]
    return {
        "task_id": input_data["task_id"],
        "is_good_match": bool(is_good_match),
        # Convert to a builtin float so the result prints as 56.98 rather than
        # np.float64(56.98) and contains only plain Python types.
        "estimated_time": round(float(estimated_time), 2)
    }


# Load Models and Encoders


In [None]:
import joblib
import numpy as np

# Restore the persisted estimators and preprocessing objects from disk
clf, reg, scaler, label_encoders = (
    joblib.load(artifact_path)
    for artifact_path in (
        "classifier_model.pkl",
        "regression_model.pkl",
        "scaler.pkl",
        "label_encoders.pkl",
    )
)

# Per-skill-level stand-ins: (experience value, familiarity value, preference level)
_skill_profiles = {
    "beginner": (1.5, 0.5, "Low"),
    "intermediate": (3.5, 0.75, "Medium"),
    "expert": (5.5, 0.9, "High"),
}
skill_to_exp = {level: profile[0] for level, profile in _skill_profiles.items()}
skill_to_familiarity = {level: profile[1] for level, profile in _skill_profiles.items()}
skill_to_pref = {level: profile[2] for level, profile in _skill_profiles.items()}

# ✅ Example request, shaped like the payload of a Streamlit-style form
input_data = {
    "task_id": "TASK001",
    "operator_id": "OP1059",
    "task_type": "Load",
    "skill_level": "expert",
    "machine_type": "DZR002",
    "success_rate": 85,
}

# Function to preprocess and create feature vector
def preprocess_input(operator_id, task_type, skill_level, machine_type, success_rate,
                     overrides=None):
    """Build the scaled single-row feature matrix for one task/operator request.

    Parameters
    ----------
    operator_id, task_type, machine_type : str
        Raw category labels; each is passed through its fitted label encoder.
    skill_level : str
        Key of ``skill_to_exp`` / ``skill_to_familiarity`` / ``skill_to_pref``
        ("beginner", "intermediate" or "expert").
    success_rate : number
        Truncated with ``int()`` and stored as ``Tasks_Completed_Count``.
    overrides : dict, optional
        Raw (un-encoded) feature values that replace the built-in placeholders,
        e.g. ``{"Terrain": "Muddy", "Fatigue_Score": 0.6}``. Keys must be feature
        names that this function already populates.

    Returns
    -------
    numpy.ndarray
        The output of ``scaler.transform`` for a single row whose columns follow
        ``ordered_features``.

    Raises
    ------
    KeyError
        If ``skill_level`` is not a known level, if ``overrides`` names an unknown
        feature, or if ``ordered_features`` lists a column this function does not build.
    ValueError
        Presumably raised by the label encoder for a category it was not fitted
        on - confirm against the scikit-learn version in use.

    Notes
    -----
    Relies on the notebook-level ``label_encoders``, ``scaler``, ``ordered_features``
    and the ``skill_to_*`` mappings.
    """
    row = {
        "Operator_ID": operator_id,
        "Experience_Yrs": skill_to_exp[skill_level],
        "Familiarity_Score": skill_to_familiarity[skill_level],
        "Safety_Score": 0.85,
        "Fatigue_Score": 0.2,
        "Previous_Accident_Count": 0,
        # NOTE(review): a success-rate input is stored as a completed-task count -
        # confirm this mapping is intended.
        "Tasks_Completed_Count": int(success_rate),
        "Task_Type": task_type,
        "Terrain": "Rocky",
        "Task_Complexity_Level": "Medium",
        "Max_Allowed_Time": 50.0,
        "Weather_Condition": "Sunny",
        "Machine_Type": machine_type,
        "Shift_Hours_Worked": 8,
        "Operator_Preference_Level": skill_to_pref[skill_level],
        "Seatbelt_Compliance_Rate": 0.9,
    }

    # Let callers replace the fixed placeholders with real values when they have them.
    if overrides:
        unknown = sorted(set(overrides) - set(row))
        if unknown:
            raise KeyError(f"Unknown feature override(s): {unknown}")
        row.update(overrides)

    # Apply the fitted label encoders to the categorical entries
    for col, encoder in label_encoders.items():
        row[col] = encoder.transform([row[col]])[0]

    # Arrange the values in the training-time column order. A DataFrame with named
    # columns is used because the scaler was fitted on a DataFrame; this keeps the
    # feature names available to scikit-learn's input validation.
    features = pd.DataFrame(
        [[row[col] for col in ordered_features]],
        columns=ordered_features,
    )

    # Scale with the persisted scaler
    return scaler.transform(features)

# Preprocess and predict through the shared helper, so the inference steps
# (encode -> order -> scale -> classify + regress) live in exactly one place
# instead of being repeated in every example cell.
result = predict_operator_fit_and_time(input_data)

# ✅ Final result
print("🎯 Prediction Result:")
print(result)


🎯 Prediction Result:
{'task_id': 'TASK001', 'is_good_match': False, 'estimated_time': np.float64(56.98)}




# Sample Input #1


In [None]:
input_data = {
    "task_id": "TASK_A1",
    "operator_id": "OP1002",
    "task_type": "Dig",
    "skill_level": "expert",
    "machine_type": "DZR002",  # ✅ known machine type
    "success_rate": 95
}

# Reuse the shared prediction helper rather than repeating the inference steps inline
result = predict_operator_fit_and_time(input_data)

print("🎯 Prediction Result:")
print(result)


🎯 Prediction Result:
{'task_id': 'TASK_A1', 'is_good_match': False, 'estimated_time': np.float64(56.66)}




# Sample Input #2


In [None]:
input_data = {
    "task_id": "TASK_B2",
    "operator_id": "OP1011",
    "task_type": "Dump",
    "skill_level": "intermediate",
    "machine_type": "LDR003",  # ✅ known
    "success_rate": 70
}

# Reuse the shared prediction helper rather than repeating the inference steps inline
result = predict_operator_fit_and_time(input_data)

print("🎯 Prediction Result:")
print(result)


🎯 Prediction Result:
{'task_id': 'TASK_B2', 'is_good_match': False, 'estimated_time': np.float64(58.84)}




In [None]:
input_data = {
    "task_id": "TASK_B3",
    "operator_id": "OP1024",
    "task_type": "Dig",
    "skill_level": "beginner",
    "machine_type": "DZR002",  # ✅ known
    "success_rate": 45
}

# Reuse the shared prediction helper rather than repeating the inference steps inline
result = predict_operator_fit_and_time(input_data)

print("🎯 Prediction Result:")
print(result)


🎯 Prediction Result:
{'task_id': 'TASK_B3', 'is_good_match': False, 'estimated_time': np.float64(60.14)}




In [None]:
input_data = {
    "task_id": "TASK_C1",
    "operator_id": "OP1050",
    "task_type": "Dig",
    "skill_level": "expert",
    "machine_type": "DZR002",  # ✅ known
    "success_rate": 100         # ✅ very high
}

# Reuse the shared prediction helper rather than repeating the inference steps inline
result = predict_operator_fit_and_time(input_data)

print("🎯 Prediction Result:")
print(result)


🎯 Prediction Result:
{'task_id': 'TASK_C1', 'is_good_match': False, 'estimated_time': np.float64(56.36)}




# Justified Reasons for Imperfect Predictions
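1. Limited or Imbalanced Dataset
2. Operator ID and Categorical Encoding Bias: `Operator_ID` and the other categorical columns are label-encoded into integers and used directly as model features.
3. Fixed Placeholder Features at Inference: `preprocess_input` hard-codes values such as `Safety_Score`, `Fatigue_Score`, `Terrain`, `Weather_Condition`, and `Max_Allowed_Time`, so most of the feature vector is identical across the sample inputs.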