In [2]:
# ===============================
# Team Prediction ML Pipeline (Using Team Performance)
# ===============================

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report

# -------------------------------
# 1️⃣ Load CSV
# -------------------------------
df = pd.read_csv('C:/Users/musny/Downloads/logs_1000.csv')

# Clean column names
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

# -------------------------------
# 2️⃣ Encode work type
# -------------------------------
le_work = LabelEncoder()
df['work_type_encoded'] = le_work.fit_transform(df['type_of_work'])

# -------------------------------
# 3️⃣ Compute team-level features
# -------------------------------
# Calculate cost efficiency
df['cost_efficiency'] = (df['estimated_cost'] - df['final_cost']) / df['estimated_cost']

# Compute team-level average time and cost efficiency per work type
team_stats = (
    df.groupby(['team_id', 'work_type_encoded'])
    .agg({'time_taken': 'mean', 'cost_efficiency': 'mean'})
    .reset_index()
    .rename(columns={'time_taken': 'avg_time', 'cost_efficiency': 'avg_cost_efficiency'})
)

# Merge team-level features back to original data
df = df.merge(team_stats, on=['team_id', 'work_type_encoded'], how='left')

# -------------------------------
# 4️⃣ Features & target
# -------------------------------
X = df[['work_type_encoded', 'avg_time', 'avg_cost_efficiency']]
y = df['team_id']

# -------------------------------
# 5️⃣ Split train & test
# -------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# -------------------------------
# 6️⃣ Train Gradient Boosting Classifier
# -------------------------------
clf = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=5,
    random_state=42
)
clf.fit(X_train, y_train)

# -------------------------------
# 7️⃣ Evaluate model
# -------------------------------
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# -------------------------------
# 8️⃣ Predict best team for a new task
# -------------------------------
def predict_team(work_type: str) -> int:
    """
    Predict the best team for a task based on work type and historical performance.
    """
    work_encoded = le_work.transform([work_type])[0]

    # Filter and make a copy to avoid SettingWithCopyWarning
    candidates = team_stats[team_stats['work_type_encoded'] == work_encoded].copy()

    # Compute performance score (higher is better)
    candidates['performance_score'] = (
        candidates['avg_cost_efficiency'] * 0.6
        + (1 / candidates['avg_time']) * 0.4
    )

    # Pick the team with the highest performance score
    best_team = (
        candidates.sort_values('performance_score', ascending=False)
        .iloc[0]['team_id']
    )

    return int(best_team)


# -------------------------------
# 9️⃣ Example usage
# -------------------------------
example_team = predict_team('Repair')
print(f"✅ Recommended team for this task: {example_team}")


Accuracy: 1.0
              precision    recall  f1-score   support

           1       1.00      1.00      1.00        23
           2       1.00      1.00      1.00        22
           3       1.00      1.00      1.00        20
           4       1.00      1.00      1.00        20
           5       1.00      1.00      1.00        13
           6       1.00      1.00      1.00        18
           7       1.00      1.00      1.00        19
           8       1.00      1.00      1.00        23
           9       1.00      1.00      1.00        29
          10       1.00      1.00      1.00        13

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200

✅ Recommended team for this task: 6


In [3]:
# -------------------------------
# Top N Recommended Teams Function (No time input)
# -------------------------------
import numpy as np

def predict_top_teams(work_type: str, estimated_cost: float, top_n: int = 3):
    # Filter historical tasks for the given work type
    candidates = df[df['type_of_work'] == work_type].copy()
    
    # Compute team-level averages for this work type
    team_stats = candidates.groupby('team_id').agg(
        avg_time=('time_taken', 'mean'),
        avg_cost_efficiency=('final_cost', lambda x: ((x - candidates.loc[x.index, 'estimated_cost']).abs().mean()))
    ).reset_index()
    
    # Compute a performance score combining cost efficiency and speed
    # Higher cost efficiency and lower avg_time => higher score
    team_stats['performance_score'] = team_stats['avg_cost_efficiency'] * 0.6 + (1 / team_stats['avg_time']) * 0.4
    
    # Sort by performance_score descending
    top_teams_df = team_stats.sort_values(by='performance_score', ascending=False).head(top_n)
    
    # Return as list of tuples (team_id, performance_score)
    top_teams = list(zip(top_teams_df['team_id'], top_teams_df['performance_score']))
    return top_teams

# Example usage
top_teams = predict_top_teams('Repair', 3000)
print("Top recommended teams (team_id, performance_score):")
for team, score in top_teams:
    print(f"Team {team}: {score:.2f}")


Top recommended teams (team_id, performance_score):
Team 10: 21.18
Team 3: 19.71
Team 4: 17.30
