In [1]:
"""
Predictive Analytics for Resource Allocation
Using Breast Cancer Dataset as proxy for issue classification
Task: Predict issue priority (high/medium/low) based on features
"""

import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Seed NumPy's global RNG so every later np.random draw in this script is repeatable.
np.random.seed(42)

# Title banner: two heading lines framed by a 70-character rule above and below.
print(
    "=" * 70,
    "PREDICTIVE ANALYTICS FOR RESOURCE ALLOCATION",
    "Using ML to predict issue priority levels",
    "=" * 70,
    sep="\n",
)

# ============================================================================
# STEP 1: LOAD AND PREPARE DATA
# ============================================================================
print("\n[STEP 1] Loading Dataset...")

# Load breast cancer dataset (proxy for software issues).
# Features stand in for issue characteristics (complexity metrics, dependencies, etc.);
# the target stands in for priority levels.
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)

# Convert binary classification to multi-class (simulating priority levels)
# Original: 0=malignant, 1=benign
# New:      0=high priority, 1=medium priority, 2=low priority
#
# CAVEAT: the third class is synthetic. One third (floor) of the class-1 rows is
# relabelled to 2 by a random draw that never looks at X, so the features carry
# no information that separates Medium from Low. Expect any model to confuse
# those two classes; this is a property of the labels, not of the classifier.
np.random.seed(42)  # re-seed right before the draw so this step is repeatable on its own
y_multiclass = y.copy()

# np.where yields row *positions*; write through .iloc so the relabelling stays
# correct even if the Series index is ever something other than 0..n-1.
medium_indices = np.where(y_multiclass == 1)[0]
low_priority_indices = np.random.choice(medium_indices, size=len(medium_indices)//3, replace=False)
y_multiclass.iloc[low_priority_indices] = 2

# Human-readable names for the integer class ids
priority_mapping = {0: 'High', 1: 'Medium', 2: 'Low'}
priority_labels = y_multiclass.map(priority_mapping).tolist()

print(f"Dataset Shape: {X.shape}")
print(f"Number of Features: {X.shape[1]}")
print(f"Number of Samples: {X.shape[0]}")
print("\nPriority Distribution:")
# sort_index orders the counts alphabetically by label (High, Low, Medium)
print(pd.Series(priority_labels).value_counts().sort_index())

# ============================================================================
# STEP 2: DATA PREPROCESSING
# ============================================================================
print("\n[STEP 2] Data Preprocessing...")

# Total number of NaN cells across the whole feature table
print(f"Missing values: {X.isna().sum().sum()}")

# Summary statistics, restricted to the first five feature columns to keep the output narrow
print("\nFeature Statistics:")
print(X.iloc[:, :5].describe())

# Hold out 20% of the rows for testing; stratify on the three-class target so
# both parts are split according to the class labels. Fixed random_state for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_multiclass,
    test_size=0.2,
    random_state=42,
    stratify=y_multiclass,
)

print(f"\nTraining set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")

# Standardize features: the scaler's statistics are learned from the training
# rows only and then applied unchanged to the test rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("✓ Features scaled using StandardScaler")

# ============================================================================
# STEP 3: MODEL TRAINING
# ============================================================================
print("\n[STEP 3] Training Random Forest Model...")

# Hyperparameters for the forest, gathered in one place
rf_hyperparams = {
    "n_estimators": 100,       # number of trees
    "max_depth": 10,           # maximum depth of each tree
    "min_samples_split": 5,    # minimum samples required to split a node
    "min_samples_leaf": 2,     # minimum samples required in a leaf
    "random_state": 42,        # fixed seed so the fitted forest is repeatable
    "n_jobs": -1,              # use all CPU cores
}
model = RandomForestClassifier(**rf_hyperparams)

# Fit on the standardized training features and the three-class target
model.fit(X_train_scaled, y_train)
print("✓ Model trained successfully")

# ============================================================================
# STEP 4: MODEL EVALUATION
# ============================================================================
print("\n[STEP 4] Model Evaluation...")

# Predict on both splits so training and test scores can be compared
y_pred_train = model.predict(X_train_scaled)
y_pred_test = model.predict(X_test_scaled)

# Accuracy on each split
train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred_test)

# F1-score (weighted average for multi-class)
train_f1 = f1_score(y_train, y_pred_train, average='weighted')
test_f1 = f1_score(y_test, y_pred_test, average='weighted')

print("\n" + "=" * 70)
print("PERFORMANCE METRICS")
print("=" * 70)
for split_name, split_accuracy, split_f1 in (
    ("Training", train_accuracy, train_f1),
    ("Testing", test_accuracy, test_f1),
):
    print(f"\n{split_name} Set:")
    print(f"  Accuracy:  {split_accuracy:.4f} ({split_accuracy*100:.2f}%)")
    print(f"  F1-Score:  {split_f1:.4f}")

# Class ids and display names come from priority_mapping so the report rows,
# the confusion-matrix axes and the printed legend can never drift apart.
# Passing `labels` explicitly also keeps the report from raising (and the
# matrix from changing shape) if a class is absent from y_test / y_pred_test.
class_ids = sorted(priority_mapping)
class_names = [f"{priority_mapping[class_id]} Priority" for class_id in class_ids]

# Detailed classification report
print("\n" + "=" * 70)
print("CLASSIFICATION REPORT (Test Set)")
print("=" * 70)
print(classification_report(
    y_test,
    y_pred_test,
    labels=class_ids,
    target_names=class_names
))

# Confusion Matrix
print("\n" + "=" * 70)
print("CONFUSION MATRIX")
print("=" * 70)
cm = confusion_matrix(y_test, y_pred_test, labels=class_ids)
print(cm)
print("\nRows: Actual Priority | Columns: Predicted Priority")
print("Order: " + ", ".join(f"{priority_mapping[class_id]} ({class_id})" for class_id in class_ids))

# ============================================================================
# STEP 5: FEATURE IMPORTANCE ANALYSIS
# ============================================================================
print("\n[STEP 5] Feature Importance Analysis...")

# Pair each feature name with the fitted forest's importance score,
# then rank from most to least important.
importance_table = pd.DataFrame(
    {
        'feature': X.columns,
        'importance': model.feature_importances_,
    }
)
feature_importance = importance_table.sort_values('importance', ascending=False)

print("\nTop 10 Most Important Features:")
top_ten = feature_importance.iloc[:10]
print(top_ten.to_string(index=False))

# ============================================================================
# STEP 6: MODEL INSIGHTS
# ============================================================================
# Section heading framed by 70-character rules, preceded by one blank line.
section_rule = "=" * 70
print("\n" + section_rule, "MODEL INSIGHTS & RECOMMENDATIONS", section_rule, sep="\n")

# Static guidance text; it is not computed from the metrics above.
insights_text = """
1. PERFORMANCE ANALYSIS:
   - High accuracy indicates the model can reliably predict issue priorities
   - F1-score measures balance between precision and recall
   - Good generalization if test metrics are close to training metrics

2. RESOURCE ALLOCATION IMPLICATIONS:
   - High Priority issues: Allocate senior developers immediately
   - Medium Priority: Schedule within current sprint
   - Low Priority: Add to backlog for future sprints

3. FEATURE IMPORTANCE:
   - Top features reveal key indicators of issue severity
   - Focus on these metrics during issue triage
   - Can be used for automated routing

4. CONTINUOUS IMPROVEMENT:
   - Retrain model monthly with new labeled data
   - Monitor for concept drift (changing patterns)
   - Collect feedback on prediction accuracy
"""
print(insights_text)

# ============================================================================
# STEP 7: SAMPLE PREDICTIONS
# ============================================================================
print("\n[STEP 7] Sample Predictions...")

# Pick a few test rows at random (without replacement) to show as examples.
# The draw is clamped to the test-set size because replace=False raises when
# more rows are requested than exist.
sample_size = 5
sample_indices = np.random.choice(len(X_test), min(sample_size, len(X_test)), replace=False)

# One batched predict_proba call for all sampled rows instead of one call per
# row inside the loop; each output row holds the class probabilities for one sample.
sample_probabilities = model.predict_proba(X_test_scaled[sample_indices])

print("\nSample Issue Predictions:")
print("-" * 70)
for idx, confidence in zip(sample_indices, sample_probabilities):
    # idx is a row *position* within the test split (used with .iloc below),
    # not the row's index in the original dataset.
    actual = priority_mapping[y_test.iloc[idx]]
    predicted = priority_mapping[y_pred_test[idx]]
    max_confidence = confidence.max() * 100  # highest class probability, as a percentage

    match_symbol = "✓" if actual == predicted else "✗"
    print(f"{match_symbol} Issue #{idx}")
    print(f"  Actual: {actual} | Predicted: {predicted} | Confidence: {max_confidence:.1f}%")

print("\n" + "=" * 70)
print("ANALYSIS COMPLETE")
print("=" * 70)

# Collect the headline numbers in a dict. The summary is only kept in memory
# and printed below; nothing is written to disk here.
performance_summary = {
    'Model': 'Random Forest Classifier',
    'Training Accuracy': f"{train_accuracy:.4f}",
    'Testing Accuracy': f"{test_accuracy:.4f}",
    'Training F1-Score': f"{train_f1:.4f}",
    'Testing F1-Score': f"{test_f1:.4f}",
    'Dataset Size': X.shape[0],
    'Number of Features': X.shape[1],
    # Read the class count from the fitted model rather than hard-coding it,
    # so the summary stays correct if the label scheme changes.
    'Classes': len(model.classes_)
}

print("\nPerformance Summary:")
for key, value in performance_summary.items():
    print(f"  {key}: {value}")

Matplotlib is building the font cache; this may take a moment.


PREDICTIVE ANALYTICS FOR RESOURCE ALLOCATION
Using ML to predict issue priority levels

[STEP 1] Loading Dataset...
Dataset Shape: (569, 30)
Number of Features: 30
Number of Samples: 569

Priority Distribution:
High      212
Low       119
Medium    238
Name: count, dtype: int64

[STEP 2] Data Preprocessing...
Missing values: 0

Feature Statistics:
       mean radius  mean texture  mean perimeter    mean area  mean smoothness
count   569.000000    569.000000      569.000000   569.000000       569.000000
mean     14.127292     19.289649       91.969033   654.889104         0.096360
std       3.524049      4.301036       24.298981   351.914129         0.014064
min       6.981000      9.710000       43.790000   143.500000         0.052630
25%      11.700000     16.170000       75.170000   420.300000         0.086370
50%      13.370000     18.840000       86.240000   551.100000         0.095870
75%      15.780000     21.800000      104.100000   782.700000         0.105300
max      28.110000