# Part 2 Task 3: Predictive Analytics for Resource Allocation

**Goal:**
1. Preprocess data (clean, label, split).
2. Train a model (Random Forest) to predict issue priority.
3. Evaluate using accuracy and F1-score.

**Dataset:** scikit-learn Breast Cancer Wisconsin (Diagnostic) dataset, used as a stand-in for issue-priority data: malignant (target 0) = High priority, benign (target 1) = Low priority.

In [None]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

## 1. Data Loading and Preprocessing

In [None]:
# Fetch the bundled scikit-learn dataset and wrap its feature matrix in a
# DataFrame, appending the numeric class label as a 'target' column.
data = load_breast_cancer()
df = pd.DataFrame(data.data, columns=data.feature_names).assign(target=data.target)

# Scenario relabelling: read the diagnosis as an issue priority.
# target 0 (malignant) -> 'High', target 1 (benign) -> 'Low'.
priority_by_target = {0: 'High', 1: 'Low'}
df['priority'] = df['target'].map(priority_by_target)

print("Dataset Shape:", df.shape)
df.head()

In [None]:
# Report the total number of missing cells across the whole frame.
print("Missing values:\n", df.isnull().sum().sum())

# Features are every column except the numeric label and its text alias.
X = df.drop(columns=['target', 'priority'])
y = df['target']  # numeric target is used for training

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# ratio the same in the train and test splits, so the evaluation metrics are
# not skewed by an unlucky draw; random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

## 2. Model Training (Random Forest)

In [None]:
# Hyperparameters for the forest: 100 trees, with a fixed seed so repeated
# runs build the same ensemble.
rf_params = {'n_estimators': 100, 'random_state': 42}
rf_model = RandomForestClassifier(**rf_params)

# Fit the classifier on the training split.
rf_model.fit(X_train, y_train)

## 3. Evaluation

In [None]:
# Predict class ids for the held-out rows.
y_pred = rf_model.predict(X_test)

# Class ids in the order used by target_names below: 0 = High, 1 = Low.
class_ids = [0, 1]
class_names = ['High (Malignant)', 'Low (Benign)']

# Overall fraction of correct predictions.
accuracy = accuracy_score(y_test, y_pred)

# f1_score treats label 1 as the positive class by default, which here is the
# Low-priority class. The class of interest is High priority (label 0), so
# score that one explicitly.
f1 = f1_score(y_test, y_pred, pos_label=0)

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")

# Passing labels explicitly ties each target name to its class id instead of
# relying on the order inferred from the data.
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred, labels=class_ids, target_names=class_names))

In [None]:
# Confusion matrix: rows are the actual class, columns the predicted class.
# The class order is fixed explicitly (0 = High, 1 = Low) so the tick labels
# below always line up with the matrix rows/columns, rather than depending on
# which labels happen to appear in y_test / y_pred.
class_ids = [0, 1]
tick_names = ['High', 'Low']
cm = confusion_matrix(y_test, y_pred, labels=class_ids)

plt.figure(figsize=(6,4))
# annot + fmt='d' prints the integer count inside each cell.
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=tick_names, yticklabels=tick_names)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()