# Predictive Analytics for Resource Allocation

## Using the Kaggle Breast Cancer Dataset (loaded here via scikit-learn's `load_breast_cancer`)

This notebook demonstrates:
1. Data preprocessing (clean, label, split)
2. Training a Random Forest model to predict case priority (high/medium/low)
3. Evaluation using accuracy and F1-score


## 1. Import Required Libraries


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
import warnings
# Silence every warning for the rest of the session so library notices
# do not clutter the cell outputs.
warnings.simplefilter('ignore')

# Plot defaults: seaborn's white-grid theme and a (10, 6) default figure size
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})


## 2. Load and Explore the Dataset


In [None]:
# Fetch the breast cancer data bundle shipped with scikit-learn
cancer_data = load_breast_cancer()

# Build one DataFrame: a named column per feature plus the diagnosis in 'target'
df = pd.DataFrame(
    data=cancer_data.data,
    columns=cancer_data.feature_names,
).assign(target=cancer_data.target)

# Report the dimensions, then preview the top rows as the cell output
print("Dataset Shape:", df.shape)
print("\nFirst few rows:")
df.head()


In [None]:
# Total number of missing cells across the whole frame
total_missing = df.isna().sum().sum()
print("Missing values:", total_missing, sep="\n")

# Column dtypes and non-null counts
print("\nDataset Info:")
df.info()

# Summary statistics per column (rendered as the cell output)
print("\nBasic Statistics:")
df.describe()


## 3. Data Preprocessing

### 3.1 Create Priority Labels (High/Medium/Low)

For resource allocation, we'll map the diagnosis to priority levels:
- **High Priority**: Malignant cases (target=0) - require immediate attention
- **Low Priority**: Benign cases (target=1) with no elevated risk indicators - routine monitoring
- **Medium Priority**: Benign cases (target=1) whose `mean radius` or `worst area` lies above the dataset's 70th percentile (i.e., borderline cases)


In [None]:
# Create priority labels based on diagnosis and feature combinations
# High priority: Malignant (target=0)
# Low priority: Benign (target=1) with low risk features
# Medium priority: Benign cases with some concerning features

# Quantile of each indicator above which a benign case counts as elevated risk
ELEVATED_RISK_QUANTILE = 0.7


def create_priority_labels(row, radius_threshold=None, area_threshold=None):
    """
    Create the priority label for one case from its diagnosis and key features.

    Parameters
    ----------
    row : pandas.Series or mapping
        Must provide the keys 'target', 'mean radius' and 'worst area'.
    radius_threshold : float, optional
        Cut-off for 'mean radius'. When omitted, it is computed as the
        ELEVATED_RISK_QUANTILE quantile of ``df['mean radius']``.
    area_threshold : float, optional
        Cut-off for 'worst area'. When omitted, it is computed as the
        ELEVATED_RISK_QUANTILE quantile of ``df['worst area']``.

    Returns
    -------
    str
        'high' for malignant cases (target == 0); 'medium' for other cases
        whose mean radius or worst area exceeds its cut-off; 'low' otherwise.

    Notes
    -----
    Pass precomputed thresholds when labelling many rows; otherwise the
    quantiles are recomputed from the notebook-level ``df`` on every call.
    """
    # High priority: Malignant cases
    if row['target'] == 0:
        return 'high'

    # Fall back to the dataset-wide quantiles when no cut-off was supplied
    if radius_threshold is None:
        radius_threshold = df['mean radius'].quantile(ELEVATED_RISK_QUANTILE)
    if area_threshold is None:
        area_threshold = df['worst area'].quantile(ELEVATED_RISK_QUANTILE)

    # Medium priority: Benign but with at least one elevated risk indicator
    if row['mean radius'] > radius_threshold or row['worst area'] > area_threshold:
        return 'medium'

    # Low priority: Benign with low risk indicators
    return 'low'


# The cut-offs do not depend on the row, so compute them once up front
# instead of once per row inside the apply.
mean_radius_cutoff = df['mean radius'].quantile(ELEVATED_RISK_QUANTILE)
worst_area_cutoff = df['worst area'].quantile(ELEVATED_RISK_QUANTILE)

# Apply the function to create priority labels (extra keyword arguments are
# forwarded by DataFrame.apply to the function on every row)
df['priority'] = df.apply(
    create_priority_labels,
    axis=1,
    radius_threshold=mean_radius_cutoff,
    area_threshold=worst_area_cutoff,
)

# Display priority distribution
print("Priority Distribution:")
print(df['priority'].value_counts())
print("\nPriority Distribution (%):")
print(df['priority'].value_counts(normalize=True) * 100)


In [None]:
# Visualize priority distribution
# value_counts() sorts by frequency, so the bar order is data-dependent.
# Pin an explicit high/medium/low order so each colour is tied to its label
# rather than to whichever class happens to be most common.
priority_colors = {'high': 'red', 'medium': 'orange', 'low': 'green'}
priority_counts = (
    df['priority']
    .value_counts()
    .reindex(list(priority_colors), fill_value=0)  # missing classes plot as 0
)

fig, ax = plt.subplots(figsize=(8, 6))
priority_counts.plot(
    kind='bar',
    color=list(priority_colors.values()),
    rot=0,  # keep the class names horizontal
    ax=ax,
)
ax.set_title('Priority Distribution')
ax.set_xlabel('Priority Level')
ax.set_ylabel('Count')
fig.tight_layout()
plt.show()


### 3.2 Clean and Prepare Features


In [None]:
# Feature matrix: every column except the diagnosis and the derived label
X = df.drop(columns=['target', 'priority'])
y = df['priority']

# Turn the string priorities into integer class codes
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

print("Feature shape:", X.shape)
print("Target shape:", y_encoded.shape)
print("\nLabel mapping:")
# The position of a class in classes_ is its integer code
for code, priority_name in enumerate(label_encoder.classes_):
    print(f"{code}: {priority_name}")


In [None]:
# Rescale every feature column with a StandardScaler fitted on all rows of X
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Wrap the scaled array in a DataFrame so the columns keep their names
X_scaled_df = pd.DataFrame(data=X_scaled, columns=X.columns)

print("Features standardized successfully")
print("\nScaled features statistics:")
X_scaled_df.describe()


### 3.3 Split Data into Training and Testing Sets


In [None]:
# Split the data: 80% training, 20% testing
# The split is done on the *unscaled* features, and the scaler is then fitted
# on the training rows only, so no test-set statistics leak into the
# preprocessing that the model is trained and evaluated on.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded
)

# Learn mean/std from the training rows, then apply the same transform to both
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

print(f"Training set size: {X_train.shape[0]} samples")
print(f"Test set size: {X_test.shape[0]} samples")

# Report the class balance of each split (counts and percentages)
for split_name, split_labels in (("Training", y_train), ("Test", y_test)):
    print(f"\n{split_name} set priority distribution:")
    codes, counts = np.unique(split_labels, return_counts=True)
    names = label_encoder.inverse_transform(codes)
    for name, count in zip(names, counts):
        print(f"  {name}: {count} ({count/len(split_labels)*100:.1f}%)")


## 4. Train Random Forest Model


In [None]:
# Random Forest hyperparameters, collected in one place
rf_params = dict(
    n_estimators=100,
    max_depth=10,
    random_state=42,
    n_jobs=-1,
)
rf_model = RandomForestClassifier(**rf_params)

# Fit the forest on the training split
print("Training Random Forest model...")
rf_model.fit(X_train, y_train)
print("Model training completed!")


In [None]:
# Predict class codes for both splits with the fitted forest
y_train_pred, y_test_pred = (
    rf_model.predict(split_features) for split_features in (X_train, X_test)
)

print("Predictions generated successfully!")


## 5. Model Evaluation


In [None]:
# Horizontal rule reused throughout the printed report
banner = "=" * 60

# Overall accuracy on each split
train_accuracy = accuracy_score(y_true=y_train, y_pred=y_train_pred)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)

# Macro-averaged F1 on each split (multi-class)
train_f1 = f1_score(y_true=y_train, y_pred=y_train_pred, average='macro')
test_f1 = f1_score(y_true=y_test, y_pred=y_test_pred, average='macro')

# One F1 value per class (average=None returns an array instead of a scalar)
train_f1_per_class = f1_score(y_true=y_train, y_pred=y_train_pred, average=None)
test_f1_per_class = f1_score(y_true=y_test, y_pred=y_test_pred, average=None)

print(banner)
print("MODEL PERFORMANCE METRICS")
print(banner)
print("\nTraining Set:")
print(f"  Accuracy: {train_accuracy:.4f} ({train_accuracy*100:.2f}%)")
print(f"  F1-Score (Macro): {train_f1:.4f}")
print("\nTest Set:")
print(f"  Accuracy: {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")
print(f"  F1-Score (Macro): {test_f1:.4f}")
print("\n" + banner)
print("F1-Score per Class (Test Set):")
# Per-class scores are looked up by the class's position in classes_
for class_code, class_name in enumerate(label_encoder.classes_):
    print(f"  {class_name.capitalize()}: {test_f1_per_class[class_code]:.4f}")
print(banner)


In [None]:
# Per-class report for the test split, with class names as row labels
print("\nDetailed Classification Report (Test Set):")
print("=" * 60)
test_report = classification_report(
    y_true=y_test,
    y_pred=y_test_pred,
    target_names=label_encoder.classes_,
    digits=4,
)
print(test_report)


In [None]:
# Confusion matrix of true vs. predicted classes on the test split
cm = confusion_matrix(y_true=y_test, y_pred=y_test_pred)
class_names = label_encoder.classes_

# Annotated heatmap with the class names on both axes
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title('Confusion Matrix - Test Set')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
fig.tight_layout()
plt.show()


In [None]:
# Pair each feature name with the fitted forest's importance score and
# rank from most to least important
feature_importance = (
    pd.DataFrame({
        'feature': X.columns,
        'importance': rf_model.feature_importances_,
    })
    .sort_values(by='importance', ascending=False)
)

print("Top 10 Most Important Features:")
print("=" * 60)
print(feature_importance.head(10).to_string(index=False))

# Horizontal bar chart of the 15 highest-ranked features
top_features = feature_importance.head(15)
bar_positions = range(len(top_features))

fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(bar_positions, top_features['importance'])
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_features['feature'])
ax.set_xlabel('Importance')
ax.set_title('Top 15 Feature Importances')
ax.invert_yaxis()  # put the highest-ranked feature at the top
fig.tight_layout()
plt.show()


## 6. Performance Summary


In [None]:
# Create a summary DataFrame
# The per-class F1 arrays are indexed by encoded class code, and LabelEncoder
# assigns codes in sorted label order (high, low, medium). Hard-coding
# positions 0/1/2 as High/Medium/Low would therefore swap the Medium and Low
# rows, so look each class's code up by name instead.
priority_order = ['high', 'medium', 'low']
class_code = dict(zip(priority_order, label_encoder.transform(priority_order)))

performance_summary = pd.DataFrame({
    'Metric': (
        ['Accuracy', 'F1-Score (Macro)']
        + [f"F1-Score ({name.capitalize()})" for name in priority_order]
    ),
    'Training Set': (
        [f"{train_accuracy:.4f}", f"{train_f1:.4f}"]
        + [f"{train_f1_per_class[class_code[name]]:.4f}" for name in priority_order]
    ),
    'Test Set': (
        [f"{test_accuracy:.4f}", f"{test_f1:.4f}"]
        + [f"{test_f1_per_class[class_code[name]]:.4f}" for name in priority_order]
    ),
})

print("\n" + "=" * 70)
print("FINAL PERFORMANCE METRICS SUMMARY")
print("=" * 70)
print(performance_summary.to_string(index=False))
print("=" * 70)


## 7. Conclusion

This notebook demonstrates:
1. ✅ **Data Preprocessing**: Successfully cleaned, labeled, and split the dataset
2. ✅ **Model Training**: Trained a Random Forest classifier to predict priority levels
3. ✅ **Model Evaluation**: Evaluated using accuracy and F1-score metrics

The model can be used for resource allocation by prioritizing cases based on predicted priority levels.
