<a href="https://colab.research.google.com/github/RyhLim/BMCS2203-AI-Supervised-Artificial-Intelligence/blob/main/Crime_Prediction_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# @title 🔧 Install Required Packages
!pip install pandas scikit-learn matplotlib seaborn imbalanced-learn --quiet

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from google.colab import files
import warnings
warnings.filterwarnings('ignore')

print("✅ All packages installed and imported successfully!")

✅ All packages installed and imported successfully!


In [None]:
# @title 📁 Upload Crime Dataset
from google.colab import files
import io

# Ask the user for the CSV through Colab's upload widget.
print("Please upload your crime_data.csv file:")
uploaded = files.upload()

# Fail with a clear message instead of an IndexError when nothing was
# uploaded (e.g. the dialog was dismissed).
if not uploaded:
    raise ValueError(
        "No file was uploaded. Re-run this cell and select your crime_data.csv file."
    )

# Get the filename (only the first uploaded file is used)
file_name = next(iter(uploaded))
if len(uploaded) > 1:
    print(f"⚠️ {len(uploaded)} files uploaded; only the first one is used.")
print(f"✅ Uploaded: {file_name}")

# Load the data from the in-memory bytes returned by the upload widget
df = pd.read_csv(io.BytesIO(uploaded[file_name]))
print(f"📊 Dataset shape: {df.shape}")
print("\n📋 Columns:")
print(df.columns.tolist())
print("\n🔍 First 5 rows:")
display(df.head())

Please upload your crime_data.csv file:


In [None]:
# @title 🔍 Explore Dataset
def _print_value_summary(column_name):
    """Print the number of distinct values and the five most frequent ones."""
    print(f"Unique values: {df[column_name].nunique()}")
    print("Value counts:")
    print(df[column_name].value_counts().head())


# Basic facts about the loaded frame.
print("📈 Dataset Info:")
print(f"Shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")

print("\n📊 Missing Values:")
print(df.isnull().sum())

print("\n🎯 Target Variable Analysis (common crime columns):")
crime_columns = ['crime_type', 'Category', 'Primary Type', 'type', 'crime', 'OFFENSE_CODE_GROUP']
for col in crime_columns:
    if col not in df.columns:
        continue
    # Only the first matching candidate is summarised.
    print(f"\n--- {col} ---")
    _print_value_summary(col)
    break
else:
    # No known crime column: fall back to the first text column, if any.
    print("No common crime column found. Showing first categorical column:")
    cat_cols = df.select_dtypes(include=['object']).columns
    if len(cat_cols) > 0:
        col = cat_cols[0]
        print(f"Column: {col}")
        _print_value_summary(col)

In [None]:
# @title 🧹 Data Preprocessing Class
class CrimeDataPreprocessor:
    """Clean, encode, scale, rebalance and split a crime DataFrame.

    The instance works on a private copy of the frame passed to ``__init__``
    and keeps the fitted encoders/scaler so they can be inspected afterwards.
    """

    def __init__(self, data):
        self.data = data.copy()          # never mutate the caller's frame
        self.label_encoders = {}         # column name -> fitted LabelEncoder
        self.scaler = StandardScaler()
        self.target_encoder = None       # set by encode_features()

    def clean_data(self):
        """Drop rows containing any missing value, then drop duplicate rows.

        Returns:
            The cleaned DataFrame (also stored on ``self.data``).
        """
        print("🧹 Cleaning data...")

        # Handle missing values
        initial_shape = self.data.shape
        self.data = self.data.dropna()
        print(f"   Removed {initial_shape[0] - self.data.shape[0]} rows with missing values")

        # Remove duplicates
        self.data = self.data.drop_duplicates()
        print(f"   Removed duplicates, final shape: {self.data.shape}")

        return self.data

    def find_target_column(self):
        """Return the first well-known crime column present in the data.

        Falls back to the first object-dtype column; returns ``None`` when the
        frame has neither.
        """
        crime_columns = ['crime_type', 'Category', 'Primary Type', 'type', 'crime', 'OFFENSE_CODE_GROUP']
        for col in crime_columns:
            if col in self.data.columns:
                return col
        # If no common crime column, use first categorical column
        cat_cols = self.data.select_dtypes(include=['object']).columns
        return cat_cols[0] if len(cat_cols) > 0 else None

    def encode_features(self, target_column):
        """Label-encode every object column and the target.

        Feature columns are encoded in place on ``self.data``; the target
        column itself is left untouched and its encoded values are returned.

        Args:
            target_column: Name of the column to predict.

        Returns:
            Array of integer-encoded target values.
        """
        print("🔤 Encoding categorical features...")

        categorical_cols = self.data.select_dtypes(include=['object']).columns

        for col in categorical_cols:
            if col != target_column:
                le = LabelEncoder()
                self.data[col] = le.fit_transform(self.data[col].astype(str))
                self.label_encoders[col] = le
                print(f"   Encoded: {col}")

        # Encode target variable
        self.target_encoder = LabelEncoder()
        y = self.target_encoder.fit_transform(self.data[target_column])

        print(f"   Target classes: {list(self.target_encoder.classes_)}")
        return y

    def prepare_data(self, target_column=None):
        """Run cleaning, encoding and scaling and return model-ready data.

        Note: the scaler is fitted on the complete dataset, i.e. before any
        train/test split performed by the caller.

        Args:
            target_column: Column to predict; auto-detected when ``None``.

        Returns:
            Tuple ``(X, y, target_encoder)``.

        Raises:
            ValueError: If no target column is given and none can be detected.
            KeyError: If ``target_column`` is not a column of the data.
        """
        # Clean data
        self.clean_data()

        # Find target column if not specified
        if target_column is None:
            target_column = self.find_target_column()
            if target_column is None:
                # Previously this surfaced later as an opaque KeyError(None).
                raise ValueError(
                    "Could not auto-detect a target column; pass target_column explicitly."
                )
            print(f"🎯 Auto-detected target column: {target_column}")

        if target_column not in self.data.columns:
            raise KeyError(
                f"Target column {target_column!r} not found. "
                f"Available columns: {self.data.columns.tolist()}"
            )

        # Encode features and get target
        y = self.encode_features(target_column)
        X = self.data.drop(target_column, axis=1)

        # Scale numerical features (label-encoded columns are numeric by now)
        numerical_cols = X.select_dtypes(include=[np.number]).columns
        if len(numerical_cols) > 0:
            X[numerical_cols] = self.scaler.fit_transform(X[numerical_cols])

        return X, y, self.target_encoder

    def handle_imbalance(self, X, y):
        """Oversample minority classes with SMOTE.

        SMOTE needs ``k_neighbors + 1`` samples in every class, so the
        neighbour count is lowered for small classes (at most the default 5).
        When the smallest class has a single sample the data is returned
        unchanged.

        Returns:
            Tuple ``(X_resampled, y_resampled)``.
        """
        print("⚖️ Handling class imbalance with SMOTE...")
        _, class_counts = np.unique(y, return_counts=True)
        smallest_class = int(class_counts.min())
        if smallest_class < 2:
            print("   ⚠️ Skipping SMOTE: a class has fewer than 2 samples.")
            return X, y

        k_neighbors = min(5, smallest_class - 1)
        if k_neighbors < 5:
            print(f"   Using k_neighbors={k_neighbors} (smallest class has {smallest_class} samples)")
        smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
        X_resampled, y_resampled = smote.fit_resample(X, y)
        print(f"   After SMOTE: X.shape={X_resampled.shape}, y.shape={y_resampled.shape}")
        return X_resampled, y_resampled

    def split_data(self, X, y, test_size=0.2):
        """Return a stratified ``(X_train, X_test, y_train, y_test)`` split."""
        return train_test_split(X, y, test_size=test_size, random_state=42, stratify=y)

In [None]:
# @title 🤖 Machine Learning Models Class
class CrimePredictionModels:
    """Grid-search, fit and evaluate a fixed set of classifiers."""

    def __init__(self):
        # Candidate estimators, keyed by display name.
        self.models = {
            'Random Forest': RandomForestClassifier(random_state=42),
            'SVM': SVC(random_state=42, probability=True),
            'KNN': KNeighborsClassifier()
        }

        # Hyperparameter grid searched for each estimator above.
        self.param_grids = {
            'Random Forest': {
                'n_estimators': [100, 200],
                'max_depth': [10, 20, None],
                'min_samples_split': [2, 5]
            },
            'SVM': {
                'C': [0.1, 1, 10],
                'kernel': ['linear', 'rbf'],
                'gamma': ['scale', 'auto']
            },
            'KNN': {
                'n_neighbors': [3, 5, 7],
                'weights': ['uniform', 'distance']
            }
        }

        self.best_models = {}   # name -> best estimator found by train_models()
        self.cv_results = {}    # name -> GridSearchCV.cv_results_ of that search

    @staticmethod
    def _present_class_names(y_true, y_pred, class_names):
        """Return the labels occurring in y_true/y_pred and their display names.

        Labels that are valid integer positions into ``class_names`` are
        translated through it; otherwise the label itself is used. Names are
        always strings.
        """
        labels = np.unique(np.concatenate([np.asarray(y_true).ravel(),
                                           np.asarray(y_pred).ravel()]))
        names = list(class_names)
        labels_are_positions = (
            labels.size > 0
            and np.issubdtype(labels.dtype, np.integer)
            and labels.min() >= 0
            and labels.max() < len(names)
        )
        if labels_are_positions:
            present_names = [str(names[i]) for i in labels]
        else:
            present_names = [str(label) for label in labels]
        return labels, present_names

    def train_models(self, X_train, y_train, cv=3):
        """Grid-search every model and keep the best estimator of each.

        Args:
            X_train: Training features.
            y_train: Training labels.
            cv: Number of cross-validation folds for the grid search.

        Returns:
            Dict ``name -> {'best_score', 'best_params'}``.
        """
        results = {}

        for name, model in self.models.items():
            print(f"\n🔧 Training {name}...")

            grid_search = GridSearchCV(
                model,
                self.param_grids[name],
                cv=cv,
                scoring='accuracy',
                n_jobs=-1
            )

            grid_search.fit(X_train, y_train)

            self.best_models[name] = grid_search.best_estimator_
            self.cv_results[name] = grid_search.cv_results_

            results[name] = {
                'best_score': grid_search.best_score_,
                'best_params': grid_search.best_params_
            }

            print(f"   ✅ Best CV Accuracy: {grid_search.best_score_:.4f}")
            print(f"   ⚙️ Best parameters: {grid_search.best_params_}")

        return results

    def evaluate_models(self, X_test, y_test, target_encoder):
        """Score every trained model on the test data.

        Args:
            X_test: Test features.
            y_test: Encoded test labels.
            target_encoder: Fitted encoder whose ``classes_`` name the labels.

        Returns:
            Dict ``name -> {'accuracy', 'predictions', 'probabilities', 'model'}``.
        """
        evaluation_results = {}

        for name, model in self.best_models.items():
            print(f"\n📊 Evaluating {name}...")

            y_pred = model.predict(X_test)
            y_pred_proba = model.predict_proba(X_test) if hasattr(model, 'predict_proba') else None

            accuracy = accuracy_score(y_test, y_pred)

            evaluation_results[name] = {
                'accuracy': accuracy,
                'predictions': y_pred,
                'probabilities': y_pred_proba,
                'model': model
            }

            print(f"   ✅ Test Accuracy: {accuracy:.4f}")
            print("   📋 Classification Report:")
            # Report only classes that actually occur: the encoder may know
            # classes that were filtered out earlier, and target_names must
            # match the reported labels one-to-one (and be strings).
            labels, names = self._present_class_names(y_test, y_pred, target_encoder.classes_)
            print(classification_report(y_test, y_pred, labels=labels, target_names=names))

        return evaluation_results

In [None]:
# @title 📊 Visualization Functions
def plot_confusion_matrix(y_true, y_pred, class_names, model_name):
    """Plot a labelled confusion-matrix heatmap.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        class_names: Display names. If there are more names than classes
            present in the data (e.g. rare classes were filtered out), the
            names are selected by label position when the labels are valid
            integer positions, otherwise the labels themselves are shown.
        model_name: Used in the plot title.
    """
    # Labels that actually occur, sorted.
    present = np.unique(np.concatenate([np.asarray(y_true).ravel(),
                                        np.asarray(y_pred).ravel()]))
    tick_names = list(class_names)
    if len(tick_names) != len(present):
        # Without this the tick labels would not line up with the matrix.
        labels_are_positions = (
            present.size > 0
            and np.issubdtype(present.dtype, np.integer)
            and present.min() >= 0
            and present.max() < len(tick_names)
        )
        if labels_are_positions:
            tick_names = [tick_names[i] for i in present]
        else:
            tick_names = [str(label) for label in present]

    cm = confusion_matrix(y_true, y_pred, labels=present)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
               xticklabels=tick_names, yticklabels=tick_names)
    plt.title(f'Confusion Matrix - {model_name}')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.xticks(rotation=45)
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.show()

def plot_feature_importance(model, feature_names, model_name, top_n=10):
    """Bar-plot the most important features of a fitted model.

    Does nothing when the model has no ``feature_importances_`` attribute.

    Args:
        model: Fitted estimator.
        feature_names: Names aligned with ``model.feature_importances_``.
        model_name: Used in the plot title.
        top_n: Maximum number of features to show; clamped to the number of
            features available.
    """
    if not hasattr(model, 'feature_importances_'):
        return

    importances = np.asarray(model.feature_importances_)
    # Clamp so that asking for more bars than features no longer raises a
    # shape-mismatch error in plt.bar.
    n_shown = max(0, min(top_n, len(importances)))
    indices = np.argsort(importances)[::-1][:n_shown]

    plt.figure(figsize=(12, 6))
    plt.title(f"Feature Importance - {model_name}")
    plt.bar(range(n_shown), importances[indices])
    plt.xticks(range(n_shown), [feature_names[i] for i in indices], rotation=45)
    plt.tight_layout()
    plt.show()

def compare_models(evaluation_results):
    """Draw an accuracy bar chart and return the ranking table.

    Args:
        evaluation_results: Dict ``name -> result dict`` where each result
            dict has an ``'accuracy'`` entry.

    Returns:
        DataFrame with columns ``Model`` and ``Accuracy``, best model first.
    """
    rows = [
        {'Model': name, 'Accuracy': outcome['accuracy']}
        for name, outcome in evaluation_results.items()
    ]
    comparison_df = pd.DataFrame(rows).sort_values('Accuracy', ascending=False)

    accuracies = comparison_df['Accuracy']

    plt.figure(figsize=(10, 6))
    plt.bar(comparison_df['Model'], accuracies)
    plt.title('Model Accuracy Comparison')
    plt.ylabel('Accuracy')
    plt.xticks(rotation=45)
    plt.ylim(0, 1)

    # Write each accuracy just above its bar.
    for position, value in zip(range(len(accuracies)), accuracies):
        plt.text(position, value + 0.01, f'{value:.3f}', ha='center')

    plt.tight_layout()
    plt.show()

    return comparison_df

In [None]:
# @title 🛠️ Fix Data Issues - RUN THIS FIRST
print("🛠️ Fixing data issues...")

def filter_rare_classes(y, min_samples=5):
    """Drop samples whose class occurs fewer than ``min_samples`` times.

    Args:
        y: Array of class labels.
        min_samples: Minimum number of occurrences for a class to be kept.

    Returns:
        Tuple ``(y_filtered, mask)`` where ``mask`` is a boolean array marking
        the kept positions of ``y``.
    """
    labels, frequencies = np.unique(y, return_counts=True)
    print(f"Original class distribution: {dict(zip(labels, frequencies))}")

    # Classes that are frequent enough to keep.
    kept_labels = labels[frequencies >= min_samples]

    keep = np.isin(y, kept_labels)
    y_kept = y[keep]
    n_removed = len(y) - len(y_kept)

    print(f"After filtering (min_samples={min_samples}): {len(kept_labels)} classes remaining")
    print(f"Removed {n_removed} samples from rare classes")

    return y_kept, keep

# Apply the fix.
# X and y are created by the main execution cell further down, so on a fresh
# kernel (Restart & Run All) they do not exist yet. Skip with a message
# instead of failing with a NameError; the main execution cell applies the
# same filter itself.
if 'X' in globals() and 'y' in globals():
    y_filtered, mask = filter_rare_classes(y, min_samples=5)
    X_filtered = X[mask]

    print(f"📊 New dataset shape: X={X_filtered.shape}, y={y_filtered.shape}")
else:
    print("⚠️ X and y are not defined yet - run the main execution cell to prepare the data.")

In [None]:
# @title 🚀 Main Execution - Run Complete Project (FIXED)
print("🚀 Starting Crime Prediction Project...")
print("="*50)

# Classes with fewer samples than this are removed. SMOTE (default
# k_neighbors=5) needs at least 6 samples per class in the data it is fitted
# on; 10 per class leaves roughly 8 in the 80% training split.
MIN_SAMPLES_PER_CLASS = 10

# Initialize preprocessor
preprocessor = CrimeDataPreprocessor(df)

# Prepare data
X, y, target_encoder = preprocessor.prepare_data()

# 🛠️ FIX: Remove rare classes
print("\n🛠️ Removing rare crime classes...")
y_filtered, mask = filter_rare_classes(y, min_samples=MIN_SAMPLES_PER_CLASS)
X_filtered = X[mask]

# Split data BEFORE oversampling: resampling the full dataset first would put
# synthetic samples (interpolated from points that end up in the test set)
# into the training data and inflate the reported test accuracy.
X_train, X_test, y_train, y_test = preprocessor.split_data(X_filtered, y_filtered)
print(f"\n📊 Data split: Train={X_train.shape}, Test={X_test.shape}")

# Handle imbalance on the training portion only
X_resampled, y_resampled = preprocessor.handle_imbalance(X_train, y_train)
X_train, y_train = X_resampled, y_resampled

# Train models
print("\n" + "="*50)
print("🤖 TRAINING MODELS")
print("="*50)
model_trainer = CrimePredictionModels()
training_results = model_trainer.train_models(X_train, y_train)

# Evaluate models
print("\n" + "="*50)
print("📊 EVALUATING MODELS")
print("="*50)
evaluation_results = model_trainer.evaluate_models(X_test, y_test, target_encoder)

# Visualizations
print("\n" + "="*50)
print("📈 VISUALIZATIONS")
print("="*50)

# Compare models
comparison_df = compare_models(evaluation_results)
print("\n🏆 Model Comparison:")
display(comparison_df)

# Plot feature importance for Random Forest
if 'Random Forest' in evaluation_results:
    plot_feature_importance(
        evaluation_results['Random Forest']['model'],
        X.columns.tolist(),
        'Random Forest',
        top_n=min(10, X.shape[1])  # never request more bars than features
    )

# Plot confusion matrix for best model
best_model_name = comparison_df.iloc[0]['Model']
y_pred_best = evaluation_results[best_model_name]['predictions']
# Only name the classes that survived filtering, so tick labels line up with
# the rows/columns of the confusion matrix.
present_labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(y_pred_best)]))
plot_confusion_matrix(y_test, y_pred_best, target_encoder.classes_[present_labels], best_model_name)

print("\n🎉 PROJECT COMPLETED SUCCESSFULLY!")
print(f"🏆 Best Model: {best_model_name} (Accuracy: {comparison_df.iloc[0]['Accuracy']:.4f})")

In [None]:
# @title 🔍 Dataset Explorer - Find Better Target
print("🔍 Exploring your dataset to find a better target column...")

print("📊 Dataset shape:", df.shape)
print("📋 Columns:", df.columns.tolist())

print("\n🎯 Checking all columns for potential targets:")
for col in df.columns:
    series = df[col]
    n_unique = series.nunique()

    print(f"\n--- {col} ---")
    print(f"Data type: {series.dtype}")
    print(f"Unique values: {n_unique}")

    # Only low-cardinality columns get their full value counts printed.
    if n_unique >= 20:
        print("Too many unique values for display")
    else:
        print("Value counts:")
        print(series.value_counts())

    # Between 2 and 49 distinct values: plausible classification target.
    if 1 < n_unique < 50:
        print("✅ Potential target column!")
    print("-" * 30)

In [None]:
# @title 🎯 SELECT TARGET: Part 1-2 (Binary Classification)
TARGET_COLUMN = 'Part 1-2'

print(f"🎯 Using target: {TARGET_COLUMN}")
print("Value distribution:")
print(df[TARGET_COLUMN].value_counts())
print(f"Total samples: {len(df)}")

# Check how many classes there are and how many samples each one has.
# The labels are read from the data instead of being hard-coded as 1 and 2,
# and the "binary" message is only printed when it is actually true.
value_counts = df[TARGET_COLUMN].value_counts()
if len(value_counts) == 2:
    print("✅ Perfect for binary classification!")
else:
    print(f"⚠️ Expected 2 classes for binary classification, found {len(value_counts)}.")
for class_label, n_samples in value_counts.sort_index().items():
    print(f"   Class {class_label}: {n_samples} samples")

In [None]:
# @title 🛠️ DATA CLEANING FIX
# Sanity checks on the current X / y: count missing and infinite values, drop
# rows whose target is NaN, and rebind X and y to the filtered versions.
# NOTE(review): relies on X and y left behind by an earlier cell (X must be a
# DataFrame and y numeric for the calls below) - confirm the run order.
print("🛠️ Performing additional data cleaning...")

# Check for any missing values in features
print("Missing values in X:")
print(X.isnull().sum().sum())

# Check for any missing values in target
print("Missing values in y:", np.isnan(y).sum())

# Keep only the rows whose target is not NaN (missing feature values are
# reported above but are not used for filtering)
valid_mask = ~np.isnan(y)
X_clean = X[valid_mask]
y_clean = y[valid_mask]

print(f"After cleaning NaN: X={X_clean.shape}, y={y_clean.shape}")

# Also check for infinite values
print("Infinite values in X:", np.isinf(X_clean.to_numpy()).sum())
print("Infinite values in y:", np.isinf(y_clean).sum())

# Check class distribution after cleaning
unique, counts = np.unique(y_clean, return_counts=True)
print("Clean target distribution:", dict(zip(unique, counts)))

# Update X and y (rebinds the notebook-level names used by other cells)
X = X_clean
y = y_clean

In [None]:
# @title 🛠️ BETTER DATA CLEANING
print("🛠️ Fixing data cleaning issues...")

# Dropping every row with a missing value removes most of the data, so this
# cell drops sparse columns and fills the remaining gaps instead.

# 1. Share of missing values per column, in percent
print("Missing values per column:")
missing_percent = df.isnull().sum().div(len(df)).mul(100)
print(missing_percent.sort_values(ascending=False))

# 2. Columns that are more than half empty are removed entirely
columns_to_drop = missing_percent.index[missing_percent > 50].tolist()
print(f"\nDropping columns with >50% missing: {columns_to_drop}")
df_clean = df.drop(columns=columns_to_drop)

# 3. Remaining gaps are filled: mode for text columns, median otherwise
print("\nFilling missing values...")
for col in df_clean.columns:
    column = df_clean[col]
    if not column.isnull().any():
        continue
    fill_value = column.mode()[0] if column.dtype == 'object' else column.median()
    df_clean[col] = column.fillna(fill_value)

print(f"✅ Cleaned dataset shape: {df_clean.shape}")
print("Missing values after cleaning:", df_clean.isnull().sum().sum())

In [None]:
# @title 🚀 MAIN EXECUTION WITH Part 1-2 TARGET
from sklearn.linear_model import LogisticRegression # Import LogisticRegression

print("🚀 Starting Crime Prediction with Part 1-2 Target...")
print("="*60)

# Initialize preprocessor with our target
# NOTE(review): this uses the raw `df`, so prepare_data() drops every row with
# a missing value; the imputed `df_clean` built in the previous cell is not
# used here - confirm whether that is intended.
preprocessor = CrimeDataPreprocessor(df)
X, y, target_encoder = preprocessor.prepare_data(target_column='Part 1-2')

print(f"📊 Dataset prepared: X={X.shape}, y={y.shape}")
unique_classes, class_counts = np.unique(y, return_counts=True)
print("🎯 Target distribution:", dict(zip(unique_classes, class_counts)))

# Split data FIRST, so that the test set contains only real samples and no
# synthetic sample derived from a test point ends up in the training data.
# Stratification needs at least 2 samples per class.
if min(class_counts) >= 2:
    X_train, X_test, y_train, y_test = preprocessor.split_data(X, y, test_size=0.2)
    print(f"📊 Data split (stratified): Train={X_train.shape}, Test={X_test.shape}")
else:
    print("\n⚠️ Skipping stratification: Not enough samples per class.")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print(f"📊 Data split (no stratification): Train={X_train.shape}, Test={X_test.shape}")

# Oversample the TRAINING data only.
# SMOTE requires at least n_neighbors + 1 samples in the minority class.
min_samples_for_smote = 6 # Default n_neighbors is 5, so need 5+1 = 6 samples
train_class_counts = np.unique(y_train, return_counts=True)[1]
if len(train_class_counts) < 2 or min(train_class_counts) < min_samples_for_smote:
    print(f"\n⚠️ Skipping SMOTE: Minority class has less than {min_samples_for_smote} samples.")
    X_resampled, y_resampled = X_train, y_train # Use original data if SMOTE is not possible
else:
    print()
    X_resampled, y_resampled = preprocessor.handle_imbalance(X_train, y_train)
    print(f"   After SMOTE: X={X_resampled.shape}, y={y_resampled.shape}")
X_train, y_train = X_resampled, y_resampled


# Train models
print("\n" + "="*50)
print("🤖 TRAINING MODELS")
print("="*50)

# Use simpler models without grid search (for small dataset)
models = {
    'Random Forest': RandomForestClassifier(n_estimators=50, random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=3)
}

evaluation_results = {}

for name, model in models.items():
    print(f"🔧 Training {name}...")
    model.fit(X_train, y_train)

    # Evaluate on the untouched test split
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    evaluation_results[name] = {
        'accuracy': accuracy,
        'predictions': y_pred,
        'model': model
    }

    print(f"   ✅ Test Accuracy: {accuracy:.4f}")

# Visualizations
print("\n" + "="*50)
print("📈 VISUALIZATIONS")
print("="*50)

# Compare models (reuses the notebook's plotting helper instead of repeating it)
comparison_df = compare_models(evaluation_results)

print("\n🏆 Model Comparison:")
display(comparison_df)

# Plot feature importance for Random Forest (at most 8 bars)
if 'Random Forest' in evaluation_results:
    rf_model = evaluation_results['Random Forest']['model']
    plot_feature_importance(
        rf_model,
        X.columns.tolist(),
        'Random Forest',
        top_n=min(8, X.shape[1])
    )

print("\n🎉 PROJECT COMPLETED SUCCESSFULLY!")
best_model = comparison_df.iloc[0]
print(f"🏆 Best Model: {best_model['Model']} (Accuracy: {best_model['Accuracy']:.4f})")

In [None]:
# @title 🏆 PROFESSIONAL DATA PREPROCESSING PIPELINE
# Start of the "professional" pipeline: copy the raw frame, report the
# percentage of missing values per column and show them as a heatmap.
print("🏆 Implementing Professional Data Preprocessing...")
print("="*60)

# Make a copy of the original data to preserve it
df_pro = df.copy()
print(f"📊 Original dataset shape: {df_pro.shape}")

# 1. 📈 EXPLORATORY DATA ANALYSIS (EDA)
print("\n" + "="*50)
print("🔍 STEP 1: EXPLORATORY DATA ANALYSIS")
print("="*50)

# Missing value analysis: percentage of missing cells per column, presented
# as a two-column table sorted with the sparsest columns first
missing_percent = (df_pro.isnull().sum() / len(df_pro)) * 100
missing_df = pd.DataFrame({
    'column': missing_percent.index,
    'missing_percentage': missing_percent.values
}).sort_values('missing_percentage', ascending=False)

print("📉 Missing Value Analysis:")
display(missing_df.head(10))

# Visualize missing values: one heatmap cell per DataFrame cell, coloured by
# whether the value is missing (row labels are suppressed)
plt.figure(figsize=(12, 8))
sns.heatmap(df_pro.isnull(), cbar=False, cmap='viridis', yticklabels=False)
plt.title('Missing Values Heatmap', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

# 2. 🧹 SMART DATA CLEANING STRATEGY
print("\n" + "="*50)
print("🧹 STEP 2: SMART DATA CLEANING")
print("="*50)

# Strategy: Different handling for different missing value percentages
def smart_missing_value_handling(df, threshold_drop=70, threshold_rare=5):
    """Drop very sparse columns and classify the rest by missingness.

    This function does not impute anything itself; it only removes columns
    and reports which remaining columns still contain missing values.

    Args:
        df: Input DataFrame (not modified).
        threshold_drop: Columns with more than this percentage of missing
            values are dropped.
        threshold_rare: Columns with missing values up to this percentage
            are reported as "few"; above it (up to ``threshold_drop``) as
            "moderate".

    Returns:
        Tuple ``(df_clean, cols_moderate, cols_few)``: the frame without the
        dropped columns, and two lists of column names. Columns without any
        missing value appear in neither list.
    """
    df_clean = df.copy()
    missing_percent = (df_clean.isnull().sum() / len(df_clean)) * 100

    # Drop columns with too many missing values
    cols_to_drop = missing_percent[missing_percent > threshold_drop].index.tolist()
    df_clean = df_clean.drop(columns=cols_to_drop)
    print(f"❌ Dropped {len(cols_to_drop)} columns with >{threshold_drop}% missing: {cols_to_drop}")

    # Moderate missingness: candidates for model-based imputation
    cols_moderate = missing_percent[(missing_percent > threshold_rare) &
                                  (missing_percent <= threshold_drop)].index.tolist()

    # Few missing values: candidates for simple imputation. Complete columns
    # (0% missing) need no imputation and are excluded.
    cols_few = missing_percent[(missing_percent > 0) &
                             (missing_percent <= threshold_rare)].index.tolist()

    return df_clean, cols_moderate, cols_few

# Apply smart cleaning
# NOTE: this rebinds the notebook-level name `df_clean`.
df_clean, moderate_cols, few_cols = smart_missing_value_handling(df_pro)

print(f"✅ Columns for advanced imputation ({len(moderate_cols)}): {moderate_cols}")
print(f"✅ Columns for simple imputation ({len(few_cols)}): {few_cols}")

# 3. 🎯 ADVANCED IMPUTATION TECHNIQUES
print("\n" + "="*50)
print("🎯 STEP 3: ADVANCED IMPUTATION TECHNIQUES")
print("="*50)

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer
import warnings
warnings.filterwarnings('ignore')

numerical_cols = df_clean.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = df_clean.select_dtypes(include=['object']).columns.tolist()

print("🔧 Applying advanced imputation techniques...")

# MICE (IterativeImputer) for numerical columns.
# All numerical columns are used as predictors, but only the columns that
# actually had gaps are written back: the imputer returns a float array, and
# overwriting complete columns would needlessly turn integer columns
# (identifiers, codes, the target) into floats.
numerical_with_gaps = [c for c in numerical_cols if df_clean[c].isnull().any()]
if numerical_with_gaps:
    mice_imputer = IterativeImputer(random_state=42, max_iter=10)
    imputed_numeric = pd.DataFrame(
        mice_imputer.fit_transform(df_clean[numerical_cols]),
        columns=numerical_cols,
        index=df_clean.index,
    )
    df_clean[numerical_with_gaps] = imputed_numeric[numerical_with_gaps]
    print("✅ Applied MICE imputation for numerical columns")
elif numerical_cols:
    print("✅ No missing numerical values - MICE imputation not needed")

# Mode imputation for categorical columns (no KNN imputation is performed)
if categorical_cols:
    for col in categorical_cols:
        if df_clean[col].isnull().sum() > 0:
            df_clean[col] = df_clean[col].fillna(df_clean[col].mode()[0])
    print("✅ Applied mode imputation for categorical columns")

# 4. 📊 FEATURE ENGINEERING
print("\n" + "="*50)
print("📊 STEP 4: FEATURE ENGINEERING")
print("="*50)

# Create new features from existing data
def create_new_features(df):
    """Derive date, time-of-day and age-group features.

    Works on a copy; columns that are absent are simply skipped.

    - 'Date Rptd' / 'DATE OCC': replaced by ``<col>_year``, ``<col>_month``,
      ``<col>_day`` and ``<col>_dayofweek`` (unparseable values become NaN).
    - 'TIME OCC': adds ``hour_of_day``, ``is_night`` (hour >= 20 or <= 6) and
      ``time_of_day``.
    - 'Vict Age': adds ``age_group``.

    Args:
        df: Input DataFrame.

    Returns:
        A new DataFrame with the engineered columns.
    """
    df_eng = df.copy()

    # Extract time features from date columns
    date_columns = ['Date Rptd', 'DATE OCC']
    for col in date_columns:
        if col not in df_eng.columns:
            continue
        try:
            parsed = pd.to_datetime(df_eng[col], errors='coerce')
        except (ValueError, TypeError) as exc:
            # Report instead of silently swallowing; the column is left as is.
            print(f"⚠️ Could not parse '{col}' as dates, leaving it unchanged: {exc}")
            continue
        df_eng[f'{col}_year'] = parsed.dt.year
        df_eng[f'{col}_month'] = parsed.dt.month
        df_eng[f'{col}_day'] = parsed.dt.day
        df_eng[f'{col}_dayofweek'] = parsed.dt.dayofweek
        df_eng = df_eng.drop(col, axis=1)

    # Create time-based features
    if 'TIME OCC' in df_eng.columns:
        time_occ = df_eng['TIME OCC']
        if time_occ.max() > 23:
            # Values above 23 cannot be plain hours; they are treated as
            # HHMM clock times (e.g. 2130 -> 21), for which `% 24` would
            # give a meaningless number.
            # NOTE(review): presumably 24-hour HHMM - verify against the data.
            df_eng['hour_of_day'] = (time_occ // 100) % 24
        else:
            df_eng['hour_of_day'] = time_occ % 24
        df_eng['is_night'] = ((df_eng['hour_of_day'] >= 20) | (df_eng['hour_of_day'] <= 6)).astype(int)
        # include_lowest=True so that hour 0 falls into 'Night' instead of NaN
        df_eng['time_of_day'] = pd.cut(df_eng['hour_of_day'],
                                     bins=[0, 6, 12, 18, 24],
                                     labels=['Night', 'Morning', 'Afternoon', 'Evening'],
                                     include_lowest=True)

    # Create age groups if victim age exists.
    # Ages <= 0 or > 100 fall outside the bins and become NaN.
    # NOTE(review): confirm whether age 0 is a real age or a placeholder.
    if 'Vict Age' in df_eng.columns:
        df_eng['age_group'] = pd.cut(df_eng['Vict Age'],
                                   bins=[0, 18, 30, 50, 100],
                                   labels=['Child', 'Young Adult', 'Adult', 'Senior'])

    return df_eng

# Build the engineered frame and report how many columns were added
# (the difference in column count between the engineered and cleaned frames).
df_engineered = create_new_features(df_clean)
print(f"✅ Created {len(df_engineered.columns) - len(df_clean.columns)} new features!")
print(f"📊 Engineered dataset shape: {df_engineered.shape}")

# 5. 🎯 TARGET SPECIFICATION
print("\n" + "="*50)
print("🎯 STEP 5: TARGET SPECIFICATION & FINAL PREP")
print("="*50)

# Use Part 1-2 as target
target_column = 'Part 1-2'

if target_column in df_engineered.columns:
    # Check class balance (ratio of the largest to the smallest class)
    class_distribution = df_engineered[target_column].value_counts()
    print("🎯 Target Distribution:")
    print(class_distribution)
    print(f"📈 Class Balance Ratio: {class_distribution.max()/class_distribution.min():.2f}:1")

    # Prepare final dataset (rebinds the notebook-level X and y)
    X = df_engineered.drop(target_column, axis=1)
    y = df_engineered[target_column]

    print(f"✅ Final dataset prepared: X={X.shape}, y={y.shape}")

    # Count the object-dtype columns; no encoding is performed in this cell
    categorical_cols = X.select_dtypes(include=['object']).columns
    print(f"🔤 {len(categorical_cols)} categorical columns to encode")

else:
    # X and y are left as they were before this cell in this case
    print(f"❌ Target column '{target_column}' not found!")

print("\n🎉 PROFESSIONAL PREPROCESSING COMPLETED!")
print("="*60)

In [None]:
# @title 🚀 ADVANCED MODELING PIPELINE
print("🚀 Starting Advanced Modeling Pipeline...")
print("="*60)

# 1. 🎯 ADVANCED ENCODING
print("🔤 Applying advanced encoding techniques...")

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Separate features.
# 'category' dtype is included so that the engineered pd.cut() columns
# (time_of_day, age_group) are one-hot encoded; they are neither numeric nor
# object dtype and would otherwise be silently dropped by the
# ColumnTransformer, whose default is to discard unlisted columns.
numerical_cols = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

print(f"📊 Numerical features: {len(numerical_cols)}")
print(f"📊 Categorical features: {len(categorical_cols)}")

# Create preprocessing pipeline: scale numeric columns, one-hot encode the rest.
# NOTE: this rebinds the notebook-level name `preprocessor`.
# NOTE(review): dense one-hot output can be very large for high-cardinality
# text columns - check the categorical columns' cardinality.
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols)
])

# 2. 🎯 ADVANCED CLASS IMBALANCE HANDLING
print("\n⚖️ Handling class imbalance with advanced techniques...")

from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import make_pipeline as make_imbalance_pipeline

# Samplers for the training stage
smote = SMOTE(random_state=42, k_neighbors=5)
under = RandomUnderSampler(random_state=42)

# 3. 🎯 ADVANCED MODEL SELECTION
print("\n🤖 Setting up advanced model ensemble...")

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold

# Define advanced models (all seeded for reproducibility)
advanced_models = {
    'XGBoost': XGBClassifier(random_state=42, eval_metric='logloss'),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': SVC(random_state=42, probability=True),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000)
}

# 4. 🎯 COMPREHENSIVE EVALUATION
print("\n📊 Setting up comprehensive evaluation...")

from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                           f1_score, roc_auc_score, confusion_matrix,
                           classification_report, roc_curve, auc)

# Create evaluation function
def comprehensive_evaluation(model, X_test, y_test, model_name):
    """Score a fitted classifier on the given data with several metrics.

    Computes accuracy and weighted precision/recall/F1, ROC-AUC from the
    second ``predict_proba`` column when the model offers probabilities, and
    the mean/std accuracy of a 5-fold stratified cross-validation that is
    run on ``X_test``/``y_test``.

    Args:
        model: Fitted estimator.
        X_test: Features to evaluate on.
        y_test: True labels for ``X_test``.
        model_name: Display name (not used by the computation).

    Returns:
        Tuple ``(metrics, y_pred, y_proba)``; ``y_proba`` is ``None`` when
        the model has no ``predict_proba``.
    """
    predicted = model.predict(X_test)
    if hasattr(model, 'predict_proba'):
        positive_scores = model.predict_proba(X_test)[:, 1]
    else:
        positive_scores = None

    scores = {'Accuracy': accuracy_score(y_test, predicted)}
    # The three weighted-average metrics share the same call signature.
    for label, scorer in (('Precision', precision_score),
                          ('Recall', recall_score),
                          ('F1-Score', f1_score)):
        scores[label] = scorer(y_test, predicted, average='weighted')

    if positive_scores is not None:
        scores['ROC-AUC'] = roc_auc_score(y_test, positive_scores)

    # Cross-validation scores
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    fold_accuracy = cross_val_score(model, X_test, y_test, cv=folds, scoring='accuracy')
    scores['CV Mean'] = fold_accuracy.mean()
    scores['CV Std'] = fold_accuracy.std()

    return scores, predicted, positive_scores

# 5. 🎯 MAIN TRAINING LOOP
print("\n" + "="*50)
print("🏋️ TRAINING ADVANCED MODELS")
print("="*50)

# Encode the target as 0..n_classes-1. XGBClassifier rejects other label
# values (such as 1/2), and this also normalises float labels.
target_label_encoder = LabelEncoder()
y_encoded = target_label_encoder.fit_transform(y)
print(f"🎯 Target classes: {list(target_label_encoder.classes_)}")

# Train-test split on the RAW data first, so that neither the scaler/encoder
# statistics nor SMOTE's synthetic samples can leak test information.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Preprocess: fit on the training portion only, then apply to the test portion
X_processed = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)
feature_names = (numerical_cols +
                list(preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_cols)))

print(f"📊 Processed features: {X_processed.shape}")

# Handle imbalance on the training data only; the test set keeps its
# real class distribution
X_resampled, y_resampled = smote.fit_resample(X_processed, y_train_raw)
print(f"⚖️ After SMOTE: {X_resampled.shape}, {y_resampled.shape}")
X_train, y_train = X_resampled, y_resampled

print(f"📊 Final split: Train={X_train.shape}, Test={X_test.shape}")

# Train and evaluate all models
results = {}
predictions = {}

for name, model in advanced_models.items():
    print(f"\n🔧 Training {name}...")

    # Train model
    model.fit(X_train, y_train)

    # Comprehensive evaluation
    metrics, y_pred, y_proba = comprehensive_evaluation(model, X_test, y_test, name)
    results[name] = metrics
    predictions[name] = (y_pred, y_proba)

    print(f"   ✅ Accuracy: {metrics['Accuracy']:.4f}")
    print(f"   📊 F1-Score: {metrics['F1-Score']:.4f}")
    if 'ROC-AUC' in metrics:
        print(f"   📈 ROC-AUC: {metrics['ROC-AUC']:.4f}")

# 6. 📊 COMPREHENSIVE RESULTS ANALYSIS
print("\n" + "="*50)
print("📈 COMPREHENSIVE RESULTS ANALYSIS")
print("="*50)

# Create results dataframe (one row per model, best accuracy first)
results_df = pd.DataFrame(results).T
results_df = results_df.sort_values('Accuracy', ascending=False)

print("🏆 Model Performance Comparison:")
# Only highlight columns that exist (ROC-AUC is absent when no model exposes
# predict_proba), otherwise the styler raises a KeyError.
highlight_cols = [c for c in ['Accuracy', 'F1-Score', 'ROC-AUC'] if c in results_df.columns]
display(results_df.style.background_gradient(cmap='viridis', subset=highlight_cols))

# Visualize results
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Accuracy comparison
axes[0,0].bar(results_df.index, results_df['Accuracy'], color='skyblue')
axes[0,0].set_title('Model Accuracy Comparison', fontweight='bold')
axes[0,0].set_ylabel('Accuracy')
axes[0,0].tick_params(axis='x', rotation=45)

# F1-Score comparison
axes[0,1].bar(results_df.index, results_df['F1-Score'], color='lightgreen')
axes[0,1].set_title('Model F1-Score Comparison', fontweight='bold')
axes[0,1].set_ylabel('F1-Score')
axes[0,1].tick_params(axis='x', rotation=45)

# ROC-AUC comparison (if available)
if 'ROC-AUC' in results_df.columns:
    axes[1,0].bar(results_df.index, results_df['ROC-AUC'], color='salmon')
    axes[1,0].set_title('Model ROC-AUC Comparison', fontweight='bold')
    axes[1,0].set_ylabel('ROC-AUC')
    axes[1,0].tick_params(axis='x', rotation=45)

# CV Stability
axes[1,1].errorbar(results_df.index, results_df['CV Mean'],
                  yerr=results_df['CV Std'], fmt='o', capsize=5)
axes[1,1].set_title('Cross-Validation Stability', fontweight='bold')
axes[1,1].set_ylabel('CV Accuracy ± Std')
axes[1,1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

print("🎉 ADVANCED MODELING COMPLETED SUCCESSFULLY!")
best_model = results_df.iloc[0]
print(f"🏆 BEST MODEL: {best_model.name} (Accuracy: {best_model['Accuracy']:.4f}, F1: {best_model['F1-Score']:.4f})")