In [1]:
import pandas as pd

# Load the cleaned crime records; later cells read the 'Date' and 'TYPE' columns.
df = pd.read_excel(io='FBI_dataset_cleaned.xlsx')

XGBOOST

In [2]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor

# Step 1 — Derive calendar fields and aggregate to monthly counts.
# errors='coerce' turns unparseable dates into NaT, so those rows get a
# missing YEAR/MONTH and are left out of the grouped counts below.
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
parsed_dates = df['Date'].dt
df['YEAR'] = parsed_dates.year
df['MONTH'] = parsed_dates.month

# One row per (YEAR, MONTH, TYPE) holding the number of records in that cell.
group_keys = ['YEAR', 'MONTH', 'TYPE']
monthly_crime = (
    df.groupby(group_keys)
    .size()
    .rename('Crime_Count')
    .reset_index()
)

# Step 2 — Encode TYPE as integer category codes (converted once, used twice).
crime_type_cat = monthly_crime['TYPE'].astype('category').cat
monthly_crime['TYPE_CODE'] = crime_type_cat.codes

# Lookup table TYPE_CODE → TYPE, so predictions can be labelled with names again.
type_map = {code: label for code, label in enumerate(crime_type_cat.categories)}

# Step 3 — Fit a gradient-boosted regressor on calendar + crime-type features.
feature_columns = ['YEAR', 'MONTH', 'TYPE_CODE']
X = monthly_crime[feature_columns]
y = monthly_crime['Crime_Count']

# Hyper-parameters gathered in one place; random_state pins the result.
xgb_settings = {
    'n_estimators': 200,
    'learning_rate': 0.1,
    'max_depth': 5,
    'random_state': 42,
}
model = XGBRegressor(**xgb_settings)
model.fit(X, y)

# Step 4 — Build the prediction grid: every (year, month, crime type) combination.
# The set of type codes does not change while the grid is built, so read it once.
known_type_codes = monthly_crime['TYPE_CODE'].unique()
future_rows = [
    [year, month, tcode]
    for year in [2012, 2013]
    for month in range(1, 13)
    for tcode in known_type_codes
]

future_df = pd.DataFrame(future_rows, columns=['YEAR', 'MONTH', 'TYPE_CODE'])

# Step 5 — Attach the readable TYPE name that belongs to each code.
future_df = future_df.assign(TYPE=future_df['TYPE_CODE'].map(type_map))

# Step 6 — Predict monthly counts for the future grid.
# NOTE(review): YEAR values 2012/2013 presumably lie beyond the years seen in
# training; tree ensembles do not extrapolate a trend, so confirm the forecasts
# are not simply a repeat of the last observed year.
future_df['Crime_Count_Predicted'] = model.predict(future_df[['YEAR', 'MONTH', 'TYPE_CODE']])

# The model is fitted without a custom objective, and the default squared-error
# objective does not constrain predictions to be non-negative. A crime count
# cannot be below zero, so floor at 0 before rounding to whole incidents.
future_df['Crime_Count_Predicted'] = (
    future_df['Crime_Count_Predicted']
    .clip(lower=0)
    .round()
    .astype(int)
)

# Step 7 — Keep only the reporting columns (TYPE_CODE is an internal encoding).
final_df = future_df[['YEAR', 'MONTH', 'TYPE', 'Crime_Count_Predicted']]

print(final_df.head(20))
final_df.to_csv("fbi_predictions_xgboost.csv", index=False)


    YEAR  MONTH                                               TYPE  \
0   2012      1                         Break and Enter Commercial   
1   2012      1                  Break and Enter Residential/Other   
2   2012      1                                           Mischief   
3   2012      1                           Offence Against a Person   
4   2012      1                                        Other Theft   
5   2012      1                                 Theft from Vehicle   
6   2012      1                                   Theft of Bicycle   
7   2012      1                                   Theft of Vehicle   
8   2012      1  Vehicle Collision or Pedestrian Struck (with I...   
9   2012      2                         Break and Enter Commercial   
10  2012      2                  Break and Enter Residential/Other   
11  2012      2                                           Mischief   
12  2012      2                           Offence Against a Person   
13  2012      2     

In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# ==============================
# 1. Prepare Data
# ==============================
# Parse dates (unparseable values become NaT) and bucket each record by calendar month.
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
df['YEAR_MONTH'] = df['Date'].dt.to_period('M')

# Full grid: every month from the first to the last observed month × every crime type.
first_month, last_month = df['YEAR_MONTH'].min(), df['YEAR_MONTH'].max()
all_months = pd.period_range(start=first_month, end=last_month, freq='M')
all_types = df['TYPE'].unique()
all_combinations = pd.MultiIndex.from_product(
    [all_months, all_types], names=['YEAR_MONTH', 'TYPE']
)

# Count records per (month, type); grid cells with no records get an explicit 0.
# The result is ordered chronologically within each crime type so that the
# row-based shift()/rolling() features built later follow calendar order.
monthly_crime_type = (
    df.groupby(['YEAR_MONTH', 'TYPE'])
    .size()
    .reindex(all_combinations, fill_value=0)
    .rename('Crime_Count')
    .reset_index()
    .sort_values(['TYPE', 'YEAR_MONTH'])
    .reset_index(drop=True)
)

# ==============================
# 2. Feature Engineering
# ==============================
def _build_type_features(type_df):
    """Add lag, rolling-mean and calendar features to one crime type's monthly rows.

    ``type_df`` holds the rows of a single crime type; it is modified in place,
    so callers pass a copy. Lags and rolling means follow the frame's existing
    row order. Rows containing any NaN (notably the leading rows that lack a
    full lag history) are dropped and the index is reset.
    """
    counts = type_df['Crime_Count']

    # Lags (short-term + seasonal)
    for lag in (1, 2, 3, 6, 12):
        type_df[f'Lag_{lag}'] = counts.shift(lag)

    # Rolling averages over the months strictly before the current one:
    # shift(1) keeps a month's own count out of its features.
    previous_counts = counts.shift(1)
    for window in (3, 6):
        type_df[f'Rolling_{window}'] = previous_counts.rolling(window=window).mean()

    # Calendar features
    period_parts = type_df['YEAR_MONTH'].dt
    type_df['YEAR'] = period_parts.year
    type_df['MONTH'] = period_parts.month

    return type_df.dropna().reset_index(drop=True)


# One feature frame per crime type, in order of first appearance.
feature_dfs = [
    _build_type_features(
        monthly_crime_type[monthly_crime_type['TYPE'] == crime_type].copy()
    )
    for crime_type in monthly_crime_type['TYPE'].unique()
]

monthly_crime_features = pd.concat(feature_dfs, ignore_index=True)

# ==============================
# 3. Train/Test Split (Check Accuracy)
# ==============================
# Temporal hold-out: fit on every year before 2011, score on 2011.
feature_year = monthly_crime_features['YEAR']
train_data = monthly_crime_features[feature_year < 2011]
test_data = monthly_crime_features[feature_year == 2011]

# Identifier columns and the target are not model inputs. Because TYPE is
# dropped, this is one pooled model across all crime types.
non_feature_cols = ['YEAR_MONTH', 'TYPE', 'Crime_Count']
X_train, y_train = train_data.drop(columns=non_feature_cols), train_data['Crime_Count']
X_test, y_test = test_data.drop(columns=non_feature_cols), test_data['Crime_Count']

# XGBoost Model
xgb_eval_settings = {
    'n_estimators': 500,
    'learning_rate': 0.05,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42,
}
model = XGBRegressor(**xgb_eval_settings)
model.fit(X_train, y_train)

# Evaluate
# NOTE(review): these metrics describe the pooled model fitted here; the
# forecasting section refits separate per-type models, which are not scored.
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("📊 Accuracy Check (2011 test data):")
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.2f}")

# ==============================
# 4. Forecast for 2012 and 2013
# ==============================
# Recursive multi-step forecast: one model per crime type; every prediction is
# written back into that type's history so later months can use it as a lag.
def _count_months_back(counts_by_month, year_month, months_back):
    """Return the count ``months_back`` months before ``year_month``.

    ``counts_by_month`` maps a monthly Period to its (observed or predicted)
    count. When the requested month is not in the history, the count of the
    month immediately before ``year_month`` is used instead.
    """
    fallback = counts_by_month[year_month - 1]
    return counts_by_month.get(year_month - months_back, fallback)


future_predictions = []

for crime_type in monthly_crime_features['TYPE'].unique():
    type_data = monthly_crime_features[monthly_crime_features['TYPE'] == crime_type].copy()

    # Train model on this type's full data
    X_train_type = type_data.drop(['YEAR_MONTH', 'TYPE', 'Crime_Count'], axis=1)
    y_train_type = type_data['Crime_Count']

    model = XGBRegressor(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    )
    model.fit(X_train_type, y_train_type)

    # Month -> count history. A dict gives constant-time lag lookups and avoids
    # re-concatenating and re-scanning a DataFrame on every forecast step.
    counts_by_month = dict(zip(type_data['YEAR_MONTH'], type_data['Crime_Count']))
    last_observed = type_data['YEAR_MONTH'].iloc[-1]

    for step in range(1, 25):  # forecast 24 months
        year_month = last_observed + step

        new_row = {
            "YEAR_MONTH": year_month,
            "TYPE": crime_type,
            "YEAR": year_month.year,
            "MONTH": year_month.month
        }

        # Build lag features
        for lag in [1, 2, 3, 6, 12]:
            new_row[f'Lag_{lag}'] = _count_months_back(counts_by_month, year_month, lag)

        # Rolling averages must match how the training features were built
        # (shift(1).rolling(w).mean()): the mean of the w months immediately
        # before the forecast month. Rolling_6 therefore needs months 1–6 back,
        # not only the months that also exist as lag columns.
        for window in (3, 6):
            new_row[f'Rolling_{window}'] = np.mean([
                _count_months_back(counts_by_month, year_month, k)
                for k in range(1, window + 1)
            ])

        # Match training features (same columns, same order)
        X_new = pd.DataFrame([{col: new_row[col] for col in X_train_type.columns}])

        pred = model.predict(X_new)[0]
        # The regressor's output is unbounded; a count cannot be negative, and a
        # negative value would also contaminate the lags of the following months.
        new_row['Crime_Count'] = max(0, int(round(float(pred))))

        future_predictions.append(new_row)

        # Feed the prediction back as history for the next steps
        counts_by_month[year_month] = new_row['Crime_Count']

# ==============================
# 5. Save Forecast
# ==============================
# Keep only the reporting columns; the lag/rolling features stay out of the export.
forecast_columns = ['YEAR', 'MONTH', 'TYPE', 'Crime_Count']
future_df = pd.DataFrame(future_predictions).loc[:, forecast_columns]

print(future_df.head(20))
future_df.to_csv("fbi_forecast_xgboost.csv", index=False)
