Inference

In [4]:
import pandas as pd
test_df = pd.read_csv("raw/test.csv",sep=",")
train_df = pd.read_csv("raw/train.csv",sep=",")
test_df.head()

Unnamed: 0,YEAR,MONTH,TYPE,Incident_Counts
0,2013,6,Vehicle Collision or Pedestrian Struck (with I...,
1,2013,6,Theft of Vehicle,
2,2013,6,Theft of Bicycle,
3,2013,6,Theft from Vehicle,
4,2013,6,Other Theft,


In [5]:
test_df.shape

(162, 4)

In [6]:
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Encode categorical columns using the same LabelEncoder used for training
le_TYPE = LabelEncoder()
test_df['TYPE_encoded'] = le_TYPE.fit_transform(test_df['TYPE'])


# Create cyclical features
test_df['month_sin'] = np.sin(2 * np.pi * test_df['MONTH'] / 12)
test_df['month_cos'] = np.cos(2 * np.pi * test_df['MONTH'] / 12)




In [7]:

# Combine last few months from training with test
combined_df = pd.concat([train_df[['YEAR', 'MONTH', 'TYPE']], test_df], sort=False)

In [8]:
# Sort and group
combined_df.sort_values(['TYPE', 'YEAR', 'MONTH'], inplace=True)
combined_df['lag_1'] = combined_df.groupby('TYPE')['Incident_Counts'].shift(1)
combined_df['rolling_mean_3'] = combined_df.groupby('TYPE')['Incident_Counts'].shift(1).rolling(3).mean().reset_index(0, drop=True)


In [9]:

# Get the test rows again (they will have NaNs if they are too early in the series)
test_processed = combined_df[combined_df['Incident_Counts'].isna()].copy()


In [10]:
test_processed[['lag_1', 'rolling_mean_3']] = test_processed[['lag_1', 'rolling_mean_3']].fillna(0)


In [11]:
test_processed.tail()

Unnamed: 0,YEAR,MONTH,TYPE,Incident_Counts,TYPE_encoded,month_sin,month_cos,lag_1,rolling_mean_3
36,2013,2,Vehicle Collision or Pedestrian Struck (with I...,,8.0,0.8660254,0.5,0.0,0.0
27,2013,3,Vehicle Collision or Pedestrian Struck (with I...,,8.0,1.0,6.123234000000001e-17,0.0,0.0
18,2013,4,Vehicle Collision or Pedestrian Struck (with I...,,8.0,0.8660254,-0.5,0.0,0.0
9,2013,5,Vehicle Collision or Pedestrian Struck (with I...,,8.0,0.5,-0.8660254,0.0,0.0
0,2013,6,Vehicle Collision or Pedestrian Struck (with I...,,8.0,1.224647e-16,-1.0,0.0,0.0


In [12]:
print("Train columns:", train_df.columns.tolist())
print("Test columns:", test_df.columns.tolist())


Train columns: ['TYPE', 'HUNDRED_BLOCK', 'NEIGHBOURHOOD', 'X', 'Y', 'Latitude', 'Longitude', 'HOUR', 'MINUTE', 'YEAR', 'MONTH', 'DAY', 'Date']
Test columns: ['YEAR', 'MONTH', 'TYPE', 'Incident_Counts', 'TYPE_encoded', 'month_sin', 'month_cos']


In [13]:
def patch_test_df(test_df):
    patched_df = test_df.copy()

    # Add default values for missing columns
    if 'Latitude' not in patched_df.columns:
        patched_df['Latitude'] = 0.0
    if 'Longitude' not in patched_df.columns:
        patched_df['Longitude'] = 0.0
    if 'NEIGHBOURHOOD' not in patched_df.columns:
        patched_df['NEIGHBOURHOOD'] = 'Unknown'
    if 'TYPE' not in patched_df.columns:
        raise ValueError("Test data must include 'TYPE' column.")
    if 'YEAR' not in patched_df.columns or 'month' not in patched_df.columns:
        if 'Date' in patched_df.columns:
            patched_df['DATE'] = pd.to_datetime(patched_df['DATE'], errors='coerce')
            patched_df['YEAR'] = patched_df['DATE'].dt.year
            patched_df['MONTH'] = patched_df['DATE'].dt.month
        else:
            raise ValueError("Test data must have 'YEAR' and 'MONTH', or a 'DATE' column.")

    return patched_df


In [14]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

# STEP 1: Encode TYPE
le_TYPE = LabelEncoder()
train_df['TYPE_encoded'] = le_TYPE.fit_transform(train_df['TYPE'])

# STEP 2: Prepare Training Features
def build_training_features(df):
    df = df.copy()
    
    # Frequency encode TYPE
    crime_counts = df['TYPE'].value_counts()
    df['crime_type_freq'] = df['TYPE'].map(crime_counts)
    
    # KMeans cluster for coordinates
    coords = df[['Latitude', 'Longitude']].dropna()
    kmeans = KMeans(n_clusters=20, random_state=42).fit(coords)
    df['location_cluster'] = kmeans.predict(df[['Latitude', 'Longitude']].fillna(0))
    
    # Neighborhood crime count
    neighborhood_crime = df.groupby('NEIGHBOURHOOD').size().reset_index(name='nhood_crime_count')
    df = df.merge(neighborhood_crime, on='NEIGHBOURHOOD', how='left')
    
    # Monthly aggregation
    df_monthly = df.groupby(['YEAR', 'MONTH', 'TYPE']).size().reset_index(name='incident_count')
    df_monthly.sort_values(['TYPE', 'YEAR', 'MONTH'], inplace=True)
    df_monthly['lag_1'] = df_monthly.groupby('TYPE')['incident_count'].shift(1).fillna(0)
    df_monthly['rolling_mean_3'] = (
        df_monthly.groupby('TYPE')['incident_count']
        .shift(1).rolling(3).mean().reset_index(0, drop=True).fillna(0)
    )

    # Merge with encodings
    df = df.merge(df_monthly, on=['YEAR', 'MONTH', 'TYPE'], how='inner')
    df = df[['lag_1', 'rolling_mean_3', 'crime_type_freq', 'location_cluster', 'nhood_crime_count', 'TYPE_encoded', 'incident_count']]
    df = df.fillna(0)
    
    return df

Xy = build_training_features(train_df)
X_train = Xy.drop('incident_count', axis=1)
y_train = Xy['incident_count']

# STEP 3: Train the model
model = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
model.fit(X_train, y_train)

# STEP 4: Use your function to predict on test data
def prepare_test_data_in_memory(train_df, test_df, le_TYPE, model):
    test_df = test_df.copy()
    test_df['TYPE_encoded'] = le_TYPE.transform(test_df['TYPE'])
    
    crime_counts = train_df['TYPE'].value_counts()
    test_df['crime_type_freq'] = test_df['TYPE'].map(crime_counts).fillna(crime_counts.min())
    
    if {'Latitude', 'Longitude'}.issubset(test_df.columns):
        coords = train_df[['Latitude', 'Longitude']].dropna()
        kmeans = KMeans(n_clusters=20, random_state=42).fit(coords)
        test_coords = test_df[['Latitude', 'Longitude']].fillna(0)
        test_df['location_cluster'] = kmeans.predict(test_coords)
    else:
        test_df['location_cluster'] = 0
    
    if 'NEIGHBOURHOOD' in test_df.columns:
        neighborhood_crime = train_df.groupby('NEIGHBOURHOOD').size().reset_index(name='nhood_crime_count')
        test_df = test_df.merge(neighborhood_crime, on='NEIGHBOURHOOD', how='left')
        test_df['nhood_crime_count'] = test_df['nhood_crime_count'].fillna(0)
    else:
        test_df['nhood_crime_count'] = 0

    train_monthly = train_df.groupby(['YEAR', 'MONTH', 'TYPE']).size().reset_index(name='incident_count')
    test_df['incident_count'] = np.nan

    combined = pd.concat([
        train_monthly,
        test_df[['YEAR', 'MONTH', 'TYPE', 'incident_count']]
    ], sort=False)

    combined.sort_values(['TYPE', 'YEAR', 'MONTH'], inplace=True)
    combined['lag_1'] = combined.groupby('TYPE')['incident_count'].shift(1)
    combined['rolling_mean_3'] = (
        combined.groupby('TYPE')['incident_count']
        .shift(1).rolling(3).mean().reset_index(0, drop=True)
    )

    enriched_test = combined[combined['incident_count'].isna()].copy()
    enriched_test = enriched_test.merge(test_df, on=['YEAR', 'MONTH', 'TYPE'], how='left')
    enriched_test['lag_1'] = enriched_test['lag_1'].fillna(0)
    enriched_test['rolling_mean_3'] = enriched_test['rolling_mean_3'].fillna(0)

    final_features = [
        'lag_1', 'rolling_mean_3', 'crime_type_freq',
        'location_cluster', 'nhood_crime_count', 'TYPE_encoded'
    ]
    enriched_test[final_features] = enriched_test[final_features].fillna(0)
    enriched_test['predicted_incident_count'] = model.predict(enriched_test[final_features])
    
    return enriched_test[['YEAR', 'MONTH', 'TYPE', 'predicted_incident_count']]

# STEP 5: Predict on test_df
result_df = prepare_test_data_in_memory(train_df, test_df, le_TYPE, model)
print(result_df.head())


   YEAR  MONTH                        TYPE  predicted_incident_count
0  2012      1  Break and Enter Commercial                224.976456
1  2012      2  Break and Enter Commercial                249.340000
2  2012      3  Break and Enter Commercial                249.340000
3  2012      4  Break and Enter Commercial                249.340000
4  2012      5  Break and Enter Commercial                249.340000


In [None]:
result_df = prepare_test_data_in_memory(train_df, test_df, le_TYPE, model)
print(result_df.head())


   YEAR  MONTH                        TYPE  predicted_incident_count
0  2012      1  Break and Enter Commercial                224.976456
1  2012      2  Break and Enter Commercial                249.340000
2  2012      3  Break and Enter Commercial                249.340000
3  2012      4  Break and Enter Commercial                249.340000
4  2012      5  Break and Enter Commercial                249.340000


In [None]:
result_df[['YEAR', 'MONTH', 'TYPE', 'predicted_incident_count']].to_csv("predictions.csv", index=False)
