In [None]:
!pip install -q pandas lightgbm matplotlib joblib holidays
print("Libraries Installed.")

from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import pandas as pd
import holidays
import joblib

print("Libraries loaded.")

In [None]:
# Load Data
# Each row of the CSV is counted as one crime further down; the separate
# year/month/day columns are combined into a single datetime column.
df = pd.read_csv("combined_data.csv")
df['date'] = pd.to_datetime(df[['year', 'month', 'day']])

# Extract City Stats (1 row per City/Year)
# The max is taken over ALL rows of a city/year (max skips NaN), so a missing
# value in the first row of a group cannot blank out that year's statistic.
stats_cols = ['population', 'total_officers', 'officers_per_1000_people', 'crime_rate_per_1000_people']
city_stats = df.groupby(['city', 'year'], as_index=False)[stats_cols].max()

# Aggregate Daily Crimes
daily_crimes = df.groupby(['date', 'city']).size().reset_index(name='crime_count')

# Fill Missing Days with 0s
# Build the full (date x city) grid so days without any recorded crime appear
# explicitly with crime_count == 0 instead of being absent.
all_dates = pd.date_range(start=daily_crimes['date'].min(), end=daily_crimes['date'].max(), freq='D')
all_cities = daily_crimes['city'].unique()
idx = pd.MultiIndex.from_product([all_dates, all_cities], names=['date', 'city'])
daily_crimes = daily_crimes.set_index(['date', 'city']).reindex(idx, fill_value=0).reset_index()

# Merge Stats
daily_crimes['year'] = daily_crimes['date'].dt.year
daily_crimes = daily_crimes.merge(city_stats, on=['city', 'year'], how='left')

# Years with no stats for a city are filled from that SAME city's nearest
# known year: forward first, then backward for leading gaps. Both passes are
# grouped by city; an ungrouped bfill would copy values from whichever city
# happens to occupy the next row (rows are ordered by date, then city).
daily_crimes[stats_cols] = daily_crimes.groupby('city')[stats_cols].ffill()
daily_crimes[stats_cols] = daily_crimes.groupby('city')[stats_cols].bfill()

print(f"Data ready. Total rows: {len(daily_crimes)}")

In [None]:
def create_features(data):
    """Add lag, calendar and holiday features to a per-city daily crime table.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a datetime ``date`` column, a ``city`` column and a
        ``crime_count`` column. Lags are computed by shifting rows within each
        city, so they only mean "N days ago" when every city's rows are in
        chronological order with one row per day.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` (the input frame is not modified) with these
        columns added or overwritten:

        - ``lag_1`` / ``lag_7``: crime count 1 and 7 rows earlier in the city.
        - ``roll_mean_7``: mean crime count of the 7 rows *preceding* the
          current one in the same city.
        - ``day_of_week`` (Monday = 0) and ``month`` taken from ``date``.
        - ``is_holiday``: 1 if ``date`` is in ``holidays.US(state='CT')``,
          else 0.

        Rows without enough history hold NaN in the lag / rolling columns.
    """
    data = data.copy()
    counts_by_city = data.groupby('city')['crime_count']

    # Memory (Lags)
    data['lag_1'] = counts_by_city.shift(1)
    data['lag_7'] = counts_by_city.shift(7)
    # Shift by one row BEFORE rolling so the window covers only earlier days.
    # Without the shift the window contains the current day's crime_count,
    # i.e. the value being predicted, which leaks the target into a feature.
    data['roll_mean_7'] = counts_by_city.transform(lambda x: x.shift(1).rolling(7).mean())

    # Calendar
    data['day_of_week'] = data['date'].dt.dayofweek
    data['month'] = data['date'].dt.month

    # Holidays
    ct_holidays = holidays.US(state='CT')
    data['is_holiday'] = data['date'].apply(lambda x: 1 if x in ct_holidays else 0)

    return data

# Build the training table: add the engineered features, then discard every
# row that still has a missing value (lag_7 is undefined for the first 7 rows
# of each city).
df_train = daily_crimes.pipe(create_features).dropna()
print("Features created.")

In [None]:
print("Training Forecaster (LightGBM Regressor)...")

# Numeric model inputs, grouped by where they come from.
features = [
    # recent history of the target
    'lag_1',
    'lag_7',
    'roll_mean_7',
    # calendar
    'day_of_week',
    'month',
    'is_holiday',
    # yearly city statistics
    'population',
    'total_officers',
    'officers_per_1000_people',
    'crime_rate_per_1000_people',
]

# Design matrix: the numeric features plus the city, appended as the last
# column with pandas' category dtype. The target is the daily crime count.
X_reg = df_train[features].assign(city=df_train['city'].astype('category'))
y_reg = df_train['crime_count']

# Fit the regressor; `city` is declared as a categorical feature.
forecaster = lgb.LGBMRegressor(
    n_estimators=150,
    learning_rate=0.05,
    random_state=42,
    verbosity=-1,
)
forecaster.fit(X_reg, y_reg, categorical_feature=['city'])

print("Forecaster trained.")

In [None]:
print("Training Classifier (LightGBM Classifier)...")

# Inputs come from the incident-level frame; every input column is converted
# to pandas' category dtype (the conversion is done column by column).
X_clf = df[['city', 'location_area', 'hour', 'dayofweek', 'month']].astype('category')

# Target: offense category names mapped to integer class ids. The fitted
# encoder is kept in `le` so the ids can be mapped back to names later.
le = LabelEncoder()
y_clf = df['offense_category_name']
y_encoded = le.fit_transform(y_clf)

# Fit the classifier on the encoded labels.
classifier = lgb.LGBMClassifier(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
    verbosity=-1,
)
classifier.fit(X_clf, y_encoded)

print("Classifier trained.")

In [None]:
print("Saving models...")

# Persist both trained models and the fitted label encoder, one file each,
# relative to the current working directory.
joblib.dump(forecaster, 'crime_forecaster.pkl')
joblib.dump(classifier, 'crime_classifier.pkl')
joblib.dump(le, 'label_encoder.pkl')

# The check-mark emoji was stored as mojibake (UTF-8 bytes decoded as cp1252).
print("✅ All models saved! You can now run app.py.")