In [26]:
pip install xgboost scikit-learn pandas joblib

Note: you may need to restart the kernel to use updated packages.


In [65]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
import joblib

# Load crime data
crime_df = pd.read_csv("archive-1/crime_60_100.csv")

# Ensure the state column is named 'states' and uses full names
# Clean numeric columns
numeric_cols = ['violent_crime', 'murder', 'rape', 'robbery', 'population']
for col in numeric_cols:
    crime_df[col] = crime_df[col].astype(str).str.replace(',', '')
    crime_df[col] = pd.to_numeric(crime_df[col], errors='coerce')
crime_df = crime_df.dropna(subset=numeric_cols)

# Check if 'violent_crime' is a total to avoid double-counting
# Assuming 'violent_crime' is the total, use only that for crime rate
crime_df['crime_rate'] = (crime_df['violent_crime'] / crime_df['population']) * 1000

# State mapping
state_abbr_to_name = {
    'AL': 'Alabama', 'AK': 'Alaska', 'AZ': 'Arizona', 'AR': 'Arkansas',
    'CA': 'California', 'CO': 'Colorado', 'CT': 'Connecticut', 'DE': 'Delaware',
    'FL': 'Florida', 'GA': 'Georgia', 'HI': 'Hawaii', 'ID': 'Idaho',
    'IL': 'Illinois', 'IN': 'Indiana', 'IA': 'Iowa', 'KS': 'Kansas',
    'KY': 'Kentucky', 'LA': 'Louisiana', 'ME': 'Maine', 'MD': 'Maryland',
    'MA': 'Massachusetts', 'MI': 'Michigan', 'MN': 'Minnesota', 'MS': 'Mississippi',
    'MO': 'Missouri', 'MT': 'Montana', 'NE': 'Nebraska', 'NV': 'Nevada',
    'NH': 'New Hampshire', 'NJ': 'New Jersey', 'NM': 'New Mexico', 'NY': 'New York',
    'NC': 'North Carolina', 'ND': 'North Dakota', 'OH': 'Ohio', 'OK': 'Oklahoma',
    'OR': 'Oregon', 'PA': 'Pennsylvania', 'RI': 'Rhode Island', 'SC': 'South Carolina',
    'SD': 'South Dakota', 'TN': 'Tennessee', 'TX': 'Texas', 'UT': 'Utah', 'VT': 'Vermont',
    'VA': 'Virginia', 'WA': 'Washington', 'WV': 'West Virginia', 'WI': 'Wisconsin',
    'WY': 'Wyoming'
}


# Load ZIP data
zips = pd.read_csv("uszips.csv")[['zip', 'state_id', 'population']]
zips = zips.rename(columns={'zip': 'zip_code', 'state_id': 'state_abbr'})
zips['state_name'] = zips['state_abbr'].map(state_abbr_to_name)

merged = zips.merge(
    crime_df.rename(columns={'population': 'state_population'}),  # Rename conflict column
    left_on='state_name', 
    right_on='states', 
    how='left'
)

# Handle missing crime data using state average
state_avg = merged.groupby('state_abbr')['crime_rate'].transform('mean')
merged['crime_rate'] = merged['crime_rate'].fillna(state_avg)

# If no state average available, fall back to national average
merged['crime_rate'] = merged['crime_rate'].fillna(merged['crime_rate'].mean())

# Feature engineering - use ZIP's population (now called 'population' from zips DataFrame)
X = merged[['state_abbr', 'population']]  # Now correctly references zips population
y = merged['crime_rate']
# Preprocessing
preprocessor = ColumnTransformer([
    ('state', OneHotEncoder(handle_unknown='ignore'), ['state_abbr'])
], remainder='passthrough')

X_processed = preprocessor.fit_transform(X)

# Train model
model = XGBRegressor(n_estimators=100, max_depth=3, random_state=42)
model.fit(X_processed, y)

# Save artifacts
joblib.dump(model, 'crime_model.pkl')
joblib.dump(preprocessor, 'crime_preprocessor.pkl')  # Uncommented line

print("successful run the prediction cell.")

successful run the prediction cell.


In [63]:
def predict_crime_rate(zip_code):
    try:
        model = joblib.load('crime_model.pkl')
        preprocessor = joblib.load('crime_preprocessor.pkl')
        zip_data = zips[zips['zip_code'] == int(zip_code)].iloc[0]
        
        input_data = pd.DataFrame([{
            'state_abbr': zip_data['state_abbr'],
            'population': zip_data['population']
        }])
        
        processed = preprocessor.transform(input_data)
        rate = model.predict(processed)[0]
        
        print(f"ZIP {zip_code} crime rate: {rate:.2f} per 1000")
        return rate
        
    except IndexError:
        print(f"ZIP {zip_code} not found in database.")
        return None
    except Exception as e:
        print(f"Error: {str(e)}")
        return None

# Test prediction
predict_crime_rate(75105)

ZIP 75105 crime rate: 4.80 per 1000


4.8034096