In [1]:
# import pandas and upload a dataset to a dataframe

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Loading Data into df
df = pd.read_csv('cleanedcrimedata.csv')
df = df.drop(columns=['Unnamed: 0'])

# Convert 'Date Rptd' and 'DATE OCC' to datetime
df['Date Rptd'] = pd.to_datetime(df['Date Rptd'])
df['DATE OCC'] = pd.to_datetime(df['DATE OCC'])

print(df.dtypes)

DR_NO                    int64
Date Rptd       datetime64[ns]
DATE OCC        datetime64[ns]
TIME OCC                 int64
AREA                     int64
AREA NAME               object
Rpt Dist No              int64
Part 1-2                 int64
Crm Cd                   int64
Crm Cd Desc             object
Mocodes                 object
Vict Age                 int64
Vict Sex                object
Vict Descent            object
Premis Cd              float64
Premis Desc             object
Status                  object
Status Desc             object
Crm Cd 1               float64
LOCATION                object
LAT                    float64
LON                    float64
dtype: object


In [3]:
top_25_value_counts = df['Crm Cd Desc'].value_counts().head(25)

print(top_25_value_counts)

VEHICLE - STOLEN                                            102886
BATTERY - SIMPLE ASSAULT                                     74541
BURGLARY FROM VEHICLE                                        58578
THEFT OF IDENTITY                                            58518
BURGLARY                                                     57527
VANDALISM - FELONY ($400 & OVER, ALL CHURCH VANDALISMS)      57439
ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT               53208
THEFT PLAIN - PETTY ($950 & UNDER)                           48464
INTIMATE PARTNER - SIMPLE ASSAULT                            46650
THEFT FROM MOTOR VEHICLE - PETTY ($950 & UNDER)              36887
THEFT FROM MOTOR VEHICLE - GRAND ($950.01 AND OVER)          34042
ROBBERY                                                      31982
THEFT-GRAND ($950.01 & OVER)EXCPT,GUNS,FOWL,LIVESTK,PROD     31840
SHOPLIFTING - PETTY THEFT ($950 & UNDER)                     25817
VANDALISM - MISDEAMEANOR ($399 OR UNDER)                     2

In [4]:
# Only using values from 'THEFT OF IDENTITY'
df_other_crime = df[df['Crm Cd Desc'] == 'OTHER MISCELLANEOUS CRIME']

# Drop the 'Crm Cd Desc' column
df_other_crime

Unnamed: 0,DR_NO,Date Rptd,DATE OCC,TIME OCC,AREA,AREA NAME,Rpt Dist No,Part 1-2,Crm Cd,Crm Cd Desc,...,Vict Sex,Vict Descent,Premis Cd,Premis Desc,Status,Status Desc,Crm Cd 1,LOCATION,LAT,LON
144,231806653,2022-06-12,2020-03-04,800,18,Southeast,1832,2,946,OTHER MISCELLANEOUS CRIME,...,X,X,726.0,POLICE FACILITY,IC,Invest Cont,946.0,100 W 108TH ST,33.9401,-118.2761
201,220404852,2022-01-28,2020-12-27,1200,4,Hollenbeck,439,2,946,OTHER MISCELLANEOUS CRIME,...,M,H,501.0,SINGLE FAMILY DWELLING,IC,Invest Cont,946.0,2200 LILLYVALE AV,34.0711,-118.1643
216,231506571,2023-03-03,2020-07-15,1700,15,N Hollywood,1532,2,946,OTHER MISCELLANEOUS CRIME,...,F,W,502.0,"MULTI-UNIT DWELLING (APARTMENT, DUPLEX, ETC)",IC,Invest Cont,946.0,12300 EMELITA ST,34.1763,-118.4013
300,220319193,2022-10-27,2020-11-12,1500,3,Southwest,393,2,946,OTHER MISCELLANEOUS CRIME,...,M,B,501.0,SINGLE FAMILY DWELLING,IC,Invest Cont,946.0,4100 GARTHWAITE AV,34.0094,-118.3231
301,231110558,2023-06-27,2020-08-07,1400,11,Northeast,1149,2,946,OTHER MISCELLANEOUS CRIME,...,M,B,710.0,OTHER PREMISE,IC,Invest Cont,946.0,200 N AVE 55,34.1094,-118.1963
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
947197,241207782,2024-03-04,2024-03-02,2100,12,77th Street,1232,2,946,OTHER MISCELLANEOUS CRIME,...,M,W,501.0,SINGLE FAMILY DWELLING,IC,Invest Cont,946.0,5800 CRENSHAW BL,33.9890,-118.3308
947380,242004060,2024-01-03,2024-01-03,1525,20,Olympic,2062,2,946,OTHER MISCELLANEOUS CRIME,...,X,X,101.0,STREET,IC,Invest Cont,946.0,10TH ST,34.0532,-118.3145
947390,241500548,2024-02-02,2024-02-02,335,15,N Hollywood,1566,2,946,OTHER MISCELLANEOUS CRIME,...,X,X,101.0,STREET,IC,Invest Cont,946.0,MOORPARK ST,34.1504,-118.3704
947590,241906434,2024-03-04,2024-01-08,2300,19,Mission,1982,2,946,OTHER MISCELLANEOUS CRIME,...,F,H,502.0,"MULTI-UNIT DWELLING (APARTMENT, DUPLEX, ETC)",IC,Invest Cont,946.0,15100 PARTHENIA ST,34.2301,-118.4633


In [5]:
# Assuming df_other_crime is your DataFrame
df_other_crime_cleaned = df_other_crime.dropna()

# Print the cleaned DataFrame (optional)
print(df_other_crime_cleaned)

            DR_NO  Date Rptd   DATE OCC  TIME OCC  AREA    AREA NAME  \
144     231806653 2022-06-12 2020-03-04       800    18    Southeast   
201     220404852 2022-01-28 2020-12-27      1200     4   Hollenbeck   
216     231506571 2023-03-03 2020-07-15      1700    15  N Hollywood   
300     220319193 2022-10-27 2020-11-12      1500     3    Southwest   
478     230618170 2023-11-20 2020-01-17      1500     6    Hollywood   
...           ...        ...        ...       ...   ...          ...   
947172  241005225 2024-02-02 2024-02-02      1800    10  West Valley   
947380  242004060 2024-01-03 2024-01-03      1525    20      Olympic   
947390  241500548 2024-02-02 2024-02-02       335    15  N Hollywood   
947590  241906434 2024-03-04 2024-01-08      2300    19      Mission   
947651  241605322 2024-02-13 2024-01-10      1600    16     Foothill   

        Rpt Dist No  Part 1-2  Crm Cd                Crm Cd Desc  ...  \
144            1832         2     946  OTHER MISCELLANEOUS CRI

In [6]:
### LINEAR REGRESSION TEST


from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Assuming df_other_crime_cleaned is your DataFrame
# Selecting features and target variables
features = ['DR_NO', 'TIME OCC', 'AREA', 'AREA NAME', 'Rpt Dist No', 'Part 1-2', 'Crm Cd', 'Crm Cd Desc', 'Mocodes', 
            'Vict Age', 'Vict Sex', 'Vict Descent', 'Premis Cd', 'Premis Desc', 'Status', 'Status Desc', 'Crm Cd 1', 'LOCATION']

X = df_other_crime_cleaned[features]
y_lat = df_other_crime_cleaned['LAT']
y_lon = df_other_crime_cleaned['LON']

# Splitting the data into training and testing sets (80:20 split)
X_train, X_test, y_lat_train, y_lat_test, y_lon_train, y_lon_test = train_test_split(X, y_lat, y_lon, test_size=0.2, random_state=42)

# Preprocessing: One-hot encoding for categorical variables to make things easier!
categorical_features = ['AREA NAME', 'Crm Cd Desc', 'Mocodes', 'Vict Sex', 'Vict Descent', 'Premis Desc', 'Status', 'Status Desc', 'LOCATION']
numeric_features = ['DR_NO', 'TIME OCC', 'AREA', 'Rpt Dist No', 'Part 1-2', 'Crm Cd', 'Vict Age', 'Premis Cd', 'Crm Cd 1']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Creating the linear regression pipeline
model_lat_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                     ('regressor', LinearRegression())])

model_lon_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                     ('regressor', LinearRegression())])

# Training the linear regression model for LAT
model_lat_pipeline.fit(X_train, y_lat_train)

# Training the linear regression model for LON
model_lon_pipeline.fit(X_train, y_lon_train)

# Predicting and evaluating for LAT
y_lat_train_pred = model_lat_pipeline.predict(X_train)
y_lat_test_pred = model_lat_pipeline.predict(X_test)

r2_lat_train = r2_score(y_lat_train, y_lat_train_pred)
r2_lat_test = r2_score(y_lat_test, y_lat_test_pred)
mse_lat_train = mean_squared_error(y_lat_train, y_lat_train_pred)
mse_lat_test = mean_squared_error(y_lat_test, y_lat_test_pred)

# Predicting and evaluating for LON
y_lon_train_pred = model_lon_pipeline.predict(X_train)
y_lon_test_pred = model_lon_pipeline.predict(X_test)

r2_lon_train = r2_score(y_lon_train, y_lon_train_pred)
r2_lon_test = r2_score(y_lon_test, y_lon_test_pred)
mse_lon_train = mean_squared_error(y_lon_train, y_lon_train_pred)
mse_lon_test = mean_squared_error(y_lon_test, y_lon_test_pred)

# Printing the results
print("LAT Model:")
print(f"Training R²: {r2_lat_train:.4f}, Training MSE: {mse_lat_train:.4f}")
print(f"Testing R²: {r2_lat_test:.4f}, Testing MSE: {mse_lat_test:.4f}")

print("\nLON Model:")
print(f"Training R²: {r2_lon_train:.4f}, Training MSE: {mse_lon_train:.4f}")
print(f"Testing R²: {r2_lon_test:.4f}, Testing MSE: {mse_lon_test:.4f}")


LAT Model:
Training R²: 0.0104, Training MSE: 13.7739
Testing R²: 0.0016, Testing MSE: 10.0516

LON Model:
Training R²: 0.0102, Training MSE: 166.0660
Testing R²: 0.0017, Testing MSE: 121.1432


In [7]:
# POLYNOMIAL REGRESSION TEST (2nd degree)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

# Selecting features and target variables
features = ['DR_NO', 'TIME OCC', 'AREA', 'AREA NAME', 'Rpt Dist No', 'Part 1-2', 'Crm Cd', 'Crm Cd Desc', 'Mocodes', 
            'Vict Age', 'Vict Sex', 'Vict Descent', 'Premis Cd', 'Premis Desc', 'Status', 'Status Desc', 'LOCATION']
features = [feature for feature in features if feature in df_other_crime_cleaned.columns]

X = df_other_crime_cleaned[features]
y_lat = df_other_crime_cleaned['LAT']
y_lon = df_other_crime_cleaned['LON']

# Splitting the data into training and testing sets (80:20 split)
X_train, X_test, y_lat_train, y_lat_test, y_lon_train, y_lon_test = train_test_split(X, y_lat, y_lon, test_size=0.2, random_state=42)

# Preprocessing: One-hot encoding for categorical variables
categorical_features = ['AREA NAME', 'Crm Cd Desc', 'Mocodes', 'Vict Sex', 'Vict Descent', 'Premis Desc', 'Status', 'Status Desc', 'LOCATION']
categorical_features = [feature for feature in categorical_features if feature in X_train.columns]

numeric_features = ['DR_NO', 'TIME OCC', 'AREA', 'Rpt Dist No', 'Part 1-2', 'Crm Cd', 'Vict Age', 'Premis Cd']
numeric_features = [feature for feature in numeric_features if feature in X_train.columns]

preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Polynomial features
polynomial_features = PolynomialFeatures(degree=2, include_bias=False)

# Creating the polynomial regression pipeline
model_lat_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('polynomial', polynomial_features),
    ('regressor', LinearRegression())
])

model_lon_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('polynomial', polynomial_features),
    ('regressor', LinearRegression())
])

# Training the polynomial regression model for LAT
model_lat_pipeline.fit(X_train, y_lat_train)

# Training the polynomial regression model for LON
model_lon_pipeline.fit(X_train, y_lon_train)

# Predicting and evaluating for LAT
y_lat_train_pred = model_lat_pipeline.predict(X_train)
y_lat_test_pred = model_lat_pipeline.predict(X_test)

r2_lat_train = r2_score(y_lat_train, y_lat_train_pred)
r2_lat_test = r2_score(y_lat_test, y_lat_test_pred)
mse_lat_train = mean_squared_error(y_lat_train, y_lat_train_pred)
mse_lat_test = mean_squared_error(y_lat_test, y_lat_test_pred)

# Predicting and evaluating for LON
y_lon_train_pred = model_lon_pipeline.predict(X_train)
y_lon_test_pred = model_lon_pipeline.predict(X_test)

r2_lon_train = r2_score(y_lon_train, y_lon_train_pred)
r2_lon_test = r2_score(y_lon_test, y_lon_test_pred)
mse_lon_train = mean_squared_error(y_lon_train, y_lon_train_pred)
mse_lon_test = mean_squared_error(y_lon_test, y_lon_test_pred)

# Printing the results
print("LAT Model:")
print(f"Training R²: {r2_lat_train:.4f}, Training MSE: {mse_lat_train:.4f}")
print(f"Testing R²: {r2_lat_test:.4f}, Testing MSE: {mse_lat_test:.4f}")

print("\nLON Model:")
print(f"Training R²: {r2_lon_train:.4f}, Training MSE: {mse_lon_train:.4f}")
print(f"Testing R²: {r2_lon_test:.4f}, Testing MSE: {mse_lon_test:.4f}")


LAT Model:
Training R²: 0.0103, Training MSE: 13.7749
Testing R²: 0.0018, Testing MSE: 10.0503

LON Model:
Training R²: 0.0101, Training MSE: 166.0785
Testing R²: 0.0018, Testing MSE: 121.1280


In [8]:
# POLYNOMIAL REGRESSION TEST (5th degree)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

# Selecting features and target variables
features = ['DR_NO', 'TIME OCC', 'AREA', 'AREA NAME', 'Rpt Dist No', 'Part 1-2', 'Crm Cd', 'Crm Cd Desc', 'Mocodes', 
            'Vict Age', 'Vict Sex', 'Vict Descent', 'Premis Cd', 'Premis Desc', 'Status', 'Status Desc', 'LOCATION']
features = [feature for feature in features if feature in df_other_crime_cleaned.columns]

X = df_other_crime_cleaned[features]
y_lat = df_other_crime_cleaned['LAT']
y_lon = df_other_crime_cleaned['LON']

# Splitting the data into training and testing sets (80:20 split)!
X_train, X_test, y_lat_train, y_lat_test, y_lon_train, y_lon_test = train_test_split(X, y_lat, y_lon, test_size=0.2, random_state=42)

# Preprocessing: One-hot encoding for categorical variables
categorical_features = ['AREA NAME', 'Crm Cd Desc', 'Mocodes', 'Vict Sex', 'Vict Descent', 'Premis Desc', 'Status', 'Status Desc', 'LOCATION']
categorical_features = [feature for feature in categorical_features if feature in X_train.columns]

numeric_features = ['DR_NO', 'TIME OCC', 'AREA', 'Rpt Dist No', 'Part 1-2', 'Crm Cd', 'Vict Age', 'Premis Cd']
numeric_features = [feature for feature in numeric_features if feature in X_train.columns]

preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Polynomial features
polynomial_features = PolynomialFeatures(degree=5, include_bias=False)

# Creating the polynomial regression pipeline
model_lat_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('polynomial', polynomial_features),
    ('regressor', LinearRegression())
])

model_lon_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('polynomial', polynomial_features),
    ('regressor', LinearRegression())
])

# Training the polynomial regression model for LAT
model_lat_pipeline.fit(X_train, y_lat_train)

# Training the polynomial regression model for LON
model_lon_pipeline.fit(X_train, y_lon_train)

# Predicting and evaluating for LAT
y_lat_train_pred = model_lat_pipeline.predict(X_train)
y_lat_test_pred = model_lat_pipeline.predict(X_test)

r2_lat_train = r2_score(y_lat_train, y_lat_train_pred)
r2_lat_test = r2_score(y_lat_test, y_lat_test_pred)
mse_lat_train = mean_squared_error(y_lat_train, y_lat_train_pred)
mse_lat_test = mean_squared_error(y_lat_test, y_lat_test_pred)

# Predicting and evaluating for LON
y_lon_train_pred = model_lon_pipeline.predict(X_train)
y_lon_test_pred = model_lon_pipeline.predict(X_test)

r2_lon_train = r2_score(y_lon_train, y_lon_train_pred)
r2_lon_test = r2_score(y_lon_test, y_lon_test_pred)
mse_lon_train = mean_squared_error(y_lon_train, y_lon_train_pred)
mse_lon_test = mean_squared_error(y_lon_test, y_lon_test_pred)

# Printing the results
print("LAT Model:")
print(f"Training R²: {r2_lat_train:.4f}, Training MSE: {mse_lat_train:.4f}")
print(f"Testing R²: {r2_lat_test:.4f}, Testing MSE: {mse_lat_test:.4f}")

print("\nLON Model:")
print(f"Training R²: {r2_lon_train:.4f}, Training MSE: {mse_lon_train:.4f}")
print(f"Testing R²: {r2_lon_test:.4f}, Testing MSE: {mse_lon_test:.4f}")


KeyboardInterrupt: 

In [9]:
# Gradient Boosting Regressor

from sklearn.ensemble import GradientBoostingRegressor

# Selecting features and target variables
features = ['DR_NO', 'TIME OCC', 'AREA', 'AREA NAME', 'Rpt Dist No', 'Part 1-2', 'Crm Cd', 'Crm Cd Desc', 'Mocodes', 
            'Vict Age', 'Vict Sex', 'Vict Descent', 'Premis Cd', 'Premis Desc', 'Status', 'Status Desc', 'LOCATION']

# Ensure we only use existing columns
features = [feature for feature in features if feature in df_other_crime_cleaned.columns]

X = df_other_crime_cleaned[features]
y_lat = df_other_crime_cleaned['LAT']
y_lon = df_other_crime_cleaned['LON']

# Splitting the data into training and testing sets (80:20 split)
X_train, X_test, y_lat_train, y_lat_test, y_lon_train, y_lon_test = train_test_split(X, y_lat, y_lon, test_size=0.2, random_state=42)

# Preprocessing: One-hot encoding for categorical variables
categorical_features = ['AREA NAME', 'Crm Cd Desc', 'Mocodes', 'Vict Sex', 'Vict Descent', 'Premis Desc', 'Status', 'Status Desc', 'LOCATION']
categorical_features = [feature for feature in categorical_features if feature in X_train.columns]

numeric_features = ['DR_NO', 'TIME OCC', 'AREA', 'Rpt Dist No', 'Part 1-2', 'Crm Cd', 'Vict Age', 'Premis Cd']
numeric_features = [feature for feature in numeric_features if feature in X_train.columns]

preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Creating the gradient boosting regression pipeline
model_lat_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor())
])

model_lon_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor())
])

# Training the gradient boosting regression model for LAT
model_lat_pipeline.fit(X_train, y_lat_train)

# Training the gradient boosting regression model for LON
model_lon_pipeline.fit(X_train, y_lon_train)

# Predicting and evaluating for LAT
y_lat_train_pred = model_lat_pipeline.predict(X_train)
y_lat_test_pred = model_lat_pipeline.predict(X_test)

r2_lat_train = r2_score(y_lat_train, y_lat_train_pred)
r2_lat_test = r2_score(y_lat_test, y_lat_test_pred)
mse_lat_train = mean_squared_error(y_lat_train, y_lat_train_pred)
mse_lat_test = mean_squared_error(y_lat_test, y_lat_test_pred)

# Predicting and evaluating for LON
y_lon_train_pred = model_lon_pipeline.predict(X_train)
y_lon_test_pred = model_lon_pipeline.predict(X_test)

r2_lon_train = r2_score(y_lon_train, y_lon_train_pred)
r2_lon_test = r2_score(y_lon_test, y_lon_test_pred)
mse_lon_train = mean_squared_error(y_lon_train, y_lon_train_pred)
mse_lon_test = mean_squared_error(y_lon_test, y_lon_test_pred)

# Printing the results
print("LAT Model:")
print(f"Training R²: {r2_lat_train:.4f}, Training MSE: {mse_lat_train:.4f}")
print(f"Testing R²: {r2_lat_test:.4f}, Testing MSE: {mse_lat_test:.4f}")

print("\nLON Model:")
print(f"Training R²: {r2_lon_train:.4f}, Training MSE: {mse_lon_train:.4f}")
print(f"Testing R²: {r2_lon_test:.4f}, Testing MSE: {mse_lon_test:.4f}")

LAT Model:
Training R²: 0.5740, Training MSE: 5.9293
Testing R²: -0.1270, Testing MSE: 11.3467

LON Model:
Training R²: 0.5773, Training MSE: 70.9270
Testing R²: -0.1264, Testing MSE: 136.6768


In [10]:
# Random Forest Regressor


from sklearn.ensemble import RandomForestRegressor

# Selecting features and target variables
features = ['DR_NO', 'TIME OCC', 'AREA', 'AREA NAME', 'Rpt Dist No', 'Part 1-2', 'Crm Cd', 'Crm Cd Desc', 'Mocodes', 
            'Vict Age', 'Vict Sex', 'Vict Descent', 'Premis Cd', 'Premis Desc', 'Status', 'Status Desc', 'LOCATION']

# Ensure we only use existing columns
features = [feature for feature in features if feature in df_other_crime_cleaned.columns]

X = df_other_crime_cleaned[features]
y_lat = df_other_crime_cleaned['LAT']
y_lon = df_other_crime_cleaned['LON']

# Splitting the data into training and testing sets (80:20 split)
X_train, X_test, y_lat_train, y_lat_test, y_lon_train, y_lon_test = train_test_split(X, y_lat, y_lon, test_size=0.2, random_state=42)

# Preprocessing: One-hot encoding for categorical variables
categorical_features = ['AREA NAME', 'Crm Cd Desc', 'Mocodes', 'Vict Sex', 'Vict Descent', 'Premis Desc', 'Status', 'Status Desc', 'LOCATION']
categorical_features = [feature for feature in categorical_features if feature in X_train.columns]

numeric_features = ['DR_NO', 'TIME OCC', 'AREA', 'Rpt Dist No', 'Part 1-2', 'Crm Cd', 'Vict Age', 'Premis Cd']
numeric_features = [feature for feature in numeric_features if feature in X_train.columns]

preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Creating the random forest regression pipeline
model_lat_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

model_lon_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

# Training the random forest regression model for LAT
model_lat_pipeline.fit(X_train, y_lat_train)

# Training the random forest regression model for LON
model_lon_pipeline.fit(X_train, y_lon_train)

# Predicting and evaluating for LAT
y_lat_train_pred = model_lat_pipeline.predict(X_train)
y_lat_test_pred = model_lat_pipeline.predict(X_test)

r2_lat_train = r2_score(y_lat_train, y_lat_train_pred)
r2_lat_test = r2_score(y_lat_test, y_lat_test_pred)
mse_lat_train = mean_squared_error(y_lat_train, y_lat_train_pred)
mse_lat_test = mean_squared_error(y_lat_test, y_lat_test_pred)

# Predicting and evaluating for LON
y_lon_train_pred = model_lon_pipeline.predict(X_train)
y_lon_test_pred = model_lon_pipeline.predict(X_test)

r2_lon_train = r2_score(y_lon_train, y_lon_train_pred)
r2_lon_test = r2_score(y_lon_test, y_lon_test_pred)
mse_lon_train = mean_squared_error(y_lon_train, y_lon_train_pred)
mse_lon_test = mean_squared_error(y_lon_test, y_lon_test_pred)

# Printing the results
print("LAT Model:")
print(f"Training R²: {r2_lat_train:.4f}, Training MSE: {mse_lat_train:.4f}")
print(f"Testing R²: {r2_lat_test:.4f}, Testing MSE: {mse_lat_test:.4f}")

print("\nLON Model:")
print(f"Training R²: {r2_lon_train:.4f}, Training MSE: {mse_lon_train:.4f}")
print(f"Testing R²: {r2_lon_test:.4f}, Testing MSE: {mse_lon_test:.4f}")

LAT Model:
Training R²: 0.8428, Training MSE: 2.1887
Testing R²: -0.1605, Testing MSE: 11.6837

LON Model:
Training R²: 0.8482, Training MSE: 25.4615
Testing R²: -0.1412, Testing MSE: 138.4784


In [11]:
# Decision Tree Regressor


from sklearn.tree import DecisionTreeRegressor

# Selecting features and target variables
features = ['DR_NO', 'TIME OCC', 'AREA', 'AREA NAME', 'Rpt Dist No', 'Part 1-2', 'Crm Cd', 'Crm Cd Desc', 'Mocodes', 
            'Vict Age', 'Vict Sex', 'Vict Descent', 'Premis Cd', 'Premis Desc', 'Status', 'Status Desc', 'LOCATION']

# Ensure we only use existing columns
features = [feature for feature in features if feature in df_other_crime_cleaned.columns]

X = df_other_crime_cleaned[features]
y_lat = df_other_crime_cleaned['LAT']
y_lon = df_other_crime_cleaned['LON']

# Splitting the data into training and testing sets (80:20 split)
X_train, X_test, y_lat_train, y_lat_test, y_lon_train, y_lon_test = train_test_split(X, y_lat, y_lon, test_size=0.2, random_state=42)

# Preprocessing: One-hot encoding for categorical variables
categorical_features = ['AREA NAME', 'Crm Cd Desc', 'Mocodes', 'Vict Sex', 'Vict Descent', 'Premis Desc', 'Status', 'Status Desc', 'LOCATION']
categorical_features = [feature for feature in categorical_features if feature in X_train.columns]

numeric_features = ['DR_NO', 'TIME OCC', 'AREA', 'Rpt Dist No', 'Part 1-2', 'Crm Cd', 'Vict Age', 'Premis Cd']
numeric_features = [feature for feature in numeric_features if feature in X_train.columns]

preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Creating the decision tree regression pipeline
model_lat_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', DecisionTreeRegressor())
])

model_lon_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', DecisionTreeRegressor())
])

# Training the decision tree regression model for LAT
model_lat_pipeline.fit(X_train, y_lat_train)

# Training the decision tree regression model for LON
model_lon_pipeline.fit(X_train, y_lon_train)

# Predicting and evaluating for LAT
y_lat_train_pred = model_lat_pipeline.predict(X_train)
y_lat_test_pred = model_lat_pipeline.predict(X_test)

r2_lat_train = r2_score(y_lat_train, y_lat_train_pred)
r2_lat_test = r2_score(y_lat_test, y_lat_test_pred)
mse_lat_train = mean_squared_error(y_lat_train, y_lat_train_pred)
mse_lat_test = mean_squared_error(y_lat_test, y_lat_test_pred)

# Predicting and evaluating for LON
y_lon_train_pred = model_lon_pipeline.predict(X_train)
y_lon_test_pred = model_lon_pipeline.predict(X_test)

r2_lon_train = r2_score(y_lon_train, y_lon_train_pred)
r2_lon_test = r2_score(y_lon_test, y_lon_test_pred)
mse_lon_train = mean_squared_error(y_lon_train, y_lon_train_pred)
mse_lon_test = mean_squared_error(y_lon_test, y_lon_test_pred)

# Printing the results
print("LAT Model:")
print(f"Training R²: {r2_lat_train:.4f}, Training MSE: {mse_lat_train:.4f}")
print(f"Testing R²: {r2_lat_test:.4f}, Testing MSE: {mse_lat_test:.4f}")

print("\nLON Model:")
print(f"Training R²: {r2_lon_train:.4f}, Training MSE: {mse_lon_train:.4f}")
print(f"Testing R²: {r2_lon_test:.4f}, Testing MSE: {mse_lon_test:.4f}")


LAT Model:
Training R²: 1.0000, Training MSE: 0.0000
Testing R²: -0.9199, Testing MSE: 19.3293

LON Model:
Training R²: 1.0000, Training MSE: 0.0000
Testing R²: -1.0162, Testing MSE: 244.6556
