In [1]:
# Import the libraries used throughout the notebook (the dataset itself is loaded in the next cell)

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

In [2]:
# Read the cleaned crime CSV and discard the columns this notebook does not use
columns_to_discard = [
    'Unnamed: 0',
    'Date Rptd',
    'DATE OCC',
    'Mocodes',
    'Vict Sex',
    'Vict Descent',
]
df = pd.read_csv('cleanedcrimedata.csv').drop(columns=columns_to_discard)

# Show the dtype of every remaining column
print(df.dtypes)

DR_NO            int64
TIME OCC         int64
AREA             int64
AREA NAME       object
Rpt Dist No      int64
Part 1-2         int64
Crm Cd           int64
Crm Cd Desc     object
Vict Age         int64
Premis Cd      float64
Premis Desc     object
Status          object
Status Desc     object
Crm Cd 1       float64
LOCATION        object
LAT            float64
LON            float64
dtype: object


In [3]:
# Count how often each crime description occurs (value_counts orders the
# result from most to least frequent), then keep the first 25 entries.
crime_desc_counts = df['Crm Cd Desc'].value_counts()
top_25_value_counts = crime_desc_counts.iloc[:25]

print(top_25_value_counts)

VEHICLE - STOLEN                                            102886
BATTERY - SIMPLE ASSAULT                                     74541
BURGLARY FROM VEHICLE                                        58578
THEFT OF IDENTITY                                            58518
BURGLARY                                                     57527
VANDALISM - FELONY ($400 & OVER, ALL CHURCH VANDALISMS)      57439
ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT               53208
THEFT PLAIN - PETTY ($950 & UNDER)                           48464
INTIMATE PARTNER - SIMPLE ASSAULT                            46650
THEFT FROM MOTOR VEHICLE - PETTY ($950 & UNDER)              36887
THEFT FROM MOTOR VEHICLE - GRAND ($950.01 AND OVER)          34042
ROBBERY                                                      31982
THEFT-GRAND ($950.01 & OVER)EXCPT,GUNS,FOWL,LIVESTK,PROD     31840
SHOPLIFTING - PETTY THEFT ($950 & UNDER)                     25817
VANDALISM - MISDEAMEANOR ($399 OR UNDER)                     2

In [4]:
# Keep only the rows whose description is exactly the aggravated-assault label
is_assault = df['Crm Cd Desc'] == 'ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT'
df_assault = df.loc[is_assault]

# Report the number of missing values per column before any rows are removed
nan_counts = df_assault.isnull().sum()
print(nan_counts)

# Remove every row that has a missing value, then drop 'Crm Cd Desc', which
# holds the same value in every remaining row after the filter above.
df_assault = df_assault.dropna().drop(columns=['Crm Cd Desc'])

# Show the cleaned DataFrame
print(df_assault)

DR_NO          0
TIME OCC       0
AREA           0
AREA NAME      0
Rpt Dist No    0
Part 1-2       0
Crm Cd         0
Crm Cd Desc    0
Vict Age       0
Premis Cd      2
Premis Desc    2
Status         0
Status Desc    0
Crm Cd 1       0
LOCATION       0
LAT            0
LON            0
dtype: int64


Unnamed: 0,DR_NO,TIME OCC,AREA,AREA NAME,Rpt Dist No,Part 1-2,Crm Cd,Crm Cd Desc,Vict Age,Premis Cd,Premis Desc,Status,Status Desc,Crm Cd 1,LOCATION,LAT,LON
0,190326475,2130,7,Wilshire,784,1,510,VEHICLE - STOLEN,0,101.0,STREET,AA,Adult Arrest,510.0,1900 S LONGWOOD AV,34.0375,-118.3506
13,221008844,130,10,West Valley,1029,1,510,VEHICLE - STOLEN,0,101.0,STREET,IC,Invest Cont,510.0,VALJEAN ST,34.1939,-118.4859
24,200412582,630,4,Hollenbeck,413,1,510,VEHICLE - STOLEN,0,101.0,STREET,IC,Invest Cont,510.0,200 E AVENUE 28,34.0820,-118.2130
81,201810154,1900,18,Southeast,1802,1,510,VEHICLE - STOLEN,0,101.0,STREET,IC,Invest Cont,510.0,90TH,33.9547,-118.2717
94,231510293,200,15,N Hollywood,1504,1,510,VEHICLE - STOLEN,0,101.0,STREET,IC,Invest Cont,510.0,7500 LAUREL CANYON BL,34.2071,-118.3965
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
947744,241509353,1700,15,N Hollywood,1596,1,510,VEHICLE - STOLEN,0,101.0,STREET,IC,Invest Cont,510.0,3700 BERRY DR,34.1346,-118.3759
947749,240310624,430,3,Southwest,315,1,510,VEHICLE - STOLEN,0,101.0,STREET,IC,Invest Cont,510.0,2200 W 24TH ST,34.0349,-118.3148
947757,241404089,345,14,Pacific,1427,1,510,VEHICLE - STOLEN,0,101.0,STREET,IC,Invest Cont,510.0,3400 S SEPULVEDA BL,34.0212,-118.4238
947758,241304056,2100,13,Newton,1347,1,510,VEHICLE - STOLEN,0,101.0,STREET,IC,Invest Cont,510.0,41ST,34.0072,-118.2432


In [5]:
# Linear Regression
#
# Fits one LinearRegression model for LAT and one for LON on a sample of up to
# 10,000 rows of df_assault, repeats this over 5 different train/test splits,
# and prints the per-run and averaged R^2 / MSE for the train and test parts.

# 10,000 Values Samples
# The sample is stored under its own name instead of overwriting df_assault,
# so re-running this cell always starts from the same cleaned frame and the
# cell no longer changes what later cells see in df_assault.
# min() avoids a ValueError from sample() if fewer than 10,000 rows exist.
assault_sample = df_assault.sample(n=min(10000, len(df_assault)), random_state=42)

# Select features and target variables
X = assault_sample.drop(columns=['LAT', 'LON'])
y_lat = assault_sample['LAT']
y_lon = assault_sample['LON']

# Convert categorical variables to dummy variables
# NOTE(review): X still includes DR_NO and the text column LOCATION. DR_NO
# looks like a per-report identifier and LOCATION like a street address, so
# one-hot encoding may create a very wide, mostly unique feature set. The
# saved output (train R^2 well above test R^2) is consistent with that.
# TODO confirm with the author whether these columns should be dropped.
X = pd.get_dummies(X, drop_first=True)

# Initialize lists to take averages easier later on
lat_train_r2_list = []
lat_test_r2_list = []
lat_train_mse_list = []
lat_test_mse_list = []

lon_train_r2_list = []
lon_test_r2_list = []
lon_train_mse_list = []
lon_test_mse_list = []

# Run the model 5 times
for i in range(5):
    # Split once, passing both targets, so LAT and LON are guaranteed to share
    # the same train/test rows (the run index doubles as the split seed).
    X_train, X_test, y_lat_train, y_lat_test, y_lon_train, y_lon_test = train_test_split(
        X, y_lat, y_lon, test_size=0.2, random_state=i
    )

    # Initialize and train the model for LAT
    model_lat = LinearRegression()
    model_lat.fit(X_train, y_lat_train)
    y_lat_pred_train = model_lat.predict(X_train)
    y_lat_pred_test = model_lat.predict(X_test)

    # Initialize and train the model for LON
    model_lon = LinearRegression()
    model_lon.fit(X_train, y_lon_train)
    y_lon_pred_train = model_lon.predict(X_train)
    y_lon_pred_test = model_lon.predict(X_test)

    # Evaluate the model for LAT
    lat_train_r2 = r2_score(y_lat_train, y_lat_pred_train)
    lat_test_r2 = r2_score(y_lat_test, y_lat_pred_test)
    lat_train_mse = mean_squared_error(y_lat_train, y_lat_pred_train)
    lat_test_mse = mean_squared_error(y_lat_test, y_lat_pred_test)

    # Evaluate the model for LON
    lon_train_r2 = r2_score(y_lon_train, y_lon_pred_train)
    lon_test_r2 = r2_score(y_lon_test, y_lon_pred_test)
    lon_train_mse = mean_squared_error(y_lon_train, y_lon_pred_train)
    lon_test_mse = mean_squared_error(y_lon_test, y_lon_pred_test)

    # Append the metrics to the lists
    lat_train_r2_list.append(lat_train_r2)
    lat_test_r2_list.append(lat_test_r2)
    lat_train_mse_list.append(lat_train_mse)
    lat_test_mse_list.append(lat_test_mse)

    lon_train_r2_list.append(lon_train_r2)
    lon_test_r2_list.append(lon_test_r2)
    lon_train_mse_list.append(lon_train_mse)
    lon_test_mse_list.append(lon_test_mse)

    # Print the results
    print(f"Run {i+1}")
    print("LAT:")
    print(f"  Train R^2: {lat_train_r2:.3f}")
    print(f"  Test R^2: {lat_test_r2:.3f}")
    print(f"  Train MSE: {lat_train_mse:.3f}")
    print(f"  Test MSE: {lat_test_mse:.3f}")

    print("LON:")
    print(f"  Train R^2: {lon_train_r2:.3f}")
    print(f"  Test R^2: {lon_test_r2:.3f}")
    print(f"  Train MSE: {lon_train_mse:.3f}")
    print(f"  Test MSE: {lon_test_mse:.3f}")
    print("-" * 40)

# Calculate and print averages of 5 runs
average_lat_train_r2 = np.mean(lat_train_r2_list)
average_lat_test_r2 = np.mean(lat_test_r2_list)
average_lat_train_mse = np.mean(lat_train_mse_list)
average_lat_test_mse = np.mean(lat_test_mse_list)

average_lon_train_r2 = np.mean(lon_train_r2_list)
average_lon_test_r2 = np.mean(lon_test_r2_list)
average_lon_train_mse = np.mean(lon_train_mse_list)
average_lon_test_mse = np.mean(lon_test_mse_list)

print("Average Results")
print("LAT:")
print(f"  Train R^2: {average_lat_train_r2:.3f}")
print(f"  Test R^2: {average_lat_test_r2:.3f}")
print(f"  Train MSE: {average_lat_train_mse:.3f}")
print(f"  Test MSE: {average_lat_test_mse:.3f}")

print("LON:")
print(f"  Train R^2: {average_lon_train_r2:.3f}")
print(f"  Test R^2: {average_lon_test_r2:.3f}")
print(f"  Train MSE: {average_lon_train_mse:.3f}")
print(f"  Test MSE: {average_lon_test_mse:.3f}")


LAT:
  Train R^2: 0.7384995652039659
  Test R^2: -0.08795125014351557
  Train MSE: 0.19318852110347748
  Test MSE: 1.2762780257373831
LON:
  Train R^2: 0.7333726380669752
  Test R^2: -0.10685590401031186
  Train MSE: 2.335045787513088
  Test MSE: 15.49781567621771


In [8]:
# Random Forest Regressor
#
# Fits one RandomForestRegressor for LAT and one for LON on a sample of up to
# 10,000 rows of df_assault, repeats this over 5 different train/test splits,
# and prints the per-run and averaged R^2 / MSE for the train and test parts.

# 10,000 Values Samples
# The sample is stored under its own name instead of overwriting df_assault,
# so this cell is idempotent and does not change the frame other cells read.
# min() avoids a ValueError from sample() if fewer than 10,000 rows exist.
assault_sample = df_assault.sample(n=min(10000, len(df_assault)), random_state=42)

# Select features and target variables
X = assault_sample.drop(columns=['LAT', 'LON'])
y_lat = assault_sample['LAT']
y_lon = assault_sample['LON']

# Convert categorical variables to dummy variables
# NOTE(review): X still includes DR_NO and the text column LOCATION. DR_NO
# looks like a per-report identifier and LOCATION like a street address, so
# one-hot encoding may create a very wide, mostly unique feature set. The
# saved output (train R^2 well above test R^2) is consistent with that.
# TODO confirm with the author whether these columns should be dropped.
X = pd.get_dummies(X, drop_first=True)

# Initialize lists to take averages easier later on
lat_train_r2_list = []
lat_test_r2_list = []
lat_train_mse_list = []
lat_test_mse_list = []

lon_train_r2_list = []
lon_test_r2_list = []
lon_train_mse_list = []
lon_test_mse_list = []

# Run the model 5 times
for i in range(5):
    # Split once, passing both targets, so LAT and LON are guaranteed to share
    # the same train/test rows (the run index doubles as the split seed).
    X_train, X_test, y_lat_train, y_lat_test, y_lon_train, y_lon_test = train_test_split(
        X, y_lat, y_lon, test_size=0.2, random_state=i
    )

    # Initialize and train the model for LAT using Random Forest Regressor
    # (the run index also seeds the forest, so each run is repeatable)
    model_lat = RandomForestRegressor(random_state=i)
    model_lat.fit(X_train, y_lat_train)
    y_lat_pred_train = model_lat.predict(X_train)
    y_lat_pred_test = model_lat.predict(X_test)

    # Initialize and train the model for LON using Random Forest Regressor
    model_lon = RandomForestRegressor(random_state=i)
    model_lon.fit(X_train, y_lon_train)
    y_lon_pred_train = model_lon.predict(X_train)
    y_lon_pred_test = model_lon.predict(X_test)

    # Evaluate the model for LAT
    lat_train_r2 = r2_score(y_lat_train, y_lat_pred_train)
    lat_test_r2 = r2_score(y_lat_test, y_lat_pred_test)
    lat_train_mse = mean_squared_error(y_lat_train, y_lat_pred_train)
    lat_test_mse = mean_squared_error(y_lat_test, y_lat_pred_test)

    # Evaluate the model for LON
    lon_train_r2 = r2_score(y_lon_train, y_lon_pred_train)
    lon_test_r2 = r2_score(y_lon_test, y_lon_pred_test)
    lon_train_mse = mean_squared_error(y_lon_train, y_lon_pred_train)
    lon_test_mse = mean_squared_error(y_lon_test, y_lon_pred_test)

    # Append the metrics to the lists
    lat_train_r2_list.append(lat_train_r2)
    lat_test_r2_list.append(lat_test_r2)
    lat_train_mse_list.append(lat_train_mse)
    lat_test_mse_list.append(lat_test_mse)

    lon_train_r2_list.append(lon_train_r2)
    lon_test_r2_list.append(lon_test_r2)
    lon_train_mse_list.append(lon_train_mse)
    lon_test_mse_list.append(lon_test_mse)

    # Print the results
    print(f"Run {i+1}")
    print("LAT:")
    print(f"  Train R^2: {lat_train_r2:.3f}")
    print(f"  Test R^2: {lat_test_r2:.3f}")
    print(f"  Train MSE: {lat_train_mse:.3f}")
    print(f"  Test MSE: {lat_test_mse:.3f}")

    print("LON:")
    print(f"  Train R^2: {lon_train_r2:.3f}")
    print(f"  Test R^2: {lon_test_r2:.3f}")
    print(f"  Train MSE: {lon_train_mse:.3f}")
    print(f"  Test MSE: {lon_test_mse:.3f}")
    print("-" * 40)

# Calculate and print averages of 5 runs
average_lat_train_r2 = np.mean(lat_train_r2_list)
average_lat_test_r2 = np.mean(lat_test_r2_list)
average_lat_train_mse = np.mean(lat_train_mse_list)
average_lat_test_mse = np.mean(lat_test_mse_list)

average_lon_train_r2 = np.mean(lon_train_r2_list)
average_lon_test_r2 = np.mean(lon_test_r2_list)
average_lon_train_mse = np.mean(lon_train_mse_list)
average_lon_test_mse = np.mean(lon_test_mse_list)

print("Average Results")
print("LAT:")
print(f"  Train R^2: {average_lat_train_r2:.3f}")
print(f"  Test R^2: {average_lat_test_r2:.3f}")
print(f"  Train MSE: {average_lat_train_mse:.3f}")
print(f"  Test MSE: {average_lat_test_mse:.3f}")

print("LON:")
print(f"  Train R^2: {average_lon_train_r2:.3f}")
print(f"  Test R^2: {average_lon_test_r2:.3f}")
print(f"  Train MSE: {average_lon_train_mse:.3f}")
print(f"  Test MSE: {average_lon_test_mse:.3f}")


LAT:
  Train R^2: 0.8403685028250377
  Test R^2: 0.030932469354816305
  Train MSE: 0.1410073027483982
  Test MSE: 0.5765204213566909
LON:
  Train R^2: 0.837660123164662
  Test R^2: 0.0016135085054074239
  Train MSE: 1.7054999228221475
  Test MSE: 6.998829028242994


In [7]:
# Decision Tree Regressor
#
# Fits one DecisionTreeRegressor for LAT and one for LON on a sample of up to
# 10,000 rows of df_assault, repeats this over 5 different train/test splits,
# and prints the per-run and averaged R^2 / MSE for the train and test parts.

# 10,000 Values Samples
# The sample is stored under its own name instead of overwriting df_assault,
# so this cell is idempotent and does not change the frame other cells read.
# min() avoids a ValueError from sample() if fewer than 10,000 rows exist.
assault_sample = df_assault.sample(n=min(10000, len(df_assault)), random_state=42)

# Select features and target variables
X = assault_sample.drop(columns=['LAT', 'LON'])
y_lat = assault_sample['LAT']
y_lon = assault_sample['LON']

# Convert categorical variables to dummy variables
# NOTE(review): X still includes DR_NO and the text column LOCATION. DR_NO
# looks like a per-report identifier and LOCATION like a street address, so
# one-hot encoding may create a very wide, mostly unique feature set. The
# saved output (train R^2 of 1.0 against a near-zero test R^2) is consistent
# with that. TODO confirm with the author whether to drop these columns.
X = pd.get_dummies(X, drop_first=True)

# Initialize lists to take averages easier later on
lat_train_r2_list = []
lat_test_r2_list = []
lat_train_mse_list = []
lat_test_mse_list = []

lon_train_r2_list = []
lon_test_r2_list = []
lon_train_mse_list = []
lon_test_mse_list = []

# Run the model 5 times
for i in range(5):
    # Split once, passing both targets, so LAT and LON are guaranteed to share
    # the same train/test rows (the run index doubles as the split seed).
    X_train, X_test, y_lat_train, y_lat_test, y_lon_train, y_lon_test = train_test_split(
        X, y_lat, y_lon, test_size=0.2, random_state=i
    )

    # Initialize and train the model for LAT using Decision Tree Regressor
    # (the run index also seeds the tree, so each run is repeatable)
    model_lat = DecisionTreeRegressor(random_state=i)
    model_lat.fit(X_train, y_lat_train)
    y_lat_pred_train = model_lat.predict(X_train)
    y_lat_pred_test = model_lat.predict(X_test)

    # Initialize and train the model for LON using Decision Tree Regressor
    model_lon = DecisionTreeRegressor(random_state=i)
    model_lon.fit(X_train, y_lon_train)
    y_lon_pred_train = model_lon.predict(X_train)
    y_lon_pred_test = model_lon.predict(X_test)

    # Evaluate the model for LAT
    lat_train_r2 = r2_score(y_lat_train, y_lat_pred_train)
    lat_test_r2 = r2_score(y_lat_test, y_lat_pred_test)
    lat_train_mse = mean_squared_error(y_lat_train, y_lat_pred_train)
    lat_test_mse = mean_squared_error(y_lat_test, y_lat_pred_test)

    # Evaluate the model for LON
    lon_train_r2 = r2_score(y_lon_train, y_lon_pred_train)
    lon_test_r2 = r2_score(y_lon_test, y_lon_pred_test)
    lon_train_mse = mean_squared_error(y_lon_train, y_lon_pred_train)
    lon_test_mse = mean_squared_error(y_lon_test, y_lon_pred_test)

    # Append the metrics to the lists
    lat_train_r2_list.append(lat_train_r2)
    lat_test_r2_list.append(lat_test_r2)
    lat_train_mse_list.append(lat_train_mse)
    lat_test_mse_list.append(lat_test_mse)

    lon_train_r2_list.append(lon_train_r2)
    lon_test_r2_list.append(lon_test_r2)
    lon_train_mse_list.append(lon_train_mse)
    lon_test_mse_list.append(lon_test_mse)

    # Print the results
    print(f"Run {i+1}")
    print("LAT:")
    print(f"  Train R^2: {lat_train_r2:.3f}")
    print(f"  Test R^2: {lat_test_r2:.3f}")
    print(f"  Train MSE: {lat_train_mse:.3f}")
    print(f"  Test MSE: {lat_test_mse:.3f}")

    print("LON:")
    print(f"  Train R^2: {lon_train_r2:.3f}")
    print(f"  Test R^2: {lon_test_r2:.3f}")
    print(f"  Train MSE: {lon_train_mse:.3f}")
    print(f"  Test MSE: {lon_test_mse:.3f}")
    print("-" * 40)

# Calculate and print averages of 5 runs
average_lat_train_r2 = np.mean(lat_train_r2_list)
average_lat_test_r2 = np.mean(lat_test_r2_list)
average_lat_train_mse = np.mean(lat_train_mse_list)
average_lat_test_mse = np.mean(lat_test_mse_list)

average_lon_train_r2 = np.mean(lon_train_r2_list)
average_lon_test_r2 = np.mean(lon_test_r2_list)
average_lon_train_mse = np.mean(lon_train_mse_list)
average_lon_test_mse = np.mean(lon_test_mse_list)

print("Average Results")
print("LAT:")
print(f"  Train R^2: {average_lat_train_r2:.3f}")
print(f"  Test R^2: {average_lat_test_r2:.3f}")
print(f"  Train MSE: {average_lat_train_mse:.3f}")
print(f"  Test MSE: {average_lat_test_mse:.3f}")

print("LON:")
print(f"  Train R^2: {average_lon_train_r2:.3f}")
print(f"  Test R^2: {average_lon_test_r2:.3f}")
print(f"  Train MSE: {average_lon_train_mse:.3f}")
print(f"  Test MSE: {average_lon_test_mse:.3f}")


LAT:
  Train R^2: 1.0
  Test R^2: 0.029223606313762085
  Train MSE: 1.262177448353619e-31
  Test MSE: 0.576503386475
LON:
  Train R^2: 1.0
  Test R^2: 0.0017933729005902288
  Train MSE: 8.103179218430233e-30
  Test MSE: 6.99624549957
