In [1]:
# import pandas and upload a dataset to a dataframe

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Loading Data into df
df = pd.read_csv('cleanedcrimedata.csv')
df = df.drop(columns=['Unnamed: 0', 'Date Rptd', 'DATE OCC', 'Mocodes', 'Vict Sex', 'Vict Descent'])

print(df.dtypes)

DR_NO            int64
TIME OCC         int64
AREA             int64
AREA NAME       object
Rpt Dist No      int64
Part 1-2         int64
Crm Cd           int64
Crm Cd Desc     object
Vict Age         int64
Premis Cd      float64
Premis Desc     object
Status          object
Status Desc     object
Crm Cd 1       float64
LOCATION        object
LAT            float64
LON            float64
dtype: object


In [3]:
top_25_value_counts = df['Crm Cd Desc'].value_counts().head(25)

print(top_25_value_counts)

VEHICLE - STOLEN                                            102886
BATTERY - SIMPLE ASSAULT                                     74541
BURGLARY FROM VEHICLE                                        58578
THEFT OF IDENTITY                                            58518
BURGLARY                                                     57527
VANDALISM - FELONY ($400 & OVER, ALL CHURCH VANDALISMS)      57439
ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT               53208
THEFT PLAIN - PETTY ($950 & UNDER)                           48464
INTIMATE PARTNER - SIMPLE ASSAULT                            46650
THEFT FROM MOTOR VEHICLE - PETTY ($950 & UNDER)              36887
THEFT FROM MOTOR VEHICLE - GRAND ($950.01 AND OVER)          34042
ROBBERY                                                      31982
THEFT-GRAND ($950.01 & OVER)EXCPT,GUNS,FOWL,LIVESTK,PROD     31840
SHOPLIFTING - PETTY THEFT ($950 & UNDER)                     25817
VANDALISM - MISDEAMEANOR ($399 OR UNDER)                     2

In [4]:
# Filter for 'BATTERY - SIMPLE ASSAULT'
df_battery = df[df['Crm Cd Desc'] == 'BATTERY - SIMPLE ASSAULT']

# Count NaN values in each column
nan_counts = df_battery.isna().sum()
print(nan_counts)

# Drop rows with any NaN values
df_battery = df_battery.dropna()

# Drop the 'Crm Cd Desc' column
df_battery = df_battery.drop(columns=['Crm Cd Desc'])

# Display the cleaned DataFrame
print(df_battery)

DR_NO           0
TIME OCC        0
AREA            0
AREA NAME       0
Rpt Dist No     0
Part 1-2        0
Crm Cd          0
Crm Cd Desc     0
Vict Age        0
Premis Cd       0
Premis Desc    18
Status          0
Status Desc     0
Crm Cd 1        0
LOCATION        0
LAT             0
LON             0
dtype: int64
            DR_NO  TIME OCC  AREA    AREA NAME  Rpt Dist No  Part 1-2  Crm Cd  \
9       211904005      1220    19      Mission         1974         2     624   
29      220808837      1630     8      West LA          842         2     624   
196     211206198      1400    12  77th Street         1265         2     624   
206     221715311       200    17   Devonshire         1712         2     624   
247     231107419      2346    11    Northeast         1122         2     624   
...           ...       ...   ...          ...          ...       ...     ...   
947699  241508672      1730    15  N Hollywood         1555         2     624   
947718  240706977      1540     7

In [5]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

# Sampling 10,000 values from the dataframe
df_battery = df_battery.sample(n=10000, random_state=42)

# Select features and target variables
X = df_battery.drop(columns=['LAT', 'LON'])
y_lat = df_battery['LAT']
y_lon = df_battery['LON']

# Convert categorical variables to dummy variables
X = pd.get_dummies(X, drop_first=True)

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets (80:20)
X_train, X_test, y_lat_train, y_lat_test = train_test_split(X_scaled, y_lat, test_size=0.2, random_state=42)
_, _, y_lon_train, y_lon_test = train_test_split(X_scaled, y_lon, test_size=0.2, random_state=42)

# Initialize and train the model for LAT
model_lat = LinearRegression()
model_lat.fit(X_train, y_lat_train)
y_lat_pred_train = model_lat.predict(X_train)
y_lat_pred_test = model_lat.predict(X_test)

# Initialize and train the model for LON
model_lon = LinearRegression()
model_lon.fit(X_train, y_lon_train)
y_lon_pred_train = model_lon.predict(X_train)
y_lon_pred_test = model_lon.predict(X_test)

# Evaluate the model for LAT
lat_train_r2 = r2_score(y_lat_train, y_lat_pred_train)
lat_test_r2 = r2_score(y_lat_test, y_lat_pred_test)
lat_train_mse = mean_squared_error(y_lat_train, y_lat_pred_train)
lat_test_mse = mean_squared_error(y_lat_test, y_lat_pred_test)

# Evaluate the model for LON
lon_train_r2 = r2_score(y_lon_train, y_lon_pred_train)
lon_test_r2 = r2_score(y_lon_test, y_lon_pred_test)
lon_train_mse = mean_squared_error(y_lon_train, y_lon_pred_train)
lon_test_mse = mean_squared_error(y_lon_test, y_lon_pred_test)

# Print the results
print(f"LAT:")
print(f"  Train R^2: {lat_train_r2}")
print(f"  Test R^2: {lat_test_r2}")
print(f"  Train MSE: {lat_train_mse}")
print(f"  Test MSE: {lat_test_mse}")

print(f"LON:")
print(f"  Train R^2: {lon_train_r2}")
print(f"  Test R^2: {lon_test_r2}")
print(f"  Train MSE: {lon_train_mse}")
print(f"  Test MSE: {lon_test_mse}")

LAT:
  Train R^2: -2.5238114241824645
  Test R^2: -5.744201197987403e+29
  Train MSE: 17.85568439131715
  Test MSE: 2.9936243658984243e+30
LON:
  Train R^2: -2.161664207561578
  Test R^2: -5.756948312191049e+29
  Train MSE: 192.90950678430985
  Test MSE: 3.612513371297735e+31


In [9]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Sampling 10,000 values from the dataframe
df_battery = df_battery.sample(n=10000, random_state=42)

# Select features and target variables
X = df_battery.drop(columns=['LAT', 'LON'])
y_lat = df_battery['LAT']
y_lon = df_battery['LON']

# Convert categorical variables to dummy variables
X = pd.get_dummies(X, drop_first=True)

# Split the data into training and testing sets (80:20)
X_train, X_test, y_lat_train, y_lat_test = train_test_split(X, y_lat, test_size=0.2, random_state=42)
_, _, y_lon_train, y_lon_test = train_test_split(X, y_lon, test_size=0.2, random_state=42)

# Initialize and train the model for LAT
model_lat = RandomForestRegressor(random_state=42, n_estimators=100, max_depth=10)
model_lat.fit(X_train, y_lat_train)
y_lat_pred_train = model_lat.predict(X_train)
y_lat_pred_test = model_lat.predict(X_test)

# Initialize and train the model for LON
model_lon = RandomForestRegressor(random_state=42, n_estimators=100, max_depth=10)
model_lon.fit(X_train, y_lon_train)
y_lon_pred_train = model_lon.predict(X_train)
y_lon_pred_test = model_lon.predict(X_test)

# Evaluate the model for LAT
lat_train_r2 = r2_score(y_lat_train, y_lat_pred_train)
lat_test_r2 = r2_score(y_lat_test, y_lat_pred_test)
lat_train_mse = mean_squared_error(y_lat_train, y_lat_pred_train)
lat_test_mse = mean_squared_error(y_lat_test, y_lat_pred_test)

# Evaluate the model for LON
lon_train_r2 = r2_score(y_lon_train, y_lon_pred_train)
lon_test_r2 = r2_score(y_lon_test, y_lon_pred_test)
lon_train_mse = mean_squared_error(y_lon_train, y_lon_pred_train)
lon_test_mse = mean_squared_error(y_lon_test, y_lon_pred_test)

# Print the results
print(f"LAT:")
print(f"  Train R^2: {lat_train_r2}")
print(f"  Test R^2: {lat_test_r2}")
print(f"  Train MSE: {lat_train_mse}")
print(f"  Test MSE: {lat_test_mse}")

print(f"LON:")
print(f"  Train R^2: {lon_train_r2}")
print(f"  Test R^2: {lon_test_r2}")
print(f"  Train MSE: {lon_train_mse}")
print(f"  Test MSE: {lon_test_mse}")

LAT:
  Train R^2: 0.45137207745392405
  Test R^2: -0.08897382516276586
  Train MSE: 2.780022926120758
  Test MSE: 5.674915431892419
LON:
  Train R^2: 0.45226125289000774
  Test R^2: -0.09025351335628184
  Train MSE: 33.420110457339895
  Test MSE: 68.4161261158517


In [10]:
# Decision Tree Regressor

import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Sampling 10,000 values from the dataframe
df_battery = df_battery.sample(n=10000, random_state=42)

# Select features and target variables
X = df_battery.drop(columns=['LAT', 'LON'])
y_lat = df_battery['LAT']
y_lon = df_battery['LON']

# Convert categorical variables to dummy variables
X = pd.get_dummies(X, drop_first=True)

# Split the data into training and testing sets (80:20)
X_train, X_test, y_lat_train, y_lat_test = train_test_split(X, y_lat, test_size=0.2, random_state=42)
X_train, X_test, y_lon_train, y_lon_test = train_test_split(X, y_lon, test_size=0.2, random_state=42)

# Initialize and train the model for LAT
model_lat = DecisionTreeRegressor(random_state=42)
model_lat.fit(X_train, y_lat_train)
y_lat_pred_train = model_lat.predict(X_train)
y_lat_pred_test = model_lat.predict(X_test)

# Initialize and train the model for LON
model_lon = DecisionTreeRegressor(random_state=42)
model_lon.fit(X_train, y_lon_train)
y_lon_pred_train = model_lon.predict(X_train)
y_lon_pred_test = model_lon.predict(X_test)

# Evaluate the model for LAT
lat_train_r2 = r2_score(y_lat_train, y_lat_pred_train)
lat_test_r2 = r2_score(y_lat_test, y_lat_pred_test)
lat_train_mse = mean_squared_error(y_lat_train, y_lat_pred_train)
lat_test_mse = mean_squared_error(y_lat_test, y_lat_pred_test)

# Evaluate the model for LON
lon_train_r2 = r2_score(y_lon_train, y_lon_pred_train)
lon_test_r2 = r2_score(y_lon_test, y_lon_pred_test)
lon_train_mse = mean_squared_error(y_lon_train, y_lon_pred_train)
lon_test_mse = mean_squared_error(y_lon_test, y_lon_pred_test)

# Print the results
print(f"LAT:")
print(f"  Train R^2: {lat_train_r2}")
print(f"  Test R^2: {lat_test_r2}")
print(f"  Train MSE: {lat_train_mse}")
print(f"  Test MSE: {lat_test_mse}")

print(f"LON:")
print(f"  Train R^2: {lon_train_r2}")
print(f"  Test R^2: {lon_test_r2}")
print(f"  Train MSE: {lon_train_mse}")
print(f"  Test MSE: {lon_test_mse}")

LAT:
  Train R^2: 0.9999999999921062
  Test R^2: -0.7785745496455341
  Train MSE: 4.0000000000524094e-11
  Test MSE: 9.268488273674999
LON:
  Train R^2: 0.9999999999993444
  Test R^2: -1.008218017208229
  Train MSE: 3.999999999981355e-11
  Test MSE: 126.020410867765
