In [3]:
# import pandas and upload a dataset to a dataframe

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Loading Data into df
df = pd.read_csv('cleanedcrimedata.csv')
df = df.drop(columns=['Unnamed: 0', 'Date Rptd', 'DATE OCC', 'Mocodes', 'Vict Sex', 'Vict Descent'])

print(df.dtypes)

DR_NO            int64
TIME OCC         int64
AREA             int64
AREA NAME       object
Rpt Dist No      int64
Part 1-2         int64
Crm Cd           int64
Crm Cd Desc     object
Vict Age         int64
Premis Cd      float64
Premis Desc     object
Status          object
Status Desc     object
Crm Cd 1       float64
LOCATION        object
LAT            float64
LON            float64
dtype: object


In [5]:
top_25_value_counts = df['Crm Cd Desc'].value_counts().head(25)

print(top_25_value_counts)

VEHICLE - STOLEN                                            102886
BATTERY - SIMPLE ASSAULT                                     74541
BURGLARY FROM VEHICLE                                        58578
THEFT OF IDENTITY                                            58518
BURGLARY                                                     57527
VANDALISM - FELONY ($400 & OVER, ALL CHURCH VANDALISMS)      57439
ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT               53208
THEFT PLAIN - PETTY ($950 & UNDER)                           48464
INTIMATE PARTNER - SIMPLE ASSAULT                            46650
THEFT FROM MOTOR VEHICLE - PETTY ($950 & UNDER)              36887
THEFT FROM MOTOR VEHICLE - GRAND ($950.01 AND OVER)          34042
ROBBERY                                                      31982
THEFT-GRAND ($950.01 & OVER)EXCPT,GUNS,FOWL,LIVESTK,PROD     31840
SHOPLIFTING - PETTY THEFT ($950 & UNDER)                     25817
VANDALISM - MISDEAMEANOR ($399 OR UNDER)                     2

In [6]:
# Filter for 'BURGLARY'
df_BURGLARY = df[df['Crm Cd Desc'] == 'BURGLARY']

# Count NaN values in each column
nan_counts = df_BURGLARY.isna().sum()
print(nan_counts)

# Drop rows with any NaN values
df_BURGLARY = df_BURGLARY.dropna()

# Drop the 'Crm Cd Desc' column
df_BURGLARY = df_BURGLARY.drop(columns=['Crm Cd Desc'])

# Display the cleaned DataFrame
print(df_BURGLARY)

DR_NO            0
TIME OCC         0
AREA             0
AREA NAME        0
Rpt Dist No      0
Part 1-2         0
Crm Cd           0
Crm Cd Desc      0
Vict Age         0
Premis Cd        0
Premis Desc    282
Status           0
Status Desc      0
Crm Cd 1         0
LOCATION         0
LAT              0
LON              0
dtype: int64
            DR_NO  TIME OCC  AREA    AREA NAME  Rpt Dist No  Part 1-2  Crm Cd  \
47      230610629       430     6    Hollywood          645         1     310   
72      211404152      1600    14      Pacific         1463         1     310   
148     210704053       400     7     Wilshire          722         1     310   
162     231207476         1    12  77th Street         1273         1     310   
183     220904776      1200     9     Van Nuys          974         1     310   
...           ...       ...   ...          ...          ...       ...     ...   
947610  240804146      1730     8      West LA          811         1     310   
947633  24070664

In [7]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

# Sampling 10,000 values from the dataframe
df_BURGLARY = df_BURGLARY.sample(n=10000, random_state=42)

# Select features and target variables
X = df_BURGLARY.drop(columns=['LAT', 'LON'])
y_lat = df_BURGLARY['LAT']
y_lon = df_BURGLARY['LON']

# Convert categorical variables to dummy variables
X = pd.get_dummies(X, drop_first=True)

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets (80:20)
X_train, X_test, y_lat_train, y_lat_test = train_test_split(X_scaled, y_lat, test_size=0.2, random_state=42)
_, _, y_lon_train, y_lon_test = train_test_split(X_scaled, y_lon, test_size=0.2, random_state=42)

# Initialize and train the model for LAT
model_lat = LinearRegression()
model_lat.fit(X_train, y_lat_train)
y_lat_pred_train = model_lat.predict(X_train)
y_lat_pred_test = model_lat.predict(X_test)

# Initialize and train the model for LON
model_lon = LinearRegression()
model_lon.fit(X_train, y_lon_train)
y_lon_pred_train = model_lon.predict(X_train)
y_lon_pred_test = model_lon.predict(X_test)

# Evaluate the model for LAT
lat_train_r2 = r2_score(y_lat_train, y_lat_pred_train)
lat_test_r2 = r2_score(y_lat_test, y_lat_pred_test)
lat_train_mse = mean_squared_error(y_lat_train, y_lat_pred_train)
lat_test_mse = mean_squared_error(y_lat_test, y_lat_pred_test)

# Evaluate the model for LON
lon_train_r2 = r2_score(y_lon_train, y_lon_pred_train)
lon_test_r2 = r2_score(y_lon_test, y_lon_pred_test)
lon_train_mse = mean_squared_error(y_lon_train, y_lon_pred_train)
lon_test_mse = mean_squared_error(y_lon_test, y_lon_pred_test)

# Print the results
print(f"LAT:")
print(f"  Train R^2: {lat_train_r2}")
print(f"  Test R^2: {lat_test_r2}")
print(f"  Train MSE: {lat_train_mse}")
print(f"  Test MSE: {lat_test_mse}")

print(f"LON:")
print(f"  Train R^2: {lon_train_r2}")
print(f"  Test R^2: {lon_test_r2}")
print(f"  Train MSE: {lon_train_mse}")
print(f"  Test MSE: {lon_test_mse}")


LAT:
  Train R^2: -4.8572666060396426
  Test R^2: -3.05603913093071e+31
  Train MSE: 2.621482175399301
  Test MSE: 3.57230333782738e+29
LON:
  Train R^2: -4.633654493965045
  Test R^2: -3.655376839863193e+32
  Train MSE: 29.65962320627644
  Test MSE: 4.3429464352156767e+30


In [8]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming df_BURGLARY is your DataFrame with the 'BURGLARY' data

# Sampling 10,000 values from the dataframe
df_BURGLARY = df_BURGLARY.sample(n=10000, random_state=42)

# Select features and target variables
X = df_BURGLARY.drop(columns=['LAT', 'LON'])
y_lat = df_BURGLARY['LAT']
y_lon = df_BURGLARY['LON']

# Convert categorical variables to dummy variables
X = pd.get_dummies(X, drop_first=True)

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets (80:20)
X_train, X_test, y_lat_train, y_lat_test = train_test_split(X_scaled, y_lat, test_size=0.2, random_state=42)
_, _, y_lon_train, y_lon_test = train_test_split(X_scaled, y_lon, test_size=0.2, random_state=42)

# Initialize and train the model for LAT
model_lat = RandomForestRegressor(random_state=42)
model_lat.fit(X_train, y_lat_train)
y_lat_pred_train = model_lat.predict(X_train)
y_lat_pred_test = model_lat.predict(X_test)

# Initialize and train the model for LON
model_lon = RandomForestRegressor(random_state=42)
model_lon.fit(X_train, y_lon_train)
y_lon_pred_train = model_lon.predict(X_train)
y_lon_pred_test = model_lon.predict(X_test)

# Evaluate the model for LAT
lat_train_r2 = r2_score(y_lat_train, y_lat_pred_train)
lat_test_r2 = r2_score(y_lat_test, y_lat_pred_test)
lat_train_mse = mean_squared_error(y_lat_train, y_lat_pred_train)
lat_test_mse = mean_squared_error(y_lat_test, y_lat_pred_test)

# Evaluate the model for LON
lon_train_r2 = r2_score(y_lon_train, y_lon_pred_train)
lon_test_r2 = r2_score(y_lon_test, y_lon_pred_test)
lon_train_mse = mean_squared_error(y_lon_train, y_lon_pred_train)
lon_test_mse = mean_squared_error(y_lon_test, y_lon_pred_test)

# Print the results
print(f"LAT:")
print(f"  Train R^2: {lat_train_r2}")
print(f"  Test R^2: {lat_test_r2}")
print(f"  Train MSE: {lat_train_mse}")
print(f"  Test MSE: {lat_test_mse}")

print(f"LON:")
print(f"  Train R^2: {lon_train_r2}")
print(f"  Test R^2: {lon_test_r2}")
print(f"  Train MSE: {lon_train_mse}")
print(f"  Test MSE: {lon_test_mse}")


LAT:
  Train R^2: 0.8367573357484975
  Test R^2: 0.999020229829872
  Train MSE: 0.0730486463255179
  Test MSE: 1.1704473168500427e-05
LON:
  Train R^2: 0.8204283311321212
  Test R^2: 0.9910412276231422
  Train MSE: 0.9454294348060139
  Test MSE: 0.00010102901628795973


In [9]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming df_BURGLARY is your DataFrame with the 'BURGLARY' data

# Sampling 10,000 values from the dataframe
df_BURGLARY = df_BURGLARY.sample(n=10000, random_state=42)

# Select features and target variables
X = df_BURGLARY.drop(columns=['LAT', 'LON'])
y_lat = df_BURGLARY['LAT']
y_lon = df_BURGLARY['LON']

# Convert categorical variables to dummy variables
X = pd.get_dummies(X, drop_first=True)

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets (80:20)
X_train, X_test, y_lat_train, y_lat_test = train_test_split(X_scaled, y_lat, test_size=0.2, random_state=42)
_, _, y_lon_train, y_lon_test = train_test_split(X_scaled, y_lon, test_size=0.2, random_state=42)

# Initialize and train the model for LAT
model_lat = DecisionTreeRegressor(random_state=42)
model_lat.fit(X_train, y_lat_train)
y_lat_pred_train = model_lat.predict(X_train)
y_lat_pred_test = model_lat.predict(X_test)

# Initialize and train the model for LON
model_lon = DecisionTreeRegressor(random_state=42)
model_lon.fit(X_train, y_lon_train)
y_lon_pred_train = model_lon.predict(X_train)
y_lon_pred_test = model_lon.predict(X_test)

# Evaluate the model for LAT
lat_train_r2 = r2_score(y_lat_train, y_lat_pred_train)
lat_test_r2 = r2_score(y_lat_test, y_lat_pred_test)
lat_train_mse = mean_squared_error(y_lat_train, y_lat_pred_train)
lat_test_mse = mean_squared_error(y_lat_test, y_lat_pred_test)

# Evaluate the model for LON
lon_train_r2 = r2_score(y_lon_train, y_lon_pred_train)
lon_test_r2 = r2_score(y_lon_test, y_lon_pred_test)
lon_train_mse = mean_squared_error(y_lon_train, y_lon_pred_train)
lon_test_mse = mean_squared_error(y_lon_test, y_lon_pred_test)

# Print the results
print(f"LAT:")
print(f"  Train R^2: {lat_train_r2}")
print(f"  Test R^2: {lat_test_r2}")
print(f"  Train MSE: {lat_train_mse}")
print(f"  Test MSE: {lat_test_mse}")

print(f"LON:")
print(f"  Train R^2: {lon_train_r2}")
print(f"  Test R^2: {lon_test_r2}")
print(f"  Train MSE: {lon_train_mse}")
print(f"  Test MSE: {lon_test_mse}")


LAT:
  Train R^2: 1.0
  Test R^2: 0.9983033489715756
  Train MSE: 7.131302583197947e-31
  Test MSE: 1.883408499999888e-05
LON:
  Train R^2: 1.0
  Test R^2: 0.9908351590569279
  Train MSE: 6.0584517520973704e-30
  Test MSE: 0.00010550220500000523
