In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd 
import warnings 
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import random 
import folium
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
random.seed(42)
import os

In [3]:
# Load the crime records from the CSV file stored under ../data (path is relative to the notebook).
df = pd.read_csv('../data/crime_in_la.csv')

## Filtrage des données

In [4]:
# Keep only crime code 510 (compared as a string) with a known premise description.
# NOTE(review): 510 is presumably the vehicle-theft code the conclusion refers to - confirm.
df['Crm Cd'] = df['Crm Cd'].astype(str)
crime_df = df.loc[df["Crm Cd"] == "510"]
crime_df = crime_df.loc[crime_df['Premis Desc'].notna()]

# Restrict to the three premise types of interest and drop rows whose
# latitude or longitude is exactly 0.
kept_premises = ["STREET", "PARKING LOT", "DRIVEWAY"]
df_f = crime_df.copy()
df_f = df_f.loc[df_f['Premis Desc'].isin(kept_premises)]
has_zero_coordinate = (df_f['LAT'] == 0) | (df_f['LON'] == 0)
df_f = df_f.loc[~has_zero_coordinate]

# Index the frame by the occurrence date, parsed as day/month/year.
df_f = (
    df_f
    .assign(date=pd.to_datetime(df_f['DATE OCC'], format='%d/%m/%Y'))
    .set_index('date')
)

In [5]:
# Chronological split: rows strictly before the cutoff form the training set,
# rows on or after it form the test set.
date = '01-01-2022'

is_train_row = df_f.index < date
is_test_row = df_f.index >= date

train = df_f.loc[is_train_row].copy()
test = df_f.loc[is_test_row].copy()

def get_data(df, labels= None):
    """
    Build calendar features (and optionally targets) from a date-indexed frame.

    The frame is modified in place: a 'date' column holding the index values
    and one column per calendar feature are added to `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index holds datetime values.
    labels : list of str, optional
        Names of the target columns to return alongside the features.

    Returns
    -------
    pandas.DataFrame or tuple of pandas.DataFrame
        The feature frame `X` alone when `labels` is empty or None,
        otherwise the pair `(X, y)` where `y = df[labels]`.
    """
    df['date'] = df.index
    df['dayofweek'] = df['date'].dt.dayofweek
    df['quarter'] = df['date'].dt.quarter
    df['month'] = df['date'].dt.month
    df['year'] = df['date'].dt.year
    df['dayofyear'] = df['date'].dt.dayofyear
    df['dayofmonth'] = df['date'].dt.day
    # `Series.dt.weekofyear` was removed in pandas 2.0; the ISO calendar week is
    # its replacement. It comes back as a nullable UInt32, so cast it to a plain
    # integer to keep the feature matrix homogeneous (requires no missing dates).
    df['weekofyear'] = df['date'].dt.isocalendar().week.astype('int64')

    X = df[['dayofweek','quarter','month','year', 'dayofyear','dayofmonth','weekofyear']]
    if labels:
        y = df[labels]
        return X, y
    return X

# Derive the calendar features and the (LAT, LON) targets for both splits.
target_columns = ['LAT', 'LON']
X_train, y_train = get_data(train, labels=target_columns)
X_test, y_test = get_data(test, labels=target_columns)

## Modèle 

#### KNN

### Train

In [6]:
from geopy.distance import geodesic as GD
def from_geo_to_m(lon1,lon2,lat1,lat2) :
    """
    Geodesic distance in metres between two geographic points.

    Note the argument order: both longitudes come first, then both latitudes.

    Parameters
    ----------
    lon1 : float
        Longitude of point 1
    lon2 : float
        Longitude of point 2
    lat1 : float
        Latitude of point 1
    lat2 : float
        Latitude of point 2

    Returns
    -------
    float
        Distance in metres between the two points
    """
    # The distance helper takes (latitude, longitude) pairs.
    first_point = (lat1, lon1)
    second_point = (lat2, lon2)
    separation = GD(first_point, second_point)
    return separation.m
# from_geo_to_m(pred[0][1], train['LON'].iloc[0], pred[0][0], train['LAT'].iloc[0])

In [7]:
# Hyper-parameters of the regressor that is scored on the training set.
knn_params = dict(
    n_neighbors=100,
    n_jobs=None,
    weights='distance',
    algorithm='kd_tree', # or ball_tree
    leaf_size=10,
    p=2,
    metric='minkowski',
    metric_params=None,
)
model_knn = KNeighborsRegressor(**knn_params)
model_knn.fit(X_train, y_train)

# Each prediction row is (LAT, LON); keep 4 decimals.
pred = model_knn.predict(X_train)
lat = [round(row[0], 4) for row in pred]
lon = [round(row[1], 4) for row in pred]

# Geodesic distance in metres between each prediction and the true location.
error = [
    from_geo_to_m(pred_lon, true_lon, pred_lat, true_lat)
    for pred_lon, true_lon, pred_lat, true_lat
    in zip(lon, train['LON'], lat, train['LAT'])
]
mean_error = sum(error) / len(error)
print(f"Distance en m: {round(mean_error, 2)}")

Distance en m: 13868.23


On obtient une erreur moyenne d'environ 13,9 km sur le jeu d'entraînement.

In [8]:
# Store the predictions and their geodesic error (metres) next to the true
# coordinates. `lat` / `lon` were already rounded to 4 decimals when computed.
train['LAT_pred'] = lat
train['LON_pred'] = lon
train["Distance"] = error

n = folium.Map(
    location=[34.047598, -118.245564], 
    zoom_start=10,
    zoom_control=True,
    scrollWheelZoom=False,
    dragging=True
)

# Plot at most 1000 rows; capping at len(train) avoids a ValueError on small frames.
testview = train.sample(min(1000, len(train)), random_state=42)
long_lat = testview[['LAT','LON']].values
long_lat_pred = testview[['LAT_pred','LON_pred',"Distance"]].values

# (upper bound in metres, colour), in ascending order. The bins are contiguous,
# so every finite distance gets exactly one colour. The previous inclusive
# integer ranges left gaps (e.g. 1000.5 m or anything below 1 m was never
# drawn) and overlapped at 5001 m.
color_bins = [
    (1000, 'green'),
    (3000, 'orange'),
    (5000, 'red'),
    (10000, 'purple'),
    (float('inf'), 'blue'),
]

for pred_lat, pred_lon, distance_m in long_lat_pred:
    marker_color = next(
        (color for upper, color in color_bins if distance_m <= upper), None
    )
    if marker_color is None:
        # Only a NaN distance matches no bin; there is nothing meaningful to draw.
        continue
    folium.CircleMarker(
        location=[pred_lat, pred_lon], radius=2, color=marker_color
    ).add_to(n)
n

### Test

In [10]:
# NOTE(review): this regressor uses k=2 with the Manhattan metric (p=1), whereas
# the one scored on the training set uses k=100 with the Euclidean metric, so the
# two mean errors are not computed with the same model configuration.
model_knn = KNeighborsRegressor(
    n_neighbors=2, 
    n_jobs=None, 
    weights='distance', 
    algorithm='kd_tree', # or ball_tree
    p=1, 
    metric='minkowski', 
    metric_params=None, 
)

model_knn.fit(
    X_train,
    y_train,
)

# Each prediction row is (LAT, LON); keep 4 decimals.
pred = model_knn.predict(X_test)
lat = [round(coord[0], 4) for coord in pred]
lon = [round(coord[1], 4) for coord in pred]

# Geodesic error in metres, computed once. The earlier version first computed a
# Euclidean distance in degrees that was immediately overwritten, and then ran
# this per-row loop a second time.
error = [
    from_geo_to_m(pred_lon, true_lon, pred_lat, true_lat)
    for pred_lon, true_lon, pred_lat, true_lat
    in zip(lon, test['LON'], lat, test['LAT'])
]
print(f"Distance en m: {round(sum(error)/len(error), 2)}")

# Store the predictions and their error next to the true coordinates.
# No second rounding is needed: `lat` / `lon` are already rounded above.
test['LAT_pred'] = lat
test['LON_pred'] = lon
test["Distance"] = error

# testview = test.sample(1000, random_state=42)
# long_lat = testview[['LAT','LON']].values
# long_lat_pred = testview[['LAT_pred','LON_pred']].values
# for l_l in long_lat:
#     folium.CircleMarker(location=l_l, radius=2, color='red').add_to(n)

# for l_l in long_lat_pred:
#     folium.CircleMarker(location=l_l, radius=2, color='blue').add_to(n)
# n

# colors = {(3001, 1e10): 'blue', (1001, 3000): 'purple', (501, 1000): 'red', (301, 500): 'orange', (1, 300): 'green'}
# colors = {(10001, 1e10): 'blue', (5001, 10000): 'purple', (3001, 5001): 'red', (1001, 3000): 'orange', (1, 1000): 'green'}
# testview = test
# long_lat = testview[['LAT','LON']].values
# long_lat_pred = testview[['LAT_pred','LON_pred']].values

# for l_l in long_lat:
#     folium.CircleMarker(location=l_l, radius=2, color='blue').add_to(n)
# for ll_pred in long_lat_pred:
# #   d_meters = from_geo_to_m(ll_pred[],ll_pred[],ll_reel[],ll_reel)
#   folium.CircleMarker(location=ll_pred, radius=2, color="red").add_to(n)
    

# # i = 0
# # for l_l_d in long_lat_pred:
# #     for key, value in colors.items():
# #         if key[0] <= l_l_d[2] <= key[1]:
# #             folium.CircleMarker(location=l_l_d[:2], radius=2, color=value).add_to(n)
# #             i+=1
# n

Distance en m: 16486.04


On obtient une erreur moyenne d'environ 16,5 km sur le jeu de test.

In [11]:
n = folium.Map(
    location=[34.047598, -118.245564], 
    zoom_start=10,
    zoom_control=True,
    scrollWheelZoom=False,
    dragging=True
)

# Plot at most 1000 rows; capping at len(test) avoids a ValueError on small frames.
testview = test.sample(min(1000, len(test)), random_state=42)
long_lat = testview[['LAT','LON']].values
long_lat_pred = testview[['LAT_pred','LON_pred',"Distance"]].values

# (upper bound in metres, colour), in ascending order. The bins are contiguous,
# so every finite distance gets exactly one colour. The previous inclusive
# integer ranges left gaps (e.g. 1000.5 m or anything below 1 m was never
# drawn) and overlapped at 5001 m.
color_bins = [
    (1000, 'green'),
    (3000, 'orange'),
    (5000, 'red'),
    (10000, 'purple'),
    (float('inf'), 'blue'),
]

for pred_lat, pred_lon, distance_m in long_lat_pred:
    marker_color = next(
        (color for upper, color in color_bins if distance_m <= upper), None
    )
    if marker_color is None:
        # Only a NaN distance matches no bin; there is nothing meaningful to draw.
        continue
    folium.CircleMarker(
        location=[pred_lat, pred_lon], radius=2, color=marker_color
    ).add_to(n)
n

On peut voir ici que les distances entre les points prédits et les points réels sont très élevées. En revanche, les points prédits se situent bien dans la bonne zone géographique.

Le modèle n'est pas satisfaisant. Cela peut être dû au fait qu'il n'existe pas de pattern entre le vol de voiture et les données temporelles.