In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier  # Importera KNeighborsClassifier
from sklearn.svm import SVC  # Importera Support Vector Machine (SVC)

In [4]:
# Steg 1: Ladda datasetet
data = pd.read_csv('data.csv')

# Kontrollera kolumnnamnen
print("Kolumnnamn i datasetet:", data.columns)

# Steg 2: Utforsk data
print("Första 5 rader i datasetet:")
print(data.head())
print("Information om datasetet:")
print(data.info())
print("Statistisk sammanfattning:")
print(data.describe())

Kolumnnamn i datasetet: Index(['DR_NO', 'Date_Reported', 'Date_occured', 'Time_occured', 'Area',
       'Crime_Code', 'Victim_age', 'Victim_sex', 'Victim_descent', 'Premis',
       'Weapon', 'Status', 'LOCATION', 'LAT', 'LON'],
      dtype='object')
Första 5 rader i datasetet:
       DR_NO           Date_Reported            Date_occured  Time_occured  \
0  190326475           03/01/20 0:00           03/01/20 0:00          2130   
1  200106753           02/09/20 0:00           02/08/20 0:00          1800   
2  200320258           11/11/20 0:00           11/04/20 0:00          1700   
3  200907217           05/10/23 0:00           03/10/20 0:00          2037   
4  220614831  08/18/2022 12:00:00 AM  08/17/2020 12:00:00 AM          1200   

        Area                                Crime_Code  Victim_age Victim_sex  \
0   Wilshire                          VEHICLE - STOLEN           0          M   
1    Central                     BURGLARY FROM VEHICLE          47          M   
2  Southwe

In [5]:
# Konvertera 'Date_occured' och 'Date_Reported' till datetime-format
data['Date_occured'] = pd.to_datetime(data['Date_occured'], format='%m/%d/%y %H:%M', errors='coerce')
data['Date_Reported'] = pd.to_datetime(data['Date_Reported'], format='%m/%d/%y %H:%M', errors='coerce')

# Kontrollera om det finns NaT-värden efter konvertering
print("Rader med saknade värden i 'Date_occured':")
print(data[data['Date_occured'].isna()])

print("Rader med saknade värden i 'Date_Reported':")
print(data[data['Date_Reported'].isna()])

Rader med saknade värden i 'Date_occured':
            DR_NO Date_Reported Date_occured  Time_occured        Area  \
4       220614831           NaT          NaT          1200   Hollywood   
9       211904005           NaT          NaT          1220     Mission   
17      210705560           NaT          NaT          1800    Wilshire   
21      231907172           NaT          NaT          1200     Mission   
26      222106031           NaT          NaT          1300     Topanga   
...           ...           ...          ...           ...         ...   
974471  242011172           NaT          NaT          2300     Olympic   
974472  240710284           NaT          NaT          1400    Wilshire   
974473  240104953           NaT          NaT           100     Central   
974474  241711348           NaT          NaT           757  Devonshire   
974475  240309674           NaT          NaT          1500   Southwest   

                                               Crime_Code  Victim_ag

In [6]:
# Skapa nya features baserat på 'Date_occured'
data['Year'] = data['Date_occured'].dt.year
data['Month'] = data['Date_occured'].dt.month
data['Day'] = data['Date_occured'].dt.day

# Omvandla 'Time_occured' till tidsformat (om det behövs)
data['Time_occured'] = data['Time_occured'].astype(str).str.zfill(4)  # Lägger till ledande nollor
data['Hour'] = data['Time_occured'].str[:2].astype(int)
data['Minute'] = data['Time_occured'].str[2:].astype(int)

# Visa resultat
print("Transformerat dataset:")
print(data.head())

Transformerat dataset:
       DR_NO Date_Reported Date_occured Time_occured       Area  \
0  190326475    2020-03-01   2020-03-01         2130   Wilshire   
1  200106753    2020-02-09   2020-02-08         1800    Central   
2  200320258    2020-11-11   2020-11-04         1700  Southwest   
3  200907217    2023-05-10   2020-03-10         2037   Van Nuys   
4  220614831           NaT          NaT         1200  Hollywood   

                                 Crime_Code  Victim_age Victim_sex  \
0                          VEHICLE - STOLEN           0          M   
1                     BURGLARY FROM VEHICLE          47          M   
2                             BIKE - STOLEN          19          X   
3  SHOPLIFTING-GRAND THEFT ($950.01 & OVER)          19          M   
4                         THEFT OF IDENTITY          28          M   

  Victim_descent                                        Premis Weapon  \
0              O                                        STREET    NaN   
1      