#### Import required modules

In [None]:
import pandas as pd
import matplotlib
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import train_test_split

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# Load the dataset CSV into the working DataFrame and display it.
# NOTE(review): the path is an absolute location on one Windows machine;
# adjust it before running elsewhere.
dataset_path = r"C:\Users\TOBENNA\Desktop\FINAL YEAR PROJECT\CRIME IN LA\dataset.csv"
df = pd.read_csv(dataset_path)
df

In [None]:
# Parse the occurrence date, derive year / month / day-of-month / weekday-name
# columns from it, then discard the original date column.
df['DATE OCC'] = pd.to_datetime(df['DATE OCC'], format='mixed')
occurred = df['DATE OCC'].dt

for column, values in (
    ('Year OCC', occurred.year),
    ('Month OCC', occurred.month),
    ('DAY OCC', occurred.day),
    ('DAY WEEK OCC', occurred.day_name()),
):
    df[column] = values

df.drop(columns='DATE OCC', inplace=True)
df.head()

In [None]:
# Remove the 'Date Rptd' column in place and preview the result.
del df['Date Rptd']
df.head()

In [None]:
# Show the dtype pandas holds for each remaining column.
print(df.dtypes)

In [None]:
# Collapse the detailed 'Crm Cd Desc' values into broader categories.
#
# Each rule is (keyword, replacement label): every row whose description
# contains the keyword is relabelled. The rules run in order and ORDER MATTERS,
# because a row relabelled by an earlier rule no longer matches later keywords
# (e.g. anything containing 'VEHICLE' is relabelled before the BURGLARY,
# THEFT and STOLEN rules run, and 'THEFT OF IDENTITY' is handled before the
# generic 'THEFT' rule).
crime_group_rules = [
    ('ASSAULT', 'ASSAULT'),                      # group all the assaults
    ('VEHICLE', 'VEHICLE ALTERCATION'),          # vehicle related issues
    ('BURGLARY', 'BURGLARY'),                    # group all the burglaries
    ('THEFT OF IDENTITY', 'IMPERSONIFICATION'),  # identity theft
    ('THEFT', 'THEFT'),                          # group all thefts
    # ('ROBBERY', 'THEFT'),                      # disabled in the original notebook
    ('STOLEN', 'THEFT'),
    ('RAPE', 'SEXUAL HARASSMENT'),
    ('SEXUAL', 'SEXUAL HARASSMENT'),
    ('VANDALISM', 'VANDALISM'),
]

for keyword, group_label in crime_group_rules:
    # na=False: a missing description counts as "no match" instead of putting
    # NaN into the boolean mask, which would make the .loc assignment raise.
    # regex=False: the keywords are plain substrings, not patterns.
    matches = df['Crm Cd Desc'].str.contains(keyword, regex=False, na=False)
    df.loc[matches, 'Crm Cd Desc'] = group_label


In [None]:
# Keep only the crime categories that occur at least 3000 times.
crime_counts = df['Crm Cd Desc'].value_counts()

crimes_to_keep = crime_counts.loc[crime_counts.ge(3000)].index
df = df.loc[df['Crm Cd Desc'].isin(crimes_to_keep)]

# Display the filtered DataFrame
df

In [None]:
# Remove rows that repeat a 'DR_NO' value, keeping the first occurrence.
# The message is printed first; the shape displayed below it is the size
# after the duplicates have been removed.
print('Original Data Size after dropping Duplicates')
is_repeat = df.duplicated(subset='DR_NO', keep='first')
df = df[~is_repeat]
df.shape

In [None]:
# Build a slimmer copy of the data (df_dropped) without the listed columns.
# df itself is left untouched.
drop_colmns = [
    'DR_NO', 'DAY OCC', 'AREA', 'Premis Desc', 'Rpt Dist No',
    'Part 1-2', 'Mocodes', 'Weapon Used Cd', 'Premis Cd',
    'Crm Cd 1', 'Crm Cd 2', 'Crm Cd 3', 'Crm Cd 4',
    'Status Desc', 'Cross Street',
]
df_dropped = df.drop(drop_colmns, axis=1)
df_dropped.head()

In [None]:
# Drop the rows of a few specific crime categories from df.
# NOTE(review): 'VIOLATION OF RESTRAINING ORDER ' carries a trailing space;
# it only matches if the dataset value has the same trailing space - confirm.
excluded_crimes = [
    'VIOLATION OF COURT ORDER',
    # 'SEX,UNLAWFUL(INC MUTUAL CONSENT, PENETRATION W/ FRGN OBJ',
    'LETTERS, LEWD  -  TELEPHONE CALLS, LEWD',
    'VIOLATION OF RESTRAINING ORDER ',
    # 'CONTEMPT OF COURT ',
]
df = df[~df['Crm Cd Desc'].isin(excluded_crimes)]

df

In [None]:
# Number of rows per remaining crime category, most frequent first.
df['Crm Cd Desc'].value_counts()

In [None]:
# (rows, columns) of df at this point in the notebook.
df.shape

In [None]:
# Group the slimmed-down frame by occurrence year for the per-year analysis.
# NOTE(review): df_dropped was created before the category exclusions applied
# to df in a later cell, so it may still contain those categories - confirm
# this is intended.
df_grouped = df_dropped.groupby(df_dropped['Year OCC'])

In [None]:
# Analysis by year: pull one sub-frame per occurrence year, 2020 through 2024.
# get_group raises KeyError if a year is absent from the data.
(df_2020, df_2021, df_2022, df_2023, df_2024) = (
    df_grouped.get_group(year) for year in range(2020, 2025)
)

In [None]:
# Per crime category, count the non-null entries of every column for 2020.
df_2020_grouped = df_2020.groupby(df_2020['Crm Cd Desc']).count()
# The equivalent aggregations for the other years are disabled.
# NOTE(review): the 2022-2024 lines below group by 'Year OCC' rather than
# 'Crm Cd Desc', unlike the 2020/2021 lines - fix before re-enabling.
# df_2021_grouped = df_2021.groupby(df_2021['Crm Cd Desc']).count()
# df_2022_grouped = df_2022.groupby(df_2022['Year OCC']).count()
# df_2023_grouped = df_2023.groupby(df_2023['Year OCC']).count()
# df_2024_grouped = df_2024.groupby(df_2024['Year OCC']).count()
# df_2020_grouped

In [None]:
# Horizontal bar chart of the number of cases per crime category in 2020.
#
# The first column of the count() result is used as the case count; it holds
# the number of non-null entries of that column per category, so it equals the
# true case count only if that column has no missing values.
plot = df_2020_grouped.iloc[:, 0].to_frame(name='Number of Cases')

# The title now names 2020, the year actually plotted (it previously said 2015).
# NOTE(review): figsize is in inches; 100x100 produces a very large figure.
ax = plot.plot(kind='barh', figsize=(100, 100), title='Number of Crimes in LA 2020')

In [None]:
# Preview the first rows of the cleaned DataFrame before encoding.
df.head()

#### Convert to numerical data using factorization

In [None]:

# Build the modelling frame (df2) and integer-encode every column.
#
# pd.factorize returns (codes, uniques): the codes replace the column and the
# uniques are kept in a definition_list_* variable so a code can be mapped
# back to its original value with definition_list_*[code].
# NOTE(review): factorize encodes missing values as -1, and indexing a
# definition list with -1 returns its LAST entry - check for NaNs first.
col_list = ['Year OCC', 'Month OCC', 'DAY WEEK OCC', 'TIME OCC', 'Crm Cd Desc', 'AREA NAME', 'LOCATION', 'Premis Desc']

# Keep only rows after 2021 and only the modelling columns. The explicit
# .copy() makes df2 an independent frame, so the column assignments below
# modify df2 itself rather than a slice of df (avoids SettingWithCopyWarning).
df2 = df.loc[df['Year OCC'] > 2021, col_list].copy()

# Dependent variable: crime category.
crime_var = pd.factorize(df2['Crm Cd Desc'])
df2['Crm Cd Desc'], definition_list_MCI = crime_var

# Independent variables.
# Premise description:
premise_var = pd.factorize(df2['Premis Desc'])
df2['Premis Desc'], definition_list_premise = premise_var

# Occurrence year:
year_var = pd.factorize(df2['Year OCC'])
df2['Year OCC'], definition_list_year = year_var

# Occurrence month:
month_var = pd.factorize(df2['Month OCC'])
df2['Month OCC'], definition_list_month = month_var

# Occurrence day of month is not part of col_list, so it stays disabled:
# day_var = pd.factorize(df2['DAY OCC'])
# df2['DAY OCC'] = day_var[0]
# definition_list_day = day_var[1]

# Occurrence day of week (definition_list_day holds the weekday names):
dayweek_var = pd.factorize(df2['DAY WEEK OCC'])
df2['DAY WEEK OCC'], definition_list_day = dayweek_var

# Area name:
division_var = pd.factorize(df2['AREA NAME'])
df2['AREA NAME'], definition_list_division = division_var

# Location:
hood_var = pd.factorize(df2['LOCATION'])
df2['LOCATION'], definition_list_hood = hood_var

# Occurrence time ('TIME OCC' values are encoded as-is, not reduced to hours):
hour_var = pd.factorize(df2['TIME OCC'])
df2['TIME OCC'], definition_list_hour = hour_var


In [None]:
# Display the integer-encoded modelling frame.
df2

#### Feature Scaling

In [None]:
# Standardize every column of df2 except the target 'Crm Cd Desc'.
X = df2.loc[:, df2.columns != 'Crm Cd Desc']

# Learn each column's mean / standard deviation, then apply the scaling.
# NOTE(review): the scaler is fitted on all rows, i.e. before any train/test
# split is made.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Write the standardized values back over the original feature columns.
df2[X.columns] = X_scaled

In [None]:
# Model inputs: three encoded features; target: the encoded crime category.
x = df2[['AREA NAME', 'Premis Desc', 'DAY WEEK OCC']].values
y = df2['Crm Cd Desc'].values

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=21)

# The feature columns of df2 were standardized using statistics of the whole
# dataset, which lets information about the test rows leak into training.
# Re-standardize with statistics of the training rows only. Standardization is
# unaffected by a prior linear rescaling of the data, so the result equals
# scaling the raw codes with a train-only scaler.
train_scaler = StandardScaler().fit(X_train)
X_train = train_scaler.transform(X_train)
X_test = train_scaler.transform(X_test)

# One-hot variant of the same features. The encoder is fitted on all rows so
# that every category present in the test split is known to it.
binary_encoder = OneHotEncoder(categories='auto')
encoded_X = binary_encoder.fit_transform(x)

# Same test_size and seed as above, so the one-hot split selects the same rows.
X_train_OH, X_test_OH, y_train_OH, y_test_OH = train_test_split(encoded_X, y, test_size=0.25, random_state=21)

#### Random Forest

In [None]:

# Fit a 100-tree random forest (entropy criterion) and evaluate it on the
# held-out rows.
classifier = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    random_state=42,
).fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Sorted codes that occur in the true or predicted labels, mapped back to
# their crime names for the report.
unique_classes = np.union1d(y_test, y_pred)
target_names = list(definition_list_MCI[unique_classes])

print("Accuracy of Random Forest : ", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=target_names))

#### Decision Tree

In [None]:
# Fit a single decision tree and evaluate it on the held-out rows.
classifier = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Sorted codes that occur in the true or predicted labels, mapped back to
# their crime names for the report.
unique_classes = np.union1d(y_test, y_pred)
target_names = list(definition_list_MCI[unique_classes])

print("Accuracy of Decision Tree : ", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=target_names))

#### SVM

In [25]:
# Fit a support vector classifier with the RBF kernel and evaluate it on the
# held-out rows. Other kernels can be selected through `kernel`.
classifier = SVC(kernel='rbf', random_state=42).fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Sorted codes that occur in the true or predicted labels, mapped back to
# their crime names for the report.
unique_classes = np.union1d(y_test, y_pred)
target_names = list(definition_list_MCI[unique_classes])

print("Accuracy of SVM : ", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=target_names))

In [None]:
# Fit a support vector classifier with the sigmoid kernel and evaluate it on
# the held-out rows. Other kernels can be selected through `kernel`.
classifier = SVC(kernel='sigmoid', random_state=42).fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Sorted codes that occur in the true or predicted labels, mapped back to
# their crime names for the report.
unique_classes = np.union1d(y_test, y_pred)
target_names = list(definition_list_MCI[unique_classes])

print("Accuracy of SVM : ", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=target_names))

In [None]:
# y_pred = classifier.predict(X_test)

In [None]:
# y_pred = classifier.predict(np.array([1200, 2,]).reshape(1, -1))
# print(y_pred)

In [None]:
# print(X_test)

In [None]:
# actual_crime_labels = []

# # Loop through each actual crime code and map it to the corresponding label
# for actual_crime_code in y_test:
#     actual_crime_label = definition_list_MCI[actual_crime_code]
#     actual_crime_labels.append(actual_crime_label)

# # Create a list to store the predicted crime labels
# predicted_crime_labels = []

# # Loop through each predicted crime code and map it to the corresponding label
# for predicted_crime_code in y_pred:
#     predicted_crime_label = definition_list_MCI[predicted_crime_code]
#     predicted_crime_labels.append(predicted_crime_label)

# # Create a list to store the original labels of the time
# original_time_labels = []

# # Loop through each time value in X_test and map it to the corresponding label
# for time_value in X_test[:, 0]:
#     original_time_label = definition_list_hour[time_value]
#     original_time_labels.append(original_time_label)

# # Create a list to store the original labels of the area
# original_area_labels = []

# # Loop through each area value in X_test and map it to the corresponding label
# for area_value in X_test[:, 1]:
#     original_area_label = definition_list_division[area_value]
#     original_area_labels.append(original_area_label)

# # Create a list to store the original labels of the day
# original_day_labels = []

# # Loop through each day value in X_test and map it to the corresponding label
# for day_value in X_test[:, 2]:
#     original_day_label = definition_list_day[day_value]
#     original_day_labels.append(original_day_label)

# # Create the testing_results DataFrame with 'Time', 'Actual Crime', 'Predicted Crime', 'Predicted Crime Label', and 'Original Time Label' columns
# testing_results = pd.DataFrame({'Time': original_time_labels,  # Use original time labels
#                                 'Area': original_area_labels,
#                                 'Week Day': original_day_labels,
#                                 'Actual Crime Label': actual_crime_labels,
#                                 'Predicted Crime Label': predicted_crime_labels})  # Add the predicted crime labels

# # Display the testing results DataFrame
# testing_results

In [None]:
# predicted_crime = classifier.predict(np.array(500).reshape(1, -1))

# print('predicted_crime: ', predicted_crime)

# predicted_crime_label = definition_list_MCI[predicted_crime]
# print(predicted_crime_label)

In [None]:
# classifier = RandomForestClassifier(n_estimators = 100, criterion = 'entropy', random_state = 42)
# classifier.fit(X_train_OH, y_train_OH)
# y_pred_OH = classifier.predict(X_test_OH)

# print("Accuracy of Random Forest with OneHotEncoder : ",accuracy_score(y_test_OH, y_pred_OH))
# print(confusion_matrix(y_test_OH, y_pred_OH))
# print(classification_report(y_test_OH,y_pred_OH, target_names=definition_list_MCI))