# Setup

In [149]:
import numpy as np
import pandas as pd
from tensorflow import keras
from keras import datasets, layers, models
from keras.utils import np_utils
import warnings
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


warnings.filterwarnings("ignore")


# Prepare the data

In [135]:

# At first we mount the dataset from local file
inputdf = pd.read_csv('Crime.csv')
# sneak peak in the data
inputdf.head(2)

Unnamed: 0,Incident ID,Offence Code,CR Number,Dispatch Date / Time,NIBRS Code,Victims,Crime Name1,Crime Name2,Crime Name3,Police District Name,...,Street Prefix,Street Name,Street Suffix,Street Type,Start_Date_Time,End_Date_Time,Latitude,Longitude,Police District Number,Location
0,201219928,1204,180063894,12/21/2018 09:13:13 PM,120,1,Crime Against Property,Robbery,ROBBERY - STREET-GUN,WHEATON,...,,GEORGIA,,AVE,12/21/2018 09:13:00 PM,,39.03627,-77.0499,4D,"(39.0363, -77.0499)"
1,201301798,5016,200035833,09/13/2020 12:26:58 AM,90Z,1,Crime Against Society,All Other Offenses,OBSTRUCT GOVT - VIOLATION OF A COURT ORDER,GERMANTOWN,...,,COLTRANE,,DR,08/08/2020 05:10:00 PM,08/08/2020 12:00:00 AM,39.27784,-77.2115,5D,"(39.2778, -77.2115)"


Analyzing the dataset, we observe that columns -


* Incident ID
*	Offence Code
* CR Number
* NIBRS Code
* State (All should be MD)
* Sector, Beat, PRA - meaningless
* Address Number (included in "Block Address")
* Street Prefix (lots of missing values)
* Street Suffix (lots of missing values)
* Police District Number (meaningless)
* Location (pair of Latitude and Longitude)

 are unnecessary. So they are removed.

In [136]:
# dropping the unnecessary columns
#inputdf = data
unnecessary_columns = ['Incident ID', 'Offence Code', 'CR Number', 'NIBRS Code', 'State', 'Sector', 'Beat', 'PRA', 'Address Number', 'Street Prefix', 'Street Suffix', 'Police District Number', 'Location'  ]
for i in unnecessary_columns:
  inputdf = inputdf.drop(i, axis=1)
inputdf.head(2)

Unnamed: 0,Dispatch Date / Time,Victims,Crime Name1,Crime Name2,Crime Name3,Police District Name,Block Address,City,Zip Code,Agency,Place,Street Name,Street Type,Start_Date_Time,End_Date_Time,Latitude,Longitude
0,12/21/2018 09:13:13 PM,1,Crime Against Property,Robbery,ROBBERY - STREET-GUN,WHEATON,11100 BLK GEORGIA AVE,SILVER SPRING,20902.0,MCPD,Street - Bus Stop,GEORGIA,AVE,12/21/2018 09:13:00 PM,,39.03627,-77.0499
1,09/13/2020 12:26:58 AM,1,Crime Against Society,All Other Offenses,OBSTRUCT GOVT - VIOLATION OF A COURT ORDER,GERMANTOWN,25600 BLK COLTRANE DR,DAMASCUS,20872.0,MCPD,Parking Lot - Residential,COLTRANE,DR,08/08/2020 05:10:00 PM,08/08/2020 12:00:00 AM,39.27784,-77.2115


# Later, Columns related with the effect of crime are removed.

In [137]:
unnecessary_columns = ['Dispatch Date / Time', 'Victims', 'Block Address', 'Street Type', 'Agency', 'End_Date_Time']
for i in unnecessary_columns:
  inputdf = inputdf.drop(i, axis=1)
inputdf.head(2)

Unnamed: 0,Crime Name1,Crime Name2,Crime Name3,Police District Name,City,Zip Code,Place,Street Name,Start_Date_Time,Latitude,Longitude
0,Crime Against Property,Robbery,ROBBERY - STREET-GUN,WHEATON,SILVER SPRING,20902.0,Street - Bus Stop,GEORGIA,12/21/2018 09:13:00 PM,39.03627,-77.0499
1,Crime Against Society,All Other Offenses,OBSTRUCT GOVT - VIOLATION OF A COURT ORDER,GERMANTOWN,DAMASCUS,20872.0,Parking Lot - Residential,COLTRANE,08/08/2020 05:10:00 PM,39.27784,-77.2115


Some columns are confusing to be added now. Such as -

* Crime Name3
* Also we have to take special care of Latitude and Longitude if we want them to include in our feature set.

# Drop rows from the dataframe based on certain condition applied on a column


In [138]:
# Filter all rows for Latitude is greater than or equal to 35
inputdf = inputdf[inputdf['Latitude'] > 35.0]
# inputdf['Latitude'].hist()
# Filter all rows for Longitude is less than or equal to -70
inputdf = inputdf[inputdf['Longitude'] < -70]
# inputdf['Longitude'].hist()



# inputdf['Crime Name2'].value_counts()
# # Filter all rows for that has occurances less than 5
# inputdf = inputdf[inputdf.columns[inputdf['Crime Name2'].value_counts() > 5]]

# Creating The Location Based Crime Data Frame

In [139]:
# loc_crime_df = inputdf[['Latitude', 'Longitude', 'Crime Name1']]
loc_crime_df = inputdf[['Crime Name1','Crime Name2', 'Police District Name', 'City', 'Zip Code', 'Place', 'Street Name', 'Latitude', 'Longitude']]
dates = pd.to_datetime(inputdf['Start_Date_Time'])
  
# extract Hours from Timestamp 
# rs = dates.dt.hour
# print(rs)
loc_crime_df['dateHour'] = dates.dt.hour
loc_crime_df['day'] = dates.dt.day
loc_crime_df['month'] = dates.dt.month
loc_crime_df.head()
# print(loc_crime_df.shape)
# loc_crime_df.dtypes


Unnamed: 0,Crime Name1,Crime Name2,Police District Name,City,Zip Code,Place,Street Name,Latitude,Longitude,dateHour,day,month
0,Crime Against Property,Robbery,WHEATON,SILVER SPRING,20902.0,Street - Bus Stop,GEORGIA,39.03627,-77.0499,21,21,12
1,Crime Against Society,All Other Offenses,GERMANTOWN,DAMASCUS,20872.0,Parking Lot - Residential,COLTRANE,39.27784,-77.2115,17,8,8
2,Crime Against Society,Driving Under the Influence,ROCKVILLE,ROCKVILLE,20850.0,Street - In vehicle,GRANDIN,39.086688,-77.144754,2,3,1
3,Other,All Other Offenses,ROCKVILLE,ROCKVILLE,20850.0,Street - Other,GRANDIN,39.086688,-77.144754,2,3,1
4,Crime Against Property,Shoplifting,GERMANTOWN,GERMANTOWN,20876.0,Retail - Department/Discount Store,FREDERICK,39.198295,-77.2449,17,16,7


In [140]:
# Removing Rows on Count condition
counts = loc_crime_df['Crime Name2'].value_counts()
# print(counts)
threshold = 100
loc_crime_df = loc_crime_df.loc[loc_crime_df['Crime Name2'].isin(counts.index[counts >= threshold ])]

In [141]:
# Removing Rows on Count condition 
# Pruning garbage data
# loc_crime_df = loc_crime_df[~loc_crime_df['Crime Name2'] == "All Other Offenses"]
indexAge = loc_crime_df[ loc_crime_df['Crime Name2'] == "All Other Offenses" ].index
loc_crime_df.drop(indexAge , inplace=True)
loc_crime_df.head()
counts = loc_crime_df['Crime Name2'].value_counts()
# print(counts)


# Taking only Top n classes in a column


In [142]:
target_column = 'Crime Name1'
top_n = 2
top_classes = {}
top_classes = loc_crime_df[target_column].value_counts().nlargest(top_n).to_dict()
print(top_classes)
str_array = []
temp_df = pd.DataFrame()
for x in top_classes.keys():
    str_array.append(x)
    rows = loc_crime_df[loc_crime_df[target_column] == str (x) ]
    # print(rows)
    temp_df = temp_df.append(rows)
# print(str_array)
# temp_df

{'Crime Against Property': 140065, 'Crime Against Society': 57349}


In [143]:
loc_crime_df = temp_df
print(loc_crime_df[target_column].value_counts())

Crime Against Property    140065
Crime Against Society      57349
Name: Crime Name1, dtype: int64


# Dropping null values in Zip code

In [144]:
loc_crime_df['Zip Code'].value_counts()
# loc_crime_df['dateHour'].value_counts()
loc_crime_df = loc_crime_df.dropna(axis=0, subset=['Zip Code'])
print(loc_crime_df.shape)

(195178, 12)


In [145]:
loc_crime_df.head()

Unnamed: 0,Crime Name1,Crime Name2,Police District Name,City,Zip Code,Place,Street Name,Latitude,Longitude,dateHour,day,month
0,Crime Against Property,Robbery,WHEATON,SILVER SPRING,20902.0,Street - Bus Stop,GEORGIA,39.03627,-77.0499,21,21,12
4,Crime Against Property,Shoplifting,GERMANTOWN,GERMANTOWN,20876.0,Retail - Department/Discount Store,FREDERICK,39.198295,-77.2449,17,16,7
10,Crime Against Property,Burglary/Breaking and Entering,BETHESDA,BETHESDA,20816.0,Retail - Drug Store/Pharmacy,MAC ARTHUR,38.96562,-77.139,3,21,8
13,Crime Against Property,Shoplifting,BETHESDA,BETHESDA,20817.0,Retail - Mall,DEMOCRACY,39.022077,-77.147376,16,7,2
17,Crime Against Property,Shoplifting,ROCKVILLE,ROCKVILLE,20850.0,Retail - Department/Discount Store,HUNGERFORD,39.092059,-77.153305,18,10,9


# Transform the data by label encoding

In [151]:
# encode class values as integers
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform( loc_crime_df[target_column])
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y = np_utils.to_categorical(encoded_Y)

In [152]:
encoded_dict ={}
temp_df_2 = pd.DataFrame()

def label_encoder(y):
    le = LabelEncoder()
    if str(y) == target_column:
        print(loc_crime_df[y])
    loc_crime_df[y] = le.fit_transform(loc_crime_df[y])
    if str(y) == target_column:
        print(loc_crime_df[y])
    if str(y) == target_column:
        temp_df_2[y] = le.inverse_transform(loc_crime_df[y])
        # encoded_dict = {loc_crime_df[y], temp_df_2[y]} 
    #print(temp_df[y])

#loc_crime_df = inputdf[['Crime Name1','Crime Name2', 'Police District Name', 'City', 'Place', 'Street Name']]

label_list = ['Crime Name2', 'Police District Name', 'City', 'Place', 'Street Name']


# label_list = ['Crime Name1','Crime Name2', 'Police District Name', 'City', 'Place', 'Street Name']
#label_list = ['Crime Name1']

for l in label_list:
    label_encoder(l)

# print(encoded_dict)
 
#Display transformed data
loc_crime_df.head()

Unnamed: 0,Crime Name1,Crime Name2,Police District Name,City,Zip Code,Place,Street Name,Latitude,Longitude,dateHour,day,month
0,Crime Against Property,25,8,34,20902.0,90,2671,39.03627,-77.0499,21,21,12
4,Crime Against Property,26,2,19,20876.0,75,2547,39.198295,-77.2449,17,16,7
10,Crime Against Property,3,0,5,20816.0,76,4080,38.96562,-77.139,3,21,8
13,Crime Against Property,26,0,5,20817.0,80,1891,39.022077,-77.147376,16,7,2
17,Crime Against Property,26,5,32,20850.0,75,3365,39.092059,-77.153305,18,10,9


In [156]:
# Going for specigic prediction without geolocation
# X = loc_crime_df.drop([target_column],axis=1)
# y = loc_crime_df[target_column]

# Going for specigic prediction without geolocation
# X = loc_crime_df.drop([target_column, 'Latitude', 'Longitude'],axis=1)
# y = loc_crime_df[target_column]


# Going for generic prediction
X = loc_crime_df.drop([target_column,'Crime Name2', 'Latitude', 'Longitude'],axis=1)
# y = loc_crime_df[[target_column, 'Latitude', 'Longitude']]
y = loc_crime_df[['Latitude', 'Longitude']], dummy_y


print(X)
print(y)

        Police District Name  City  Zip Code  Place  Street Name  dateHour  \
0                          8    34   20902.0     90         2671        21   
4                          2    19   20876.0     75         2547        17   
10                         0     5   20816.0     76         4080         3   
13                         0     5   20817.0     80         1891        16   
17                         5    32   20850.0     75         3365        18   
...                      ...   ...       ...    ...          ...       ...   
312177                     3    17   20877.0     91         2547         2   
312211                     2    19   20876.0     27         4798         1   
312229                     2    17   20882.0     92         7271        10   
312260                     8    34   20906.0     32         3830        16   
312287                     7    36   20912.0     94         3974         9   

        day  month  
0        21     12  
4        16      7  


# Split the data into training and testing set


In [154]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,
                                               random_state=42, shuffle = True) 

#Data was splitted as 80% train data and 20% test data.

# y_train = y_train.values.reshape(-1,1)
# y_test = y_test.values.reshape(-1,1)

print("X_train shape:",X_train.shape)
print("X_test shape:",X_test.shape)
print("y_train shape:",y_train.shape)
print("y_test shape:",y_test.shape)

X_train shape: (156142, 8)
X_test shape: (39036, 8)
y_train shape: (156142, 3)
y_test shape: (39036, 3)


In [155]:

# Md Hasan Shahriar
# Tue, Dec 6, 6:51 PM 


feature_number = 8
output_dim = top_n+2


model = keras.Sequential(
    [
        keras.Input(shape=feature_number),
        layers.Dense(100, activation="relu"),
        layers.Dense(50, activation="relu"),
        layers.Dense(output_dim, activation="softmax"),
    ]
)

model.summary()


batch_size = 128
epochs = 15

model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.1)

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_15 (Dense)            (None, 100)               900       
                                                                 
 dense_16 (Dense)            (None, 50)                5050      
                                                                 
 dense_17 (Dense)            (None, 3)                 153       
                                                                 
Total params: 6,103
Trainable params: 6,103
Non-trainable params: 0
_________________________________________________________________


ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type float).

# Evaluate the trained model


In [None]:
score = model.evaluate(X_test, y_test, verbose=0)
print("Test loss:", score[0])
print("Test accuracy:", score[1])

In [None]:
# # Model / data parameters
# num_classes = 10
# input_shape = (28, 28, 1)



# # Load the data and split it between train and test sets
# (x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()

# # Scale images to the [0, 1] range
# x_train = x_train.astype("float32") / 255
# x_test = x_test.astype("float32") / 255
# # Make sure images have shape (28, 28, 1)
# x_train = np.expand_dims(x_train, -1)
# x_test = np.expand_dims(x_test, -1)
# print("x_train shape:", x_train.shape)
# print(x_train.shape[0], "train samples")
# print(x_test.shape[0], "test samples")


# # convert class vectors to binary class matrices
# y_train = keras.utils.to_categorical(y_train, num_classes)
# y_test = keras.utils.to_categorical(y_test, num_classes)

In [None]:
print(inputdf.shape)
# 'column names are:' 
inputdf.columns
inputdf.dtypes

# Build the model


In [None]:
model = keras.Sequential(
    [
        keras.Input(shape=input_shape),
        layers.Conv2D(32, kernel_size=(3, 3), activation="relu"),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Conv2D(64, kernel_size=(3, 3), activation="relu"),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Flatten(),
        layers.Dropout(0.5),
        layers.Dense(num_classes, activation="softmax"),
    ]
)

model.summary()

# Train the model


In [None]:
batch_size = 128
epochs = 15

model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.1)