# Setup

In [77]:
import numpy as np
import pandas as pd
from tensorflow import keras
from keras import datasets, layers, models
import warnings
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


warnings.filterwarnings("ignore")


# Prepare the data

In [78]:

# Load the crime records from the CSV file in the working directory.
crime_csv_path = 'Crime.csv'
inputdf = pd.read_csv(crime_csv_path)
# Sneak peek at the first two rows of the data.
inputdf.head(2)

Unnamed: 0,Incident ID,Offence Code,CR Number,Dispatch Date / Time,NIBRS Code,Victims,Crime Name1,Crime Name2,Crime Name3,Police District Name,...,Street Prefix,Street Name,Street Suffix,Street Type,Start_Date_Time,End_Date_Time,Latitude,Longitude,Police District Number,Location
0,201219928,1204,180063894,12/21/2018 09:13:13 PM,120,1,Crime Against Property,Robbery,ROBBERY - STREET-GUN,WHEATON,...,,GEORGIA,,AVE,12/21/2018 09:13:00 PM,,39.03627,-77.0499,4D,"(39.0363, -77.0499)"
1,201301798,5016,200035833,09/13/2020 12:26:58 AM,90Z,1,Crime Against Society,All Other Offenses,OBSTRUCT GOVT - VIOLATION OF A COURT ORDER,GERMANTOWN,...,,COLTRANE,,DR,08/08/2020 05:10:00 PM,08/08/2020 12:00:00 AM,39.27784,-77.2115,5D,"(39.2778, -77.2115)"


Analyzing the dataset, we observe that the following columns are unnecessary, so they are removed:


* Incident ID
* Offence Code
* CR Number
* NIBRS Code
* State (all values should be MD)
* Sector, Beat, PRA (meaningless)
* Address Number (already included in "Block Address")
* Street Prefix (lots of missing values)
* Street Suffix (lots of missing values)
* Police District Number (meaningless)
* Location (just the pair of Latitude and Longitude)

In [79]:
# Remove the columns listed above in one call instead of one drop per column.
unnecessary_columns = [
    'Incident ID', 'Offence Code', 'CR Number', 'NIBRS Code',
    'State', 'Sector', 'Beat', 'PRA',
    'Address Number', 'Street Prefix', 'Street Suffix',
    'Police District Number', 'Location',
]
inputdf = inputdf.drop(columns=unnecessary_columns)
inputdf.head(2)

Unnamed: 0,Dispatch Date / Time,Victims,Crime Name1,Crime Name2,Crime Name3,Police District Name,Block Address,City,Zip Code,Agency,Place,Street Name,Street Type,Start_Date_Time,End_Date_Time,Latitude,Longitude
0,12/21/2018 09:13:13 PM,1,Crime Against Property,Robbery,ROBBERY - STREET-GUN,WHEATON,11100 BLK GEORGIA AVE,SILVER SPRING,20902.0,MCPD,Street - Bus Stop,GEORGIA,AVE,12/21/2018 09:13:00 PM,,39.03627,-77.0499
1,09/13/2020 12:26:58 AM,1,Crime Against Society,All Other Offenses,OBSTRUCT GOVT - VIOLATION OF A COURT ORDER,GERMANTOWN,25600 BLK COLTRANE DR,DAMASCUS,20872.0,MCPD,Parking Lot - Residential,COLTRANE,DR,08/08/2020 05:10:00 PM,08/08/2020 12:00:00 AM,39.27784,-77.2115


# Next, columns related to the effect of the crime are removed.

In [80]:
# Columns that are removed in this second pass, dropped with a single call.
unnecessary_columns = [
    'Dispatch Date / Time', 'Victims', 'Block Address',
    'Street Type', 'Agency', 'End_Date_Time',
]
inputdf = inputdf.drop(columns=unnecessary_columns)
inputdf.head(2)

Unnamed: 0,Crime Name1,Crime Name2,Crime Name3,Police District Name,City,Zip Code,Place,Street Name,Start_Date_Time,Latitude,Longitude
0,Crime Against Property,Robbery,ROBBERY - STREET-GUN,WHEATON,SILVER SPRING,20902.0,Street - Bus Stop,GEORGIA,12/21/2018 09:13:00 PM,39.03627,-77.0499
1,Crime Against Society,All Other Offenses,OBSTRUCT GOVT - VIOLATION OF A COURT ORDER,GERMANTOWN,DAMASCUS,20872.0,Parking Lot - Residential,COLTRANE,08/08/2020 05:10:00 PM,39.27784,-77.2115


Some columns are hard to decide on for now, such as:

* Crime Name3
* Latitude and Longitude, which need special care if we want to include them in our feature set.

# Drop rows from the dataframe based on a condition applied to a column


In [81]:
# Keep only rows with Latitude strictly greater than 35.0 and Longitude
# strictly less than -70; every other row (including rows whose coordinates
# are missing, since NaN comparisons are False) is discarded.
latitude_ok = inputdf['Latitude'] > 35.0
longitude_ok = inputdf['Longitude'] < -70
inputdf = inputdf[latitude_ok & longitude_ok]
# Optional diagnostics:
# inputdf['Latitude'].hist()
# inputdf['Longitude'].hist()



# inputdf['Crime Name2'].value_counts()
# # Filter all rows for that has occurances less than 5
# inputdf = inputdf[inputdf.columns[inputdf['Crime Name2'].value_counts() > 5]]

# Creating The Location Based Crime Data Frame

In [82]:
# Build the modelling frame from the crime-type and location columns.
# .copy() makes loc_crime_df an independent frame: without it the column
# assignments below write into a slice of inputdf, which triggers pandas'
# SettingWithCopyWarning (hidden here by the global warnings filter).
loc_crime_df = inputdf[['Crime Name1', 'Crime Name2', 'Police District Name', 'City',
                        'Zip Code', 'Place', 'Street Name', 'Latitude', 'Longitude']].copy()
dates = pd.to_datetime(inputdf['Start_Date_Time'])

# Derive hour-of-day, day-of-month and month features from the start timestamp.
loc_crime_df['dateHour'] = dates.dt.hour
loc_crime_df['day'] = dates.dt.day
loc_crime_df['month'] = dates.dt.month
loc_crime_df.head()
# print(loc_crime_df.shape)
# loc_crime_df.dtypes


Unnamed: 0,Crime Name1,Crime Name2,Police District Name,City,Zip Code,Place,Street Name,Latitude,Longitude,dateHour,day,month
0,Crime Against Property,Robbery,WHEATON,SILVER SPRING,20902.0,Street - Bus Stop,GEORGIA,39.03627,-77.0499,21,21,12
1,Crime Against Society,All Other Offenses,GERMANTOWN,DAMASCUS,20872.0,Parking Lot - Residential,COLTRANE,39.27784,-77.2115,17,8,8
2,Crime Against Society,Driving Under the Influence,ROCKVILLE,ROCKVILLE,20850.0,Street - In vehicle,GRANDIN,39.086688,-77.144754,2,3,1
3,Other,All Other Offenses,ROCKVILLE,ROCKVILLE,20850.0,Street - Other,GRANDIN,39.086688,-77.144754,2,3,1
4,Crime Against Property,Shoplifting,GERMANTOWN,GERMANTOWN,20876.0,Retail - Department/Discount Store,FREDERICK,39.198295,-77.2449,17,16,7


In [83]:
# Keep only the 'Crime Name2' categories that occur at least `threshold` times.
threshold = 100
counts = loc_crime_df['Crime Name2'].value_counts()
frequent_categories = counts[counts >= threshold].index
loc_crime_df = loc_crime_df[loc_crime_df['Crime Name2'].isin(frequent_categories)]

In [84]:
# Prune the catch-all "All Other Offenses" category, then print the remaining
# per-category row counts.
is_catch_all = loc_crime_df['Crime Name2'] == "All Other Offenses"
catch_all_index = loc_crime_df[is_catch_all].index
loc_crime_df = loc_crime_df.drop(index=catch_all_index)
counts = loc_crime_df['Crime Name2'].value_counts()
print(counts)


Theft From Motor Vehicle                       26252
Drug/Narcotic Violations                       22573
Simple Assault                                 21064
Destruction/Damage/Vandalism of Property       18967
Shoplifting                                    16416
All other Larceny                              13972
Driving Under the Influence                    13723
Theft from Building                            11335
Burglary/Breaking and Entering                  8784
Identity Theft                                  8633
Theft of Motor Vehicle Parts or Accessories     6747
Liquor Law Violations                           6565
Motor Vehicle Theft                             6489
False Pretenses/Swindle/Confidence Game         6438
Disorderly Conduct                              4980
Credit Card/Automatic Teller Machine Fraud      4726
Aggravated Assault                              4313
Trespass of Real Property                       3939
Runaway                                       

# Taking only Top n classes in a column


In [85]:
target_column = 'Crime Name1'
top_n = 2
# Mapping class label -> row count for the top_n most frequent target classes,
# ordered from most to least frequent.
top_classes = loc_crime_df[target_column].value_counts().nlargest(top_n).to_dict()
print(top_classes)
# Class labels in the same (descending frequency) order.
str_array = list(top_classes)
# Collect the rows of each selected class and concatenate them once.
# DataFrame.append was removed in pandas 2.0, and appending inside a loop
# re-copies the accumulated frame on every iteration.
class_frames = [loc_crime_df[loc_crime_df[target_column] == label] for label in str_array]
# pd.concat rejects an empty list, so fall back to an empty frame with the same columns.
temp_df = pd.concat(class_frames) if class_frames else loc_crime_df.iloc[0:0]
print(str_array)
temp_df

{'Crime Against Property': 140065, 'Crime Against Society': 57349}
['Crime Against Property', 'Crime Against Society']


Unnamed: 0,Crime Name1,Crime Name2,Police District Name,City,Zip Code,Place,Street Name,Latitude,Longitude,dateHour,day,month
0,Crime Against Property,Robbery,WHEATON,SILVER SPRING,20902.0,Street - Bus Stop,GEORGIA,39.036270,-77.049900,21,21,12
4,Crime Against Property,Shoplifting,GERMANTOWN,GERMANTOWN,20876.0,Retail - Department/Discount Store,FREDERICK,39.198295,-77.244900,17,16,7
10,Crime Against Property,Burglary/Breaking and Entering,BETHESDA,BETHESDA,20816.0,Retail - Drug Store/Pharmacy,MAC ARTHUR,38.965620,-77.139000,3,21,8
13,Crime Against Property,Shoplifting,BETHESDA,BETHESDA,20817.0,Retail - Mall,DEMOCRACY,39.022077,-77.147376,16,7,2
17,Crime Against Property,Shoplifting,ROCKVILLE,ROCKVILLE,20850.0,Retail - Department/Discount Store,HUNGERFORD,39.092059,-77.153305,18,10,9
...,...,...,...,...,...,...,...,...,...,...,...,...
312177,Crime Against Society,Driving Under the Influence,MONTGOMERY VILLAGE,GAITHERSBURG,20877.0,Street - Commercial,FREDERICK,39.144990,-77.205500,2,28,8
312211,Crime Against Society,Trespass of Real Property,GERMANTOWN,GERMANTOWN,20876.0,Hospital/Emergency Care Center,OBSERVATION,39.180560,-77.243900,1,30,8
312229,Crime Against Society,Driving Under the Influence,GERMANTOWN,GAITHERSBURG,20882.0,Street - In vehicle,WOODFIELD,39.246760,-77.191000,10,29,8
312260,Crime Against Society,Liquor Law Violations,WHEATON,SILVER SPRING,20906.0,Laundromat,LAYHILL,39.061240,-77.051200,16,30,8


In [86]:
# Continue with only the rows of the selected top classes and print how many
# rows each of those classes has.
loc_crime_df = temp_df
target_counts = loc_crime_df[target_column].value_counts()
print(target_counts)

Crime Against Property    140065
Crime Against Society      57349
Name: Crime Name1, dtype: int64


# Dropping null values in Zip code

In [87]:
# Keep only the rows that have a Zip Code, then report the remaining shape.
has_zip_code = loc_crime_df['Zip Code'].notna()
loc_crime_df = loc_crime_df[has_zip_code]
print(loc_crime_df.shape)

(195178, 12)


In [88]:
# Preview the first rows of the frame after the rows without a Zip Code were dropped.
loc_crime_df.head()

Unnamed: 0,Crime Name1,Crime Name2,Police District Name,City,Zip Code,Place,Street Name,Latitude,Longitude,dateHour,day,month
0,Crime Against Property,Robbery,WHEATON,SILVER SPRING,20902.0,Street - Bus Stop,GEORGIA,39.03627,-77.0499,21,21,12
4,Crime Against Property,Shoplifting,GERMANTOWN,GERMANTOWN,20876.0,Retail - Department/Discount Store,FREDERICK,39.198295,-77.2449,17,16,7
10,Crime Against Property,Burglary/Breaking and Entering,BETHESDA,BETHESDA,20816.0,Retail - Drug Store/Pharmacy,MAC ARTHUR,38.96562,-77.139,3,21,8
13,Crime Against Property,Shoplifting,BETHESDA,BETHESDA,20817.0,Retail - Mall,DEMOCRACY,39.022077,-77.147376,16,7,2
17,Crime Against Property,Shoplifting,ROCKVILLE,ROCKVILLE,20850.0,Retail - Department/Discount Store,HUNGERFORD,39.092059,-77.153305,18,10,9


# Transform the data by label encoding

In [89]:
# Column name -> {integer code: original label}; filled in by label_encoder()
# so that encoded values can be translated back after the encoders are gone.
encoded_dict = {}
# Holds the decoded target column (result of inverse_transform on the encoded target).
temp_df_2 = pd.DataFrame()

def label_encoder(y):
    """Label-encode column `y` of the global `loc_crime_df` in place.

    Parameters
    ----------
    y : str
        Name of the column in `loc_crime_df` to replace with integer codes.

    Side effects
    ------------
    * `loc_crime_df[y]` is overwritten with the codes from a fresh LabelEncoder.
    * `encoded_dict[y]` records the code -> label mapping of that encoder
      (previously the mapping was lost when the function returned).
    * When `y` is the global `target_column`, the column is printed before and
      after encoding and the decoded labels are stored in `temp_df_2[y]`.

    Returns
    -------
    None
    """
    le = LabelEncoder()
    is_target = str(y) == target_column
    if is_target:
        print(loc_crime_df[y])
    loc_crime_df[y] = le.fit_transform(loc_crime_df[y])
    # le.classes_[code] is the original label for `code`.
    encoded_dict[y] = dict(enumerate(le.classes_))
    if is_target:
        print(loc_crime_df[y])
        temp_df_2[y] = le.inverse_transform(loc_crime_df[y])

#loc_crime_df = inputdf[['Crime Name1','Crime Name2', 'Police District Name', 'City', 'Place', 'Street Name']]

# Text columns that are replaced by integer codes via label_encoder().
label_list = [
    'Crime Name1',
    'Crime Name2',
    'Police District Name',
    'City',
    'Place',
    'Street Name',
]

for column_name in label_list:
    label_encoder(column_name)

# Display transformed data
loc_crime_df.head()

0         Crime Against Property
4         Crime Against Property
10        Crime Against Property
13        Crime Against Property
17        Crime Against Property
                   ...          
312177     Crime Against Society
312211     Crime Against Society
312229     Crime Against Society
312260     Crime Against Society
312287     Crime Against Society
Name: Crime Name1, Length: 195178, dtype: object
0         0
4         0
10        0
13        0
17        0
         ..
312177    1
312211    1
312229    1
312260    1
312287    1
Name: Crime Name1, Length: 195178, dtype: int32


Unnamed: 0,Crime Name1,Crime Name2,Police District Name,City,Zip Code,Place,Street Name,Latitude,Longitude,dateHour,day,month
0,0,25,8,34,20902.0,90,2671,39.03627,-77.0499,21,21,12
4,0,26,2,19,20876.0,75,2547,39.198295,-77.2449,17,16,7
10,0,3,0,5,20816.0,76,4080,38.96562,-77.139,3,21,8
13,0,26,0,5,20817.0,80,1891,39.022077,-77.147376,16,7,2
17,0,26,5,32,20850.0,75,3365,39.092059,-77.153305,18,10,9


In [90]:
# Alternative feature/target definitions (inactive):
#
#   Specific prediction, geolocation kept among the features:
#       X = loc_crime_df.drop([target_column],axis=1)
#       y = loc_crime_df[target_column]
#
#   Specific prediction without geolocation:
#       X = loc_crime_df.drop([target_column, 'Latitude', 'Longitude'],axis=1)
#       y = loc_crime_df[target_column]

# Generic prediction (active): the features exclude both crime-name columns and
# the coordinates; the target frame holds the target column plus the coordinates.
non_feature_columns = [target_column, 'Crime Name2', 'Latitude', 'Longitude']
X = loc_crime_df.drop(columns=non_feature_columns)
y = loc_crime_df[[target_column, 'Latitude', 'Longitude']]

print(X)
print(y)

        Police District Name  City  Zip Code  Place  Street Name  dateHour  \
0                          8    34   20902.0     90         2671        21   
4                          2    19   20876.0     75         2547        17   
10                         0     5   20816.0     76         4080         3   
13                         0     5   20817.0     80         1891        16   
17                         5    32   20850.0     75         3365        18   
...                      ...   ...       ...    ...          ...       ...   
312177                     3    17   20877.0     91         2547         2   
312211                     2    19   20876.0     27         4798         1   
312229                     2    17   20882.0     92         7271        10   
312260                     8    34   20906.0     32         3830        16   
312287                     7    36   20912.0     94         3974         9   

        day  month  
0        21     12  
4        16      7  


# Split the data into training and testing set


In [None]:
# Shuffle and split the data: 80% for training, 20% for testing.
# random_state is fixed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

# Report the shape of each part of the split.
for label, part in (
    ("X_train shape:", X_train),
    ("X_test shape:", X_test),
    ("y_train shape:", y_train),
    ("y_test shape:", y_test),
):
    print(label, part.shape)

In [91]:

# Md Hasan Shahriar
# Tue, Dec 6, 6:51 PM 

# Smoke test of a small dense network on synthetic data (not the crime data).
# NOTE(review): this rebinds y_train, which the train/test split cell above
# also assigns - confirm that is intended before running both cells.
rng = np.random.default_rng(42)  # seeded so the synthetic data is reproducible
x_train = rng.random((10000, 46))
# categorical_crossentropy expects every target row to be a probability
# distribution over the 10 classes, so draw random class ids and one-hot
# encode them instead of using uniform noise.
y_train = np.eye(10)[rng.integers(0, 10, size=10000)]


model = keras.Sequential(
    [
        # shape must be a tuple of per-sample dimensions (batch size excluded).
        keras.Input(shape=(46,)),
        layers.Dense(100, activation="relu"),
        layers.Dense(50, activation="relu"),
        layers.Dense(10, activation="softmax"),
    ]
)

model.summary()


batch_size = 128
epochs = 15

model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

# The last 10% of the samples are held out for validation during training.
model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.1)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 100)               4700      
                                                                 
 dense_4 (Dense)             (None, 50)                5050      
                                                                 
 dense_5 (Dense)             (None, 10)                510       
                                                                 
Total params: 10,260
Trainable params: 10,260
Non-trainable params: 0
_________________________________________________________________
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x1fe3768b6d0>

In [92]:
# # Model / data parameters
# num_classes = 10
# input_shape = (28, 28, 1)



# # Load the data and split it between train and test sets
# (x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()

# # Scale images to the [0, 1] range
# x_train = x_train.astype("float32") / 255
# x_test = x_test.astype("float32") / 255
# # Make sure images have shape (28, 28, 1)
# x_train = np.expand_dims(x_train, -1)
# x_test = np.expand_dims(x_test, -1)
# print("x_train shape:", x_train.shape)
# print(x_train.shape[0], "train samples")
# print(x_test.shape[0], "test samples")


# # convert class vectors to binary class matrices
# y_train = keras.utils.to_categorical(y_train, num_classes)
# y_test = keras.utils.to_categorical(y_test, num_classes)

In [93]:
# Print the (rows, columns) shape of the cleaned frame, then display the dtype
# of every column (the dtypes listing also shows the column names).
frame_shape = inputdf.shape
print(frame_shape)
inputdf.dtypes

(304175, 11)


Crime Name1              object
Crime Name2              object
Crime Name3              object
Police District Name     object
City                     object
Zip Code                float64
Place                    object
Street Name              object
Start_Date_Time          object
Latitude                float64
Longitude               float64
dtype: object

# Build the model


In [94]:
# Convolutional classifier: two Conv2D + MaxPooling2D stages, then Flatten,
# Dropout(0.5) and a softmax layer with `num_classes` outputs.
# NOTE(review): `input_shape` and `num_classes` are only assigned in the
# commented-out cell above, so this cell raises NameError as the notebook stands
# (see the captured output). Running it successfully would also rebind `model`,
# which the training cell below uses - decide whether this cell should be kept.
model = keras.Sequential(
    [
        keras.Input(shape=input_shape),
        layers.Conv2D(32, kernel_size=(3, 3), activation="relu"),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Conv2D(64, kernel_size=(3, 3), activation="relu"),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Flatten(),
        layers.Dropout(0.5),
        layers.Dense(num_classes, activation="softmax"),
    ]
)

model.summary()

NameError: name 'input_shape' is not defined

# Train the model


In [None]:
# Training hyper-parameters.
batch_size, epochs = 128, 15

# Categorical cross-entropy loss, Adam optimizer, accuracy as the reported metric.
model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

# validation_split=0.1 holds out 10% of the training data for validation.
model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
)

# Evaluate the trained model


In [None]:
# Evaluate silently on the test data; score[0] is the loss and score[1] the
# accuracy metric the model was compiled with.
score = model.evaluate(x_test, y_test, verbose=0)
test_loss, test_accuracy = score[0], score[1]
print("Test loss:", test_loss)
print("Test accuracy:", test_accuracy)

# Next, columns related to the effect of the crime are removed.

In [None]:
# Remove the columns related to the effect of the crime.
# An earlier cell of this notebook drops the same columns, so when the notebook
# is run top to bottom they are already gone; errors="ignore" makes this cell
# safe to run (and re-run) instead of raising KeyError on the missing labels.
unnecessary_columns = ['Dispatch Date / Time', 'Victims', 'Block Address', 'Street Type', 'Agency', 'End_Date_Time']
inputdf = inputdf.drop(columns=unnecessary_columns, errors="ignore")
inputdf.head(2)

Some columns are hard to decide on for now, such as:

* Crime Name3
* Zip Code
* Street Name
* Latitude and Longitude, which need special care if we want to include them in our feature set.

# Now we have to visualize the data and decide what to predict from this modified feature set

In [None]:
# Candidate map sources (Montgomery County maintained roads / all roads):
# https://data.imap.maryland.gov/datasets/maryland::montgomery-county-maintained-roads/explore?location=39.168559%2C-76.905435%2C9.35
# https://catalog.data.gov/dataset/tiger-line-shapefile-2018-county-montgomery-county-md-all-roads-county-based-shapefile
# crs = {'init':'EPSG:4326'}
# Read the local shapefile, print the CRS it was stored in, then plot it
# re-projected to EPSG:4326.
# NOTE(review): the file read here is 'Zipcodes.shp', not one of the road
# datasets linked above - confirm which source it came from. `gpd` is not
# imported anywhere in the visible notebook; presumably `import geopandas as gpd`.
montgomery_county_map = gpd.read_file('Zipcodes.shp')
print(montgomery_county_map.crs) 
montgomery_county_map.to_crs(epsg=4326).plot()

# Other sources for multiple data view
# https://montgomeryplanning.org/tools/gis-and-mapping/map-library/

# Drop rows from the dataframe based on a condition applied to a column


In [None]:
# Keep only the rows whose Latitude is strictly greater than 35.0
# (rows with a missing Latitude are dropped too, since NaN > 35.0 is False).
inputdf = inputdf[inputdf['Latitude'] > 35.0]
# inputdf['Latitude'].hist()

In [None]:
# Keep only the rows whose Longitude is strictly less than -70
# (rows with a missing Longitude are dropped too, since NaN < -70 is False).
inputdf = inputdf[inputdf['Longitude'] < -70]
# inputdf['Longitude'].hist()

In [None]:
# inputdf['Crime Name2'].value_counts()
# # Filter all rows for that has occurances less than 5
# inputdf = inputdf[inputdf.columns[inputdf['Crime Name2'].value_counts() > 5]]

In [None]:
# Display how many rows each 'Crime Name2' category has, most frequent first.
inputdf['Crime Name2'].value_counts()

# Export Pandas DataFrame to CSV


In [None]:

# inputdf.to_csv('potential feature set revised.csv')


# Create `geo_df`, a copy of our original data frame with a newly created `geometry` column.


In [None]:
# Build a GeoDataFrame whose geometry column holds one point per incident.
# The CRS is given as an authority string: the {'init': 'EPSG:4326'} dict form
# is deprecated. points_from_xy builds the points in one vectorized call and
# removes the need for shapely's Point, which this notebook never imports.
crs = "EPSG:4326"
geometry = gpd.points_from_xy(inputdf['Longitude'], inputdf['Latitude'])  # x = longitude, y = latitude
geo_df = gpd.GeoDataFrame(inputdf,
                          crs=crs,
                          geometry=geometry)

geo_df.head()

In [None]:
# Quick look at the incident points on their own, with default plot settings.
geo_df.plot()

# Visualizing the Crime Data

In [None]:
# Draw the county map in light grey as a backdrop, then overlay the incident
# points coloured by their 'Crime Name1' value.
fig, ax = plt.subplots(figsize=(20, 20))
base_map = montgomery_county_map.to_crs(epsg=4326)
base_map.plot(ax=ax, color='lightgrey')
geo_df.plot(
    column=geo_df['Crime Name1'],
    ax=ax,
    cmap='rainbow',
    legend=True,  # legend_kwds={'shrink': 0.3},
    markersize=10,
)
ax.set_title('Montgomery County Crime type data Heatmap')
# plt.savefig('Crime Heat Map for Location')

In [None]:
#geo_df['Crime Name1'].hist()
# Per-category row counts of 'Crime Name2'; this is the only value the cell displays.
crime_name2_counts = geo_df['Crime Name2'].value_counts()
crime_name2_counts
# geo_df.groupby('Crime Name2').count() # per group data count


#geo_df['Latitude'].hist()

# Separating Train and Test Set for Crime Prediction for Location

In [None]:
# Print how many rows each 'Crime Name1' class has, to see the class balance.
print(inputdf['Crime Name1'].value_counts())

# Creating The Location Based Crime Data Frame

In [None]:
# loc_crime_df = inputdf[['Latitude', 'Longitude', 'Crime Name1']]
# Build the modelling frame from the crime-type and location columns.
# .copy() makes loc_crime_df an independent frame: without it the column
# assignments below write into a slice of inputdf, which triggers pandas'
# SettingWithCopyWarning (hidden here by the global warnings filter).
loc_crime_df = inputdf[['Crime Name1', 'Crime Name2', 'Police District Name', 'City',
                        'Zip Code', 'Place', 'Street Name', 'Latitude', 'Longitude']].copy()
dates = pd.to_datetime(inputdf['Start_Date_Time'])

# Derive hour-of-day, day-of-month and month features from the start timestamp.
loc_crime_df['dateHour'] = dates.dt.hour
loc_crime_df['day'] = dates.dt.day
loc_crime_df['month'] = dates.dt.month
loc_crime_df.head()
# print(loc_crime_df.shape)
# loc_crime_df.dtypes


In [None]:
 # Removing Rows on Count condition
# Keep only the 'Crime Name2' categories that occur at least `threshold` times.
threshold = 100
counts = loc_crime_df['Crime Name2'].value_counts()
frequent_categories = counts[counts >= threshold].index
loc_crime_df = loc_crime_df[loc_crime_df['Crime Name2'].isin(frequent_categories)]



In [None]:
 # Removing Rows on Count condition 
 # Pruning garbage data
# loc_crime_df = loc_crime_df[~loc_crime_df['Crime Name2'] == "All Other Offenses"]
# Drop every row of the catch-all "All Other Offenses" category and recompute
# the per-category counts.
is_catch_all = loc_crime_df['Crime Name2'] == "All Other Offenses"
catch_all_index = loc_crime_df[is_catch_all].index
loc_crime_df = loc_crime_df.drop(index=catch_all_index)
counts = loc_crime_df['Crime Name2'].value_counts()
# print(counts)


# Taking only Top n classes in a column


In [None]:
target_column = 'Crime Name2'
top_n = 5
# Mapping class label -> row count for the top_n most frequent target classes,
# ordered from most to least frequent.
top_classes = loc_crime_df[target_column].value_counts().nlargest(top_n).to_dict()
print(top_classes)
# Class labels in the same (descending frequency) order.
str_array = list(top_classes)
# Collect the rows of each selected class and concatenate them once.
# DataFrame.append was removed in pandas 2.0, and appending inside a loop
# re-copies the accumulated frame on every iteration.
class_frames = [loc_crime_df[loc_crime_df[target_column] == label] for label in str_array]
# pd.concat rejects an empty list, so fall back to an empty frame with the same columns.
temp_df = pd.concat(class_frames) if class_frames else loc_crime_df.iloc[0:0]
# print(str_array)
# temp_df

In [None]:
# print(temp_df[target_column].value_counts())
# Continue with only the rows of the selected top classes and print how many
# rows each of those classes has.
loc_crime_df = temp_df
target_counts = loc_crime_df[target_column].value_counts()
print(target_counts)

# Dropping null values in Zip code

In [None]:
# Display how many rows each Zip Code has (value_counts skips missing values by default).
loc_crime_df['Zip Code'].value_counts()
# loc_crime_df['dateHour'].value_counts()

In [None]:
# Keep only the rows that have a Zip Code, then report the remaining shape.
has_zip_code = loc_crime_df['Zip Code'].notna()
loc_crime_df = loc_crime_df[has_zip_code]
print(loc_crime_df.shape)

In [None]:
# loc_crime_df['Crime Name2'].count()
# loc_crime_df['Crime Name2'].value_counts()

# Transform the data by label encoding

In [None]:
# Column name -> {integer code: original label}; filled in by label_encoder()
# so that encoded values can be translated back after the encoders are gone.
encoded_dict = {}
# Holds the decoded target column (result of inverse_transform on the encoded target).
temp_df_2 = pd.DataFrame()

def label_encoder(y):
    """Label-encode column `y` of the global `loc_crime_df` in place.

    Parameters
    ----------
    y : str
        Name of the column in `loc_crime_df` to replace with integer codes.

    Side effects
    ------------
    * `loc_crime_df[y]` is overwritten with the codes from a fresh LabelEncoder.
    * `encoded_dict[y]` records the code -> label mapping of that encoder
      (previously the mapping was lost when the function returned).
    * When `y` is the global `target_column`, the column is printed before and
      after encoding and the decoded labels are stored in `temp_df_2[y]`.

    Returns
    -------
    None
    """
    le = LabelEncoder()
    is_target = str(y) == target_column
    if is_target:
        print(loc_crime_df[y])
    loc_crime_df[y] = le.fit_transform(loc_crime_df[y])
    # le.classes_[code] is the original label for `code`.
    encoded_dict[y] = dict(enumerate(le.classes_))
    if is_target:
        print(loc_crime_df[y])
        temp_df_2[y] = le.inverse_transform(loc_crime_df[y])

#loc_crime_df = inputdf[['Crime Name1','Crime Name2', 'Police District Name', 'City', 'Place', 'Street Name']]

# Text columns that are replaced by integer codes via label_encoder().
label_list = [
    'Crime Name1',
    'Crime Name2',
    'Police District Name',
    'City',
    'Place',
    'Street Name',
]

for column_name in label_list:
    label_encoder(column_name)

# Display transformed data
loc_crime_df.head()

In [None]:
# print(loc_crime_df['Crime Name1'].value_counts())
# print(temp_df['Crime Name1'].value_counts())

# Divide the dataset into independent and dependent variables


In [None]:
# Specific prediction (active): every column except the target is a feature,
# including Latitude and Longitude.
# NOTE(review): with target_column = 'Crime Name2' the features still contain
# 'Crime Name1' - confirm that keeping it as a predictor is intended.
X = loc_crime_df.drop(columns=[target_column])
y = loc_crime_df[target_column]

# Alternative definitions (inactive):
#
#   Specific prediction without geolocation:
#       X = loc_crime_df.drop([target_column, 'Latitude', 'Longitude'],axis=1)
#       y = loc_crime_df[target_column]
#
#   Generic prediction:
#       X = loc_crime_df.drop([target_column,'Crime Name2', 'Latitude', 'Longitude'],axis=1)
#       y = loc_crime_df[target_column]

print(X)


In [None]:
# Print how many rows each value of the target y has.
print(y.value_counts())

In [None]:
# Shuffle and split the data: 80% for training, 20% for testing.
# random_state is fixed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

# Report the shape of each part of the split.
for label, part in (
    ("X_train shape:", X_train),
    ("X_test shape:", X_test),
    ("y_train shape:", y_train),
    ("y_test shape:", y_test),
):
    print(label, part.shape)

In [None]:
# Feature scaling: learn the scaling parameters from the training features only,
# then apply that same transformation to both the training and the test features.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# We will build all five models and compare their accuracy scores.



In [None]:
# Two dictionaries collect the results of the models: one for the train
# (cross-validation) scores and one for the test scores.
result_dict_train, result_dict_test = {}, {}

# Machine Learning Models and their performance


In [None]:
# Display name -> estimator class (not instance); the evaluation loop
# instantiates each class itself.
# 'Support Vector Classifier': SVC is intentionally left out.
models = {
    'Naïve Bayes Classifier': GaussianNB,
    'Decision Tree Classifier': DecisionTreeClassifier,
    'KNN Classifier': KNeighborsClassifier,
    'Random Forest Classifier': RandomForestClassifier,
    'Logistic Regression': LogisticRegression,
}

# Silence library warnings while the models are fitted.
warnings.filterwarnings("ignore")

# for name, model in models.items():
#     print(name +" is "+ model)
    

In [None]:
# Fit every registered classifier, report its 5-fold cross-validation score on
# the training data and its accuracy / F1 scores on the test data, plot a
# row-normalized confusion matrix, and store the two headline scores.
for model_name, model_function in models.items():
    print(model_name +' is running')

    # Extra constructor arguments for specific models. Previously the Logistic
    # Regression settings were built inside the except branch and then
    # overwritten by the unconditional fallback, so they were never used.
    model_kwargs = {}
    if model_name == 'Logistic Regression':
        model_kwargs.update(solver='lbfgs', max_iter=1000)

    try:
        model = model_function(random_state=42, **model_kwargs)
    except TypeError:
        # Estimators without a random_state parameter reject the keyword.
        model = model_function(**model_kwargs)

    accuracies = cross_val_score(model, X_train, y_train, cv=5)
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)

    # Compute each headline score once and reuse it for printing and storing.
    cv_score = np.mean(accuracies)
    test_score = model.score(X_test, y_test)

    #Obtain accuracy
    print("Train Score:", cv_score)
    print("Test Score:", test_score)
    for average in ('micro', 'macro', 'weighted'):
        print(f"Test Score (F1 - {average}):", f1_score(y_test, y_pred, average=average))

    # Plot the confusion matrix normalized over the true classes (rows).
    # display_labels are applied positionally to the sorted encoded classes.
    # LabelEncoder assigns codes in sorted label order while str_array is
    # ordered by frequency, so the names are sorted to line up with the codes.
    title = "Normalized confusion matrix"
    disp = ConfusionMatrixDisplay.from_estimator(
        model,
        X_test,
        y_test,
        display_labels=sorted(str_array),
        cmap=plt.cm.Blues,
        normalize="true",
        xticks_rotation=90,
    )
    disp.ax_.set_title(title)
    print(title)

    plt.show()

    #Store results in the dictionaries
    result_dict_train['Train Score for '+model_name] = cv_score
    result_dict_test['Test Score for '+model_name] = test_score
    


In [None]:
# lengthy_models = {'Support Vector Classifier': SVC,
#         'Random Forest Classifier': RandomForestClassifier}
# for model_name, model_function in lengthy_models.items():
#     print(model_name +' is running')
#     try:
#         model = model_function(random_state = 42)
#     except:
#         model = model_function()
#     accuracies = cross_val_score(model, X_train, y_train, cv=5)
#     model.fit(X_train,y_train)
#     y_pred = model.predict(X_test)

#     #Obtain accuracy
#     print("Train Score:",np.mean(accuracies))
#     print("Test Score:",model.score(X_test,y_test))


#     # #Store results in the dictionaries
#     # result_dict_train['Train Score for '+model_name] = np.mean(accuracies)
#     # result_dict_test['Test Score for '+model_name] = model.score(X_test,y_test)

# Compare Accuracy Scores

In [None]:
def _scores_frame(scores):
    """Turn a {label: score} dict into a one-column ("Score") frame indexed by label."""
    return pd.DataFrame.from_dict(scores, orient="index", columns=["Score"])


df_result_train = _scores_frame(result_dict_train)
print(df_result_train)

df_result_test = _scores_frame(result_dict_test)
df_result_test


# Display the accuracy scores

In [None]:
import seaborn as sns

# Side-by-side bar charts: train (cross-validation) scores on the left,
# test scores on the right.
fig, ax = plt.subplots(1, 2, figsize=(10, 5))
for axis, scores in zip(ax, (df_result_train, df_result_test)):
    sns.barplot(x=scores.index, y=scores.Score, ax=axis)
# Rotate the long model-name tick labels so they stay readable.
ax[0].set_xticklabels(df_result_train.index, rotation=75)
ax[1].set_xticklabels(df_result_test.index, rotation=75)
#plt.show()
# plt.savefig('Predicting Specific Crime With Location')