# Setup

In [None]:
import numpy as np
import pandas as pd
from tensorflow import keras
from keras import datasets, layers, models
from keras.utils import np_utils
import warnings
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt



# Silence every warning for the rest of the notebook.
# NOTE(review): this also hides pandas/Keras deprecation and chained-assignment
# warnings raised by later cells — consider narrowing the filter.
warnings.filterwarnings("ignore")


# Prepare the data

In [None]:

# First, load the dataset from a local CSV file (path is relative to the working directory)
inputdf = pd.read_csv('Crime.csv')
# Sneak peek at the first two rows of the data
inputdf.head(2)

Analyzing the dataset, we observe that the following columns are unnecessary, so they are removed:


* Incident ID
* Offence Code
* CR Number
* NIBRS Code
* State (all values should be MD)
* Sector, Beat, PRA (meaningless)
* Address Number (included in "Block Address")
* Street Prefix (lots of missing values)
* Street Suffix (lots of missing values)
* Police District Number (meaningless)
* Location (pair of Latitude and Longitude)

In [None]:
# Remove the identifier, redundant and sparsely populated columns in one call.
unnecessary_columns = [
    'Incident ID', 'Offence Code', 'CR Number', 'NIBRS Code', 'State',
    'Sector', 'Beat', 'PRA', 'Address Number', 'Street Prefix',
    'Street Suffix', 'Police District Number', 'Location',
]
inputdf = inputdf.drop(columns=unnecessary_columns)
inputdf.head(2)

# Next, columns related to the effects of the crime are removed.

In [None]:
# Remove the columns that describe what happened after the crime, in one call.
unnecessary_columns = [
    'Dispatch Date / Time', 'Victims', 'Block Address',
    'Street Type', 'Agency', 'End_Date_Time',
]
inputdf = inputdf.drop(columns=unnecessary_columns)
inputdf.head(2)

It is not yet clear whether some columns should be included, for example:

* Crime Name3
* We also have to take special care of Latitude and Longitude if we want to include them in our feature set.

# Drop rows from the dataframe based on a condition applied to a column


In [None]:
# Keep only the rows whose Latitude is strictly greater than 35.0 and whose
# Longitude is strictly less than -70. Rows with a missing coordinate fail the
# comparison and are therefore removed as well.
latitude_in_range = inputdf['Latitude'] > 35.0
longitude_in_range = inputdf['Longitude'] < -70
inputdf = inputdf[latitude_in_range & longitude_in_range]

# Creating The Location Based Crime Data Frame

In [None]:
# Columns kept for the location-based crime frame.
location_columns = ['Crime Name1', 'Crime Name2', 'Police District Name', 'City',
                    'Zip Code', 'Place', 'Street Name', 'Latitude', 'Longitude']

# Parse the start timestamp once; its hour, day-of-month and month become features.
dates = pd.to_datetime(inputdf['Start_Date_Time'])

# Build the frame with .assign, which returns a new DataFrame. Assigning columns
# directly onto `inputdf[[...]]` is a chained assignment on a slice of `inputdf`
# (pandas' SettingWithCopyWarning case), which the global warning filter hides.
loc_crime_df = inputdf[location_columns].assign(
    dateHour=dates.dt.hour,
    day=dates.dt.day,
    month=dates.dt.month,
)
loc_crime_df.head()


In [None]:
# Keep only the rows whose 'Crime Name2' value occurs at least `threshold` times.
threshold = 100
counts = loc_crime_df['Crime Name2'].value_counts()
frequent_classes = counts[counts >= threshold].index
is_frequent = loc_crime_df['Crime Name2'].isin(frequent_classes)
loc_crime_df = loc_crime_df[is_frequent]

In [None]:
# Prune the "All Other Offenses" rows, which are treated as garbage data here.
is_catch_all = loc_crime_df['Crime Name2'] == "All Other Offenses"
loc_crime_df = loc_crime_df[~is_catch_all]
# Recompute the per-class counts after pruning.
counts = loc_crime_df['Crime Name2'].value_counts()


# Taking only Top n classes in a column


In [None]:
# Change target column name here
target_column = 'Crime Name2'
top_n = 10

# Count the classes once and keep the top_n most frequent (most frequent first).
top_classes_series = loc_crime_df[target_column].value_counts().nlargest(top_n)
top_classes_dict = top_classes_series.to_dict()
print(top_classes_series)
print('Dictionary is + '+str(top_classes_dict))

# Names of the retained classes, in descending order of frequency.
str_array = list(top_classes_dict)

# Gather the rows of each retained class (grouped class by class, in the same
# order as before) and concatenate them once. DataFrame.append was removed in
# pandas 2.0, and appending inside the loop re-copied the growing frame on
# every iteration.
rows_per_class = [
    loc_crime_df[loc_crime_df[target_column] == str(class_name)]
    for class_name in str_array
]
# pd.concat raises on an empty list, so fall back to an empty frame that keeps
# the original columns when no class is available.
temp_df = pd.concat(rows_per_class) if rows_per_class else loc_crime_df.iloc[0:0]

In [None]:
# Continue with the subset that only contains the top_n most frequent classes.
loc_crime_df = temp_df
# Sanity check: class distribution of the target column after the reduction.
print(loc_crime_df[target_column].value_counts())

# Dropping null values in Zip code

In [None]:
# Discard the rows that have no zip code, then report the remaining size.
loc_crime_df = loc_crime_df.dropna(subset=['Zip Code'])
print(loc_crime_df.shape)

In [None]:
# Preview the first rows after removing the records without a zip code.
loc_crime_df.head()

# Transform the data by label encoding

In [None]:
# Encode the class names as integers; LabelEncoder assigns the codes in sorted
# class order (available afterwards as encoder.classes_).
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(loc_crime_df[target_column])
# Convert the integer codes to one-hot rows (column i corresponds to code i).
# This uses the public keras.utils.to_categorical instead of the private
# keras.utils.np_utils module, which recent Keras releases no longer expose.
dummy_y = keras.utils.to_categorical(encoded_Y)
print(dummy_y)


In [None]:
# Shape of the one-hot target matrix: (number of samples, number of encoded classes).
print(dummy_y.shape)

In [None]:
# Module-level holders used by label_encoder below.
encoded_dict = {}
temp_df_2 = pd.DataFrame()


def label_encoder(y):
    """Replace column `y` of the global `loc_crime_df` with integer label codes.

    A fresh LabelEncoder is fitted on the column and the column is overwritten
    in place with the resulting codes. When `y` is the target column, the column
    is printed before and after encoding, and the decoded (original) labels are
    stored in the global `temp_df_2` under the same column name.

    Parameters
    ----------
    y : str
        Name of the column in `loc_crime_df` to encode.
    """
    is_target = str(y) == target_column
    le = LabelEncoder()

    if is_target:
        print(loc_crime_df[y])

    loc_crime_df[y] = le.fit_transform(loc_crime_df[y])

    if is_target:
        print(loc_crime_df[y])
        # Keep the human-readable labels that correspond to the encoded target.
        temp_df_2[y] = le.inverse_transform(loc_crime_df[y])

# Text columns that are converted to integer codes. The target column is part
# of the list, so label_encoder prints it before and after encoding.
label_list = [
    'Crime Name1',
    'Crime Name2',
    'Police District Name',
    'City',
    'Place',
    'Street Name',
]

for column_name in label_list:
    label_encoder(column_name)

# Report the size of the transformed frame
print(loc_crime_df.shape)

# Separating feature set and Target Class

In [None]:
# Specific prediction without geolocation: the feature matrix is every column
# except the target and the two coordinate columns.
# Alternatives tried earlier: keep the coordinates as features (drop only the
# target), or predict the coordinates together with the class.
excluded_from_features = [target_column, 'Latitude', 'Longitude']
X = loc_crime_df.drop(columns=excluded_from_features)

print(X)
X.shape

In [None]:
# Coordinates with a fresh 0..n-1 index, so that they do not conflict with the
# index of the second dataframe when the two are concatenated.
df1 = loc_crime_df[['Latitude', 'Longitude']].reset_index(drop=True)
print(df1.shape)
df1.head()

In [None]:
# Name the one-hot columns after the encoder's own class order. to_categorical
# puts class code i in column i, and LabelEncoder numbers the classes in sorted
# order (encoder.classes_). str_array is ordered by frequency instead, so using
# it as column names attached the wrong crime name to most columns (and failed
# outright whenever a class had disappeared before encoding).
df2 = pd.DataFrame(dummy_y, columns=encoder.classes_)
print(df2.shape)

df2.head()

In [None]:
# Place the coordinates and the one-hot class columns side by side; both frames
# are aligned on their index.
y = pd.concat((df1, df2), axis="columns")
print(y)

In [None]:
# y = pd.concat([loc_crime_df[['Latitude', 'Longitude']], pd.DataFrame(dummy_y, columns = ['Y1', 'Y2'])], axis = 1, ignore_index= True)

# Split the data into training and testing set


In [None]:
# The data is split into 80% training and 20% test samples. Shuffling uses a
# fixed seed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

for split_name, split_data in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(split_name + " shape:", split_data.shape)

In [None]:
# Standardize the features: the scaler is fitted on the training data only and
# the same transformation is then applied to both splits.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# Predict all top_n classes: keep only the last top_n columns of y (the one-hot
# class columns), leaving out the coordinate columns that precede them.
# Alternatives tried earlier: min-max scaling the targets for a prediction with
# geolocation, or predicting only the last class column.
class_columns = slice(-top_n, None)
y_train = y_train.iloc[:, class_columns]
y_test = y_test.iloc[:, class_columns]

In [None]:
# Display the training targets (the last top_n columns of y).
y_train

In [None]:
# Histogram of the values found in the training target columns.
plt.hist(y_train)

# Build and Train the model


In [None]:

# Md Hasan Shahriar
# Tue, Dec 6, 6:51 PM


# Take the input width from the scaled training matrix instead of hard-coding
# it, so the model keeps working when the feature set changes.
feature_number = X_train.shape[1]
# with geolocation
# output_dim = top_n+2
output_dim = top_n

model = keras.Sequential(
    [
        # `shape` must be a tuple; a bare integer is rejected by Keras 3.
        keras.Input(shape=(feature_number,)),
        layers.Dense(100, activation="relu"),
        layers.Dense(250, activation="relu"),
        layers.Dense(500, activation="relu"),
        layers.Dense(100, activation="relu"),
        layers.Dense(50, activation="relu"),
        # One probability per class; use "sigmoid" instead for binary classification.
        layers.Dense(output_dim, activation="softmax"),
    ]
)

model.summary()


batch_size = 128
epochs = 20

# Categorical cross-entropy matches the one-hot targets and the softmax output.
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

# 10% of the training data is held back by Keras for validation.
history = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.1)

In [None]:
# Training vs. validation loss per epoch. The curves are labelled and the axes
# named so the figure can be read without looking at the code.
plt.plot(history.history['val_loss'], color="red", label="validation loss")
plt.plot(history.history['loss'], color="blue", label="training loss")
plt.xlabel("epoch")
plt.ylabel("loss")
plt.legend()
plt.show()

In [None]:
# Round the predicted class probabilities of the first ten test samples to 0/1
# for a quick visual check.
np.round(model.predict(X_test[0:10]))

In [None]:
# Target rows of the same ten test samples, for comparison with the predictions.
np.round(y_test[0:10])

# Evaluate the trained model


In [None]:
# Evaluate on the held-out test set; the first entry of the result is the loss
# and the second is the accuracy metric the model was compiled with.
score = model.evaluate(X_test, y_test, verbose=0)
test_loss, test_accuracy = score[0], score[1]
print("Test loss:", test_loss)
print("Test accuracy:", test_accuracy)

In [None]:
# # Model / data parameters
# num_classes = 10
# input_shape = (28, 28, 1)



# # Load the data and split it between train and test sets
# (x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()

# # Scale images to the [0, 1] range
# x_train = x_train.astype("float32") / 255
# x_test = x_test.astype("float32") / 255
# # Make sure images have shape (28, 28, 1)
# x_train = np.expand_dims(x_train, -1)
# x_test = np.expand_dims(x_test, -1)
# print("x_train shape:", x_train.shape)
# print(x_train.shape[0], "train samples")
# print(x_test.shape[0], "test samples")


# # convert class vectors to binary class matrices
# y_train = keras.utils.to_categorical(y_train, num_classes)
# y_test = keras.utils.to_categorical(y_test, num_classes)