In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import datetime
import warnings


from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression

from sklearn.metrics import precision_recall_fscore_support, plot_confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score


import geopandas as gpd
from shapely.geometry import Point, Polygon

# At first we mount the dataset from local file

inputdf = pd.read_csv('Crime.csv')

In [None]:


# sneak peak in the data
inputdf.head(2)


In [None]:
print(inputdf.shape)
# 'column names are:' 
inputdf.columns
inputdf.dtypes

Analyzing the dataset, we observe that columns -


* Incident ID
*	Offence Code
* CR Number
* NIBRS Code
* State (All should be MD)
* Sector, Beat, PRA - meaningless
* Address Number (included in "Block Address")
* Street Prefix (lots of missing values)
* Street Suffix (lots of missing values)
* Police District Number (meaningless)
* Location (pair of Latitude and Longitude)

 are unnecessary. So they are removed.

In [None]:
# dropping the unnecessary columns
#inputdf = data
unnecessary_columns = ['Incident ID', 'Offence Code', 'CR Number', 'NIBRS Code', 'State', 'Sector', 'Beat', 'PRA', 'Address Number', 'Street Prefix', 'Street Suffix', 'Police District Number', 'Location'  ]
for i in unnecessary_columns:
  inputdf = inputdf.drop(i, axis=1)
inputdf.head(2)

# Later, Columns related with the effect of crime are removed.

In [None]:
unnecessary_columns = ['Dispatch Date / Time', 'Victims', 'Block Address', 'Street Type', 'Agency', 'End_Date_Time']
for i in unnecessary_columns:
  inputdf = inputdf.drop(i, axis=1)
inputdf.head(2)

Some columns are confusing to be added now. Such as -

* Crime Name3
* Zip Code
* Street Name
* Also we have to take special care of Latitude and Longitude if we want them to include in our feature set.

# Now we have to visualize the data and fix what to predict from these modified feature set

In [None]:
# Montgomery County maintained roads
# https://data.imap.maryland.gov/datasets/maryland::montgomery-county-maintained-roads/explore?location=39.168559%2C-76.905435%2C9.35
# https://catalog.data.gov/dataset/tiger-line-shapefile-2018-county-montgomery-county-md-all-roads-county-based-shapefile
# crs = {'init':'EPSG:4326'}
montgomery_county_map = gpd.read_file('Zipcodes.shp')
print(montgomery_county_map.crs) 
montgomery_county_map.to_crs(epsg=4326).plot()

# Other sources for multiple data view
# https://montgomeryplanning.org/tools/gis-and-mapping/map-library/

# Drop rows from the dataframe based on certain condition applied on a column


In [None]:
# Filter all rows for Latitude is greater than or equal to 35
inputdf = inputdf[inputdf['Latitude'] > 35.0]
# inputdf['Latitude'].hist()

In [None]:
# Filter all rows for Longitude is less than or equal to -70
inputdf = inputdf[inputdf['Longitude'] < -70]
# inputdf['Longitude'].hist()

In [None]:
# inputdf['Crime Name2'].value_counts()
# # Filter all rows for that has occurances less than 5
# inputdf = inputdf[inputdf.columns[inputdf['Crime Name2'].value_counts() > 5]]

In [None]:
inputdf['Crime Name2'].value_counts()

# Export Pandas DataFrame to CSV


In [None]:

# inputdf.to_csv('potential feature set revised.csv')


# ‘geo_df’ that is a copy of our original data frame but with the newly created ‘geometry’ column.


In [None]:
crs = {'init':'EPSG:4326'}
geometry = [Point(xy) for xy in zip(inputdf['Longitude'], inputdf['Latitude'])]
geo_df = gpd.GeoDataFrame(inputdf, 
                          crs = crs, 
                          geometry = geometry)

geo_df.head()

In [None]:
geo_df.plot()

# Visualizing the Crime Data

In [None]:
fig, ax = plt.subplots(figsize = (20,20))
montgomery_county_map.to_crs(epsg=4326).plot(ax=ax, color='lightgrey')
geo_df.plot(column = geo_df['Crime Name1'], ax=ax, cmap = 'rainbow',
            legend = True, #legend_kwds={'shrink': 0.3}, 
            markersize = 10)
ax.set_title('Montgomery County Crime type data Heatmap')
# plt.savefig('Crime Heat Map for Location')

In [None]:
#geo_df['Crime Name1'].hist()
geo_df['Crime Name2'].count()
geo_df['Crime Name2'].value_counts()
# geo_df.groupby('Crime Name2').count() # per group data count


#geo_df['Latitude'].hist()

# Separating Train and Test Set for Crime Prediction for Location

In [None]:
print(inputdf['Crime Name1'].value_counts())

# Creating The Location Based Crime Data Frame

In [379]:
# loc_crime_df = inputdf[['Latitude', 'Longitude', 'Crime Name1']]
loc_crime_df = inputdf[['Crime Name1','Crime Name2', 'Police District Name', 'City', 'Zip Code', 'Place', 'Street Name', 'Latitude', 'Longitude']]
dates = pd.to_datetime(inputdf['Start_Date_Time'])
  
# extract Hours from Timestamp 
# rs = dates.dt.hour
# print(rs)
loc_crime_df['dateHour'] = dates.dt.hour
loc_crime_df['day'] = dates.dt.day
loc_crime_df['month'] = dates.dt.month
loc_crime_df.head()


Unnamed: 0,Crime Name1,Crime Name2,Police District Name,City,Zip Code,Place,Street Name,Latitude,Longitude,dateHour,day,month
0,Crime Against Property,Robbery,WHEATON,SILVER SPRING,20902.0,Street - Bus Stop,GEORGIA,39.03627,-77.0499,21,21,12
1,Crime Against Society,All Other Offenses,GERMANTOWN,DAMASCUS,20872.0,Parking Lot - Residential,COLTRANE,39.27784,-77.2115,17,8,8
2,Crime Against Society,Driving Under the Influence,ROCKVILLE,ROCKVILLE,20850.0,Street - In vehicle,GRANDIN,39.086688,-77.144754,2,3,1
3,Other,All Other Offenses,ROCKVILLE,ROCKVILLE,20850.0,Street - Other,GRANDIN,39.086688,-77.144754,2,3,1
4,Crime Against Property,Shoplifting,GERMANTOWN,GERMANTOWN,20876.0,Retail - Department/Discount Store,FREDERICK,39.198295,-77.2449,17,16,7


In [380]:
print(loc_crime_df.shape)
loc_crime_df.dtypes

(304175, 12)


Crime Name1              object
Crime Name2              object
Police District Name     object
City                     object
Zip Code                float64
Place                    object
Street Name              object
Latitude                float64
Longitude               float64
dateHour                  int64
day                       int64
month                     int64
dtype: object

In [381]:
 # Removing Rows on Count condition
counts = loc_crime_df['Crime Name2'].value_counts()
print(counts)
threshold = 100
loc_crime_df = loc_crime_df.loc[loc_crime_df['Crime Name2'].isin(counts.index[counts >= threshold ])]


All Other Offenses                             72203
Theft From Motor Vehicle                       26252
Drug/Narcotic Violations                       22573
Simple Assault                                 21064
Destruction/Damage/Vandalism of Property       18967
Shoplifting                                    16416
All other Larceny                              13972
Driving Under the Influence                    13723
Theft from Building                            11335
Burglary/Breaking and Entering                  8784
Identity Theft                                  8633
Theft of Motor Vehicle Parts or Accessories     6747
Liquor Law Violations                           6565
Motor Vehicle Theft                             6489
False Pretenses/Swindle/Confidence Game         6438
Disorderly Conduct                              4980
Credit Card/Automatic Teller Machine Fraud      4726
Aggravated Assault                              4313
Trespass of Real Property                     

In [382]:
 # Removing Rows on Count condition 
 # Pruning garbage data
# loc_crime_df = loc_crime_df[~loc_crime_df['Crime Name2'] == "All Other Offenses"]
indexAge = loc_crime_df[ loc_crime_df['Crime Name2'] == "All Other Offenses" ].index
loc_crime_df.drop(indexAge , inplace=True)
loc_crime_df.head()
counts = loc_crime_df['Crime Name2'].value_counts()
print(counts)


Theft From Motor Vehicle                       26252
Drug/Narcotic Violations                       22573
Simple Assault                                 21064
Destruction/Damage/Vandalism of Property       18967
Shoplifting                                    16416
All other Larceny                              13972
Driving Under the Influence                    13723
Theft from Building                            11335
Burglary/Breaking and Entering                  8784
Identity Theft                                  8633
Theft of Motor Vehicle Parts or Accessories     6747
Liquor Law Violations                           6565
Motor Vehicle Theft                             6489
False Pretenses/Swindle/Confidence Game         6438
Disorderly Conduct                              4980
Credit Card/Automatic Teller Machine Fraud      4726
Aggravated Assault                              4313
Trespass of Real Property                       3939
Runaway                                       

# Dropping null values in Zip code

In [383]:
loc_crime_df['Zip Code'].value_counts()
# loc_crime_df['dateHour'].value_counts()

20910.0    21071
20902.0    17815
20906.0    14649
20904.0    14523
20874.0    14136
           ...  
20859.0        1
20914.0        1
2853.0         1
28017.0        1
20990.0        1
Name: Zip Code, Length: 148, dtype: int64

In [384]:
loc_crime_df = loc_crime_df.dropna(axis=0, subset=['Zip Code'])
print(loc_crime_df.shape)

(229017, 12)


In [385]:
# loc_crime_df['Crime Name2'].count()
# loc_crime_df['Crime Name2'].value_counts()

# Transform the data by label encoding

In [386]:
encoded_dict ={}
temp_df = pd.DataFrame()

def label_encoder(y):
    le = LabelEncoder()
    #print(loc_crime_df[y])
    loc_crime_df[y] = le.fit_transform(loc_crime_df[y])
    #print(loc_crime_df[y])
    if y== 'Crime Name1':
        temp_df[y] = le.inverse_transform(loc_crime_df[y])
        encoded_dict[loc_crime_df[y]] = temp_df[y]
    #print(temp_df[y])

#loc_crime_df = inputdf[['Crime Name1','Crime Name2', 'Police District Name', 'City', 'Place', 'Street Name']]

#label_list = ['Crime Name1','Crime Name2', 'Police District Name', 'City', 'Place', 'Street Name']
label_list = ['Crime Name1']

for l in label_list:
    label_encoder(l)

print(encoded_dict)
 
#Display transformed data
loc_crime_df.head()

0         Crime Against Property
2          Crime Against Society
4         Crime Against Property
5           Crime Against Person
6          Crime Against Society
                   ...          
312289    Crime Against Property
312290    Crime Against Property
312292    Crime Against Property
312295    Crime Against Property
312296    Crime Against Property
Name: Crime Name1, Length: 229017, dtype: object
0         1
2         2
4         1
5         0
6         2
         ..
312289    1
312290    1
312292    1
312295    1
312296    1
Name: Crime Name1, Length: 229017, dtype: int32
0         Crime Against Property
1          Crime Against Society
2         Crime Against Property
3           Crime Against Person
4          Crime Against Society
                   ...          
229012    Crime Against Property
229013    Crime Against Property
229014    Crime Against Property
229015    Crime Against Property
229016    Crime Against Property
Name: Crime Name1, Length: 229017, dtype: obj

Unnamed: 0,Crime Name1,Crime Name2,Police District Name,City,Zip Code,Place,Street Name,Latitude,Longitude,dateHour,day,month
0,1,Robbery,WHEATON,SILVER SPRING,20902.0,Street - Bus Stop,GEORGIA,39.03627,-77.0499,21,21,12
2,2,Driving Under the Influence,ROCKVILLE,ROCKVILLE,20850.0,Street - In vehicle,GRANDIN,39.086688,-77.144754,2,3,1
4,1,Shoplifting,GERMANTOWN,GERMANTOWN,20876.0,Retail - Department/Discount Store,FREDERICK,39.198295,-77.2449,17,16,7
5,0,Forcible Rape,MONTGOMERY VILLAGE,GAITHERSBURG,20879.0,Residence - Apartment/Condo,SNOUFFER SCHOOL,39.168194,-77.175049,14,21,9
6,2,Prostitution,ROCKVILLE,ROCKVILLE,20850.0,Retail - Salon/Spa,FREDERICK,39.103443,-77.155941,12,28,2


In [393]:
print(loc_crime_df['Crime Name1'].value_counts())
print(temp_df['Crime Name1'].value_counts())

1    138559
2     56619
0     29791
3      3564
4       484
Name: Crime Name1, dtype: int64
Crime Against Property    138559
Crime Against Society      56619
Crime Against Person       29791
Not a Crime                 3564
Other                        484
Name: Crime Name1, dtype: int64


In [388]:
# temp_df = pd.DataFrame()
# def label_decoder(y):
#     #le = LabelEncoder()
#     for value in loc_crime_df[y]:
#         # finding key with value in dict
#         temp_df[y] = list(encoded_dict.keys())[list(encoded_dict.values()).index(value)]

# #loc_crime_df = inputdf[['Crime Name1','Crime Name2', 'Police District Name', 'City', 'Place', 'Street Name']]

# #label_list = ['Crime Name1','Crime Name2', 'Police District Name', 'City', 'Place', 'Street Name']
# label_list = ['Crime Name1']

# for l in label_list:
#     label_decoder(l)


# print(temp_df['Crime Name1'])

In [389]:
#Divide the dataset into independent and dependent variables
# X = loc_crime_df.drop(['Crime Name2'],axis=1)
# y = loc_crime_df['Crime Name2']


# Going for generic prediction
X = loc_crime_df.drop(['Crime Name1','Crime Name2', 'Latitude', 'Longitude'],axis=1)
y = loc_crime_df['Crime Name1']

print(X)


       Police District Name                City  Zip Code  \
0                   WHEATON       SILVER SPRING   20902.0   
2                 ROCKVILLE           ROCKVILLE   20850.0   
4                GERMANTOWN          GERMANTOWN   20876.0   
5        MONTGOMERY VILLAGE        GAITHERSBURG   20879.0   
6                 ROCKVILLE           ROCKVILLE   20850.0   
...                     ...                 ...       ...   
312289              WHEATON       SILVER SPRING   20902.0   
312290   MONTGOMERY VILLAGE  MONTGOMERY VILLAGE   20886.0   
312292             BETHESDA            BETHESDA   20814.0   
312295   MONTGOMERY VILLAGE        GAITHERSBURG   20877.0   
312296        SILVER SPRING       SILVER SPRING   20910.0   

                                     Place      Street Name  dateHour  day  \
0                        Street - Bus Stop          GEORGIA        21   21   
2                      Street - In vehicle          GRANDIN         2    3   
4       Retail - Department/Disco

In [390]:
print(y.value_counts())

1    138559
2     56619
0     29791
3      3564
4       484
Name: Crime Name1, dtype: int64


In [391]:
#Split the data into training and testing set
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,
                                               random_state=42, shuffle = True) 

#Data was splitted as 80% train data and 20% test data.

# y_train = y_train.values.reshape(-1,1)
# y_test = y_test.values.reshape(-1,1)

print("X_train shape:",X_train.shape)
print("X_test shape:",X_test.shape)
print("y_train shape:",y_train.shape)
print("y_test shape:",y_test.shape)

X_train shape: (183213, 8)
X_test shape: (45804, 8)
y_train shape: (183213,)
y_test shape: (45804,)


In [392]:
#Feature Scaling (Standardize the data)
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

ValueError: could not convert string to float: 'BETHESDA'

# We will build all six models and compare their accuracy scores.



In [None]:
#To store results of models, we create two dictionaries
result_dict_train = {}
result_dict_test = {}

# Machine Leaning Models and their performance


In [None]:
models = {'Naïve Bayes Classifier': GaussianNB, 'Decision Tree Classifier' : DecisionTreeClassifier, 'KNN Classifier': KNeighborsClassifier,
        'Random Forest Classifier': RandomForestClassifier,'Logistic Regression': LogisticRegression}
        # , 'Support Vector Classifier': SVC}
# for keys, values in sorted(models.items()):
#     print(keys +" is "+ values)

warnings.filterwarnings("ignore")

# for name, model in models.items():
#     print(name +" is "+ model)
    

In [None]:
for model_name, model_function in models.items():
    print(model_name +' is running')
    try:
        model = model_function(random_state = 42)
    except:
        if(model_name=='Logistic Regression'):
            model = model_function(solver='lbfgs', max_iter=1000)
        model = model_function()
    accuracies = cross_val_score(model, X_train, y_train, cv=5)
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)

    #Obtain accuracy
    print("Train Score:",np.mean(accuracies))
    print("Test Score:",model.score(X_test,y_test))

    #Confusion matrix
    plot_confusion_matrix(model, X_test, y_test)  
    plt.show()


    #Store results in the dictionaries
    result_dict_train['Train Score for '+model_name] = np.mean(accuracies)
    result_dict_test['Test Score for '+model_name] = model.score(X_test,y_test)
    


In [None]:
# lengthy_models = {'Support Vector Classifier': SVC,
#         'Random Forest Classifier': RandomForestClassifier}
# for model_name, model_function in lengthy_models.items():
#     print(model_name +' is running')
#     try:
#         model = model_function(random_state = 42)
#     except:
#         model = model_function()
#     accuracies = cross_val_score(model, X_train, y_train, cv=5)
#     model.fit(X_train,y_train)
#     y_pred = model.predict(X_test)

#     #Obtain accuracy
#     print("Train Score:",np.mean(accuracies))
#     print("Test Score:",model.score(X_test,y_test))


#     # #Store results in the dictionaries
#     # result_dict_train['Train Score for '+model_name] = np.mean(accuracies)
#     # result_dict_test['Test Score for '+model_name] = model.score(X_test,y_test)

# Compare Accuracy Scores

In [None]:
df_result_train = pd.DataFrame.from_dict(result_dict_train,orient = "index", columns=["Score"])
print(df_result_train)

df_result_test = pd.DataFrame.from_dict(result_dict_test,orient = "index",columns=["Score"])
df_result_test


# Display the accuracy scores

In [None]:
import seaborn as sns

fig,ax = plt.subplots(1,2,figsize=(10,5))
sns.barplot(x = df_result_train.index,y = df_result_train.Score,ax = ax[0])
sns.barplot(x = df_result_test.index,y = df_result_test.Score,ax = ax[1])
ax[0].set_xticklabels(df_result_train.index,rotation = 75)
ax[1].set_xticklabels(df_result_test.index,rotation = 75)
#plt.show()
# plt.savefig('Predicting Specific Crime')