## Model Training

##### Importing Libraries

In [1]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

# Feature Transformation
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler, OrdinalEncoder,OneHotEncoder
from sklearn.compose import ColumnTransformer

# Modelling
from sklearn.metrics import accuracy_score, f1_score
from catboost import CatBoostClassifier
from sklearn.ensemble import (
    AdaBoostClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
import warnings

In [2]:
df = pd.read_csv('crime.csv', encoding='latin-1')

In [3]:
df = df.head(10000)
df.head()

Unnamed: 0,INCIDENT_NUMBER,OFFENSE_CODE,OFFENSE_CODE_GROUP,OFFENSE_DESCRIPTION,DISTRICT,REPORTING_AREA,SHOOTING,OCCURRED_ON_DATE,YEAR,MONTH,DAY_OF_WEEK,HOUR,UCR_PART,STREET,Lat,Long,Location
0,I182070945,619,Larceny,LARCENY ALL OTHERS,D14,808,,2018-09-02 13:00:00,2018,9,Sunday,13,Part One,LINCOLN ST,42.357791,-71.139371,"(42.35779134, -71.13937053)"
1,I182070943,1402,Vandalism,VANDALISM,C11,347,,2018-08-21 00:00:00,2018,8,Tuesday,0,Part Two,HECLA ST,42.306821,-71.0603,"(42.30682138, -71.06030035)"
2,I182070941,3410,Towed,TOWED MOTOR VEHICLE,D4,151,,2018-09-03 19:27:00,2018,9,Monday,19,Part Three,CAZENOVE ST,42.346589,-71.072429,"(42.34658879, -71.07242943)"
3,I182070940,3114,Investigate Property,INVESTIGATE PROPERTY,D4,272,,2018-09-03 21:16:00,2018,9,Monday,21,Part Three,NEWCOMB ST,42.334182,-71.078664,"(42.33418175, -71.07866441)"
4,I182070938,3114,Investigate Property,INVESTIGATE PROPERTY,B3,421,,2018-09-03 21:05:00,2018,9,Monday,21,Part Three,DELHI ST,42.275365,-71.090361,"(42.27536542, -71.09036101)"


##### Preparing X and Y variables

In [4]:
df = df.drop(columns=['INCIDENT_NUMBER'], axis=1)

In [5]:
df['OCCURRED_ON_DATE'] = pd.to_datetime(df['OCCURRED_ON_DATE'])
print(df['OCCURRED_ON_DATE'].dtype)

datetime64[ns]


In [6]:
# Extract date features
df['YEAR'] = df['OCCURRED_ON_DATE'].dt.year
df['MONTH'] = df['OCCURRED_ON_DATE'].dt.month
df['DAY'] = df['OCCURRED_ON_DATE'].dt.day
df['HOUR'] = df['OCCURRED_ON_DATE'].dt.hour

In [7]:
df.head()

Unnamed: 0,OFFENSE_CODE,OFFENSE_CODE_GROUP,OFFENSE_DESCRIPTION,DISTRICT,REPORTING_AREA,SHOOTING,OCCURRED_ON_DATE,YEAR,MONTH,DAY_OF_WEEK,HOUR,UCR_PART,STREET,Lat,Long,Location,DAY
0,619,Larceny,LARCENY ALL OTHERS,D14,808,,2018-09-02 13:00:00,2018,9,Sunday,13,Part One,LINCOLN ST,42.357791,-71.139371,"(42.35779134, -71.13937053)",2
1,1402,Vandalism,VANDALISM,C11,347,,2018-08-21 00:00:00,2018,8,Tuesday,0,Part Two,HECLA ST,42.306821,-71.0603,"(42.30682138, -71.06030035)",21
2,3410,Towed,TOWED MOTOR VEHICLE,D4,151,,2018-09-03 19:27:00,2018,9,Monday,19,Part Three,CAZENOVE ST,42.346589,-71.072429,"(42.34658879, -71.07242943)",3
3,3114,Investigate Property,INVESTIGATE PROPERTY,D4,272,,2018-09-03 21:16:00,2018,9,Monday,21,Part Three,NEWCOMB ST,42.334182,-71.078664,"(42.33418175, -71.07866441)",3
4,3114,Investigate Property,INVESTIGATE PROPERTY,B3,421,,2018-09-03 21:05:00,2018,9,Monday,21,Part Three,DELHI ST,42.275365,-71.090361,"(42.27536542, -71.09036101)",3


In [8]:
# Transform location data into latitude and longitude features
df['LATITUDE'] = df['Location'].str.extract(r'\(([-+]?[0-9]*\.?[0-9]+),\s*([-+]?[0-9]*\.?[0-9]+)\)')[0].astype(float)
df['LONGITUDE'] = df['Location'].str.extract(r'\(([-+]?[0-9]*\.?[0-9]+),\s*([-+]?[0-9]*\.?[0-9]+)\)')[1].astype(float)

In [9]:
df.drop(columns=['Location', 'Lat', 'Long'], axis=1, inplace=True)
df.head()

Unnamed: 0,OFFENSE_CODE,OFFENSE_CODE_GROUP,OFFENSE_DESCRIPTION,DISTRICT,REPORTING_AREA,SHOOTING,OCCURRED_ON_DATE,YEAR,MONTH,DAY_OF_WEEK,HOUR,UCR_PART,STREET,DAY,LATITUDE,LONGITUDE
0,619,Larceny,LARCENY ALL OTHERS,D14,808,,2018-09-02 13:00:00,2018,9,Sunday,13,Part One,LINCOLN ST,2,42.357791,-71.139371
1,1402,Vandalism,VANDALISM,C11,347,,2018-08-21 00:00:00,2018,8,Tuesday,0,Part Two,HECLA ST,21,42.306821,-71.0603
2,3410,Towed,TOWED MOTOR VEHICLE,D4,151,,2018-09-03 19:27:00,2018,9,Monday,19,Part Three,CAZENOVE ST,3,42.346589,-71.072429
3,3114,Investigate Property,INVESTIGATE PROPERTY,D4,272,,2018-09-03 21:16:00,2018,9,Monday,21,Part Three,NEWCOMB ST,3,42.334182,-71.078664
4,3114,Investigate Property,INVESTIGATE PROPERTY,B3,421,,2018-09-03 21:05:00,2018,9,Monday,21,Part Three,DELHI ST,3,42.275365,-71.090361


In [10]:
# Split the REPORTING_AREA feature into numeric and non-numeric components
df['REPORTING_AREA'] = df['REPORTING_AREA'].str.extract(r'(\d+)')[0].astype(float)
df['REPORTING_AREA_STR'] = df['REPORTING_AREA'].astype(str)
df.drop('REPORTING_AREA', axis=1, inplace=True)

In [11]:
df.head()

Unnamed: 0,OFFENSE_CODE,OFFENSE_CODE_GROUP,OFFENSE_DESCRIPTION,DISTRICT,SHOOTING,OCCURRED_ON_DATE,YEAR,MONTH,DAY_OF_WEEK,HOUR,UCR_PART,STREET,DAY,LATITUDE,LONGITUDE,REPORTING_AREA_STR
0,619,Larceny,LARCENY ALL OTHERS,D14,,2018-09-02 13:00:00,2018,9,Sunday,13,Part One,LINCOLN ST,2,42.357791,-71.139371,808.0
1,1402,Vandalism,VANDALISM,C11,,2018-08-21 00:00:00,2018,8,Tuesday,0,Part Two,HECLA ST,21,42.306821,-71.0603,347.0
2,3410,Towed,TOWED MOTOR VEHICLE,D4,,2018-09-03 19:27:00,2018,9,Monday,19,Part Three,CAZENOVE ST,3,42.346589,-71.072429,151.0
3,3114,Investigate Property,INVESTIGATE PROPERTY,D4,,2018-09-03 21:16:00,2018,9,Monday,21,Part Three,NEWCOMB ST,3,42.334182,-71.078664,272.0
4,3114,Investigate Property,INVESTIGATE PROPERTY,B3,,2018-09-03 21:05:00,2018,9,Monday,21,Part Three,DELHI ST,3,42.275365,-71.090361,421.0


In [12]:
df.drop(columns=['OFFENSE_DESCRIPTION', 'OCCURRED_ON_DATE'], axis=1,inplace=True)
df.head()

Unnamed: 0,OFFENSE_CODE,OFFENSE_CODE_GROUP,DISTRICT,SHOOTING,YEAR,MONTH,DAY_OF_WEEK,HOUR,UCR_PART,STREET,DAY,LATITUDE,LONGITUDE,REPORTING_AREA_STR
0,619,Larceny,D14,,2018,9,Sunday,13,Part One,LINCOLN ST,2,42.357791,-71.139371,808.0
1,1402,Vandalism,C11,,2018,8,Tuesday,0,Part Two,HECLA ST,21,42.306821,-71.0603,347.0
2,3410,Towed,D4,,2018,9,Monday,19,Part Three,CAZENOVE ST,3,42.346589,-71.072429,151.0
3,3114,Investigate Property,D4,,2018,9,Monday,21,Part Three,NEWCOMB ST,3,42.334182,-71.078664,272.0
4,3114,Investigate Property,B3,,2018,9,Monday,21,Part Three,DELHI ST,3,42.275365,-71.090361,421.0


In [13]:
df.isna().sum()

OFFENSE_CODE             0
OFFENSE_CODE_GROUP       0
DISTRICT               165
SHOOTING              9977
YEAR                     0
MONTH                    0
DAY_OF_WEEK              0
HOUR                     0
UCR_PART                 2
STREET                 227
DAY                      0
LATITUDE                 0
LONGITUDE                0
REPORTING_AREA_STR       0
dtype: int64

In [14]:
df['REPORTING_AREA_STR'].dtype

dtype('O')

In [15]:
len(df.columns)

14

In [16]:
df.head()

Unnamed: 0,OFFENSE_CODE,OFFENSE_CODE_GROUP,DISTRICT,SHOOTING,YEAR,MONTH,DAY_OF_WEEK,HOUR,UCR_PART,STREET,DAY,LATITUDE,LONGITUDE,REPORTING_AREA_STR
0,619,Larceny,D14,,2018,9,Sunday,13,Part One,LINCOLN ST,2,42.357791,-71.139371,808.0
1,1402,Vandalism,C11,,2018,8,Tuesday,0,Part Two,HECLA ST,21,42.306821,-71.0603,347.0
2,3410,Towed,D4,,2018,9,Monday,19,Part Three,CAZENOVE ST,3,42.346589,-71.072429,151.0
3,3114,Investigate Property,D4,,2018,9,Monday,21,Part Three,NEWCOMB ST,3,42.334182,-71.078664,272.0
4,3114,Investigate Property,B3,,2018,9,Monday,21,Part Three,DELHI ST,3,42.275365,-71.090361,421.0


In [17]:
# Define categorical and numeric features
categorical_columns = ['DAY_OF_WEEK', 'DISTRICT', 'UCR_PART', 'STREET','DAY_OF_WEEK','REPORTING_AREA_STR']
numerical_columns = ['LATITUDE', 'LONGITUDE', 'YEAR', 'MONTH', 'DAY', 'HOUR']

# Define preprocessing steps for categorical features
cat_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ordinal_encoder", OrdinalEncoder()),
        ("scaler", StandardScaler(with_mean=False))
    ]
)

# Define preprocessing steps for numeric features
num_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('standard_scaler', StandardScaler())
    ]
)

# Define column transformer to apply preprocessing steps to each feature type
preprocessor = ColumnTransformer(transformers=[
    ('num_pipeline', num_transformer, numerical_columns),
    ('cat_pipeline', cat_transformer, categorical_columns)
])

# Extract target variable
target_variable = df['OFFENSE_CODE_GROUP']
# Drop target variable from the DataFrame
df = df.drop(columns=['OFFENSE_CODE_GROUP'])

# Apply preprocessing steps to remaining features
X = preprocessor.fit_transform(df)

# Your preprocessed data is now in the variable X and your target variable is in target_variable


In [19]:
print(preprocessor)

ColumnTransformer(transformers=[('num_pipeline',
                                 Pipeline(steps=[('imputer', SimpleImputer()),
                                                 ('standard_scaler',
                                                  StandardScaler())]),
                                 ['LATITUDE', 'LONGITUDE', 'YEAR', 'MONTH',
                                  'DAY', 'HOUR']),
                                ('cat_pipeline',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('ordinal_encoder',
                                                  OrdinalEncoder()),
                                                 ('scaler',
                                                  StandardScaler(with_mean=False))]),
                                 ['DAY_OF_WEEK', 'DISTRICT', 'UCR_PART',
                                  'STREET', 'D

In [20]:
import os
import pickle
# Create the artifacts subdirectory if it does not exist
if not os.path.exists('../artifacts'):
    os.makedirs('../artifacts')

# Save the trained model as a pickle file in the artifacts subdirectory
file_path = os.path.join('../artifacts/', 'proprocessor.pkl')
with open(file_path, 'wb') as file:
    pickle.dump(preprocessor, file)
    

In [21]:

# Open the file containing the pickled data
with open('../artifacts/proprocessor.pkl', 'rb') as file:
    # Load the pickled data
    preprocessor = pickle.load(file)

# Print the loaded data
print(preprocessor)

ColumnTransformer(transformers=[('num_pipeline',
                                 Pipeline(steps=[('imputer', SimpleImputer()),
                                                 ('standard_scaler',
                                                  StandardScaler())]),
                                 ['LATITUDE', 'LONGITUDE', 'YEAR', 'MONTH',
                                  'DAY', 'HOUR']),
                                ('cat_pipeline',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('ordinal_encoder',
                                                  OrdinalEncoder()),
                                                 ('scaler',
                                                  StandardScaler(with_mean=False))]),
                                 ['DAY_OF_WEEK', 'DISTRICT', 'UCR_PART',
                                  'STREET', 'D

In [22]:
label_encoder = LabelEncoder()
target_variable = label_encoder.fit_transform(target_variable)
target_variable

array([27, 54, 53, ..., 27, 27,  0])

In [26]:
# separate dataset into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,target_variable,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

((8000, 12), (2000, 12))

In [27]:
import torch
import torch.nn as nn

class BiLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(BiLSTM, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers=3, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size*2, num_classes)
        self.dropout = nn.Dropout(0.2)
    
    def forward(self, x):
        out, _ = self.lstm(x)
        out = self.dropout(out)
        out = self.fc(out[:, -1, :])
        return out

# Initialize the model
model = BiLSTM(input_size=12, hidden_size=128, num_classes=64)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Convert the data to PyTorch tensors
X_train = torch.from_numpy(X_train).float()
y_train = torch.from_numpy(y_train).long()
X_test = torch.from_numpy(X_test).float()
y_test = torch.from_numpy(y_test).long()

# Train the model
for epoch in range(1000):
    optimizer.zero_grad()
    #print(X_train.shape)
    outputs = model(X_train.view(-1, 1, X_train.shape[1]))
    loss = criterion(outputs, y_train)
    loss.backward()
    optimizer.step()


In [28]:
from sklearn.metrics import f1_score, accuracy_score

# Evaluate the model on the test data
with torch.no_grad():
    model.eval()
    test_outputs = model(X_test.view(-1, 1, X_test.shape[1]))
    test_loss = criterion(test_outputs, y_test)
    _, test_preds = torch.max(test_outputs, 1)

# Compute the accuracy and f1-score
test_acc = accuracy_score(y_test, test_preds)
test_f1 = f1_score(y_test, test_preds, average='macro')

print('Test Accuracy: {:.2f}%'.format(test_acc * 100))
print('Test F1-Score: {:.4f}'.format(test_f1))


Test Accuracy: 25.65%
Test F1-Score: 0.1020


In [29]:
import os
# Create the artifacts subdirectory if it does not exist
if not os.path.exists('../artifacts'):
    os.makedirs('../artifacts')

# Save the trained model as a pickle file in the artifacts subdirectory
file_path = os.path.join('../artifacts', 'model.pkl')
with open(file_path, 'wb') as file:
    pickle.dump(model, file)
    

In [30]:
import pickle
# Open the file containing the pickled data
with open('../artifacts/model.pkl', 'rb') as file:
    # Load the pickled data
    model = pickle.load(file)

# Print the loaded data
print(model)

BiLSTM(
  (lstm): LSTM(12, 128, num_layers=3, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=256, out_features=64, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)


In [32]:
df.head()

Unnamed: 0,OFFENSE_CODE,DISTRICT,SHOOTING,YEAR,MONTH,DAY_OF_WEEK,HOUR,UCR_PART,STREET,DAY,LATITUDE,LONGITUDE,REPORTING_AREA_STR
0,619,D14,,2018,9,Sunday,13,Part One,LINCOLN ST,2,42.357791,-71.139371,808.0
1,1402,C11,,2018,8,Tuesday,0,Part Two,HECLA ST,21,42.306821,-71.0603,347.0
2,3410,D4,,2018,9,Monday,19,Part Three,CAZENOVE ST,3,42.346589,-71.072429,151.0
3,3114,D4,,2018,9,Monday,21,Part Three,NEWCOMB ST,3,42.334182,-71.078664,272.0
4,3114,B3,,2018,9,Monday,21,Part Three,DELHI ST,3,42.275365,-71.090361,421.0


In [67]:
import pandas as pd

data = {
    'OFFENSE_CODE': 1402,
    'DAY_OF_WEEK' : 'Sunday',
    'DISTRICT' : 'C11', 
    'UCR_PART' : 'Part One', 
    'STREET' : 'LINCOLN ST',
    'REPORTING_AREA_STR': '808.0',
    'LATITUDE' : 42.357791, 
    'LONGITUDE' : -71.060300, 
    'YEAR' : 2017, 
    'MONTH' : 7, 
    'DAY' : 4, 
    'HOUR' : 15 
}


# Convert the data dictionary to a Pandas DataFrame
data_df = pd.DataFrame([data])

# Print the resulting DataFrame
data_df

Unnamed: 0,OFFENSE_CODE,DAY_OF_WEEK,DISTRICT,UCR_PART,STREET,REPORTING_AREA_STR,LATITUDE,LONGITUDE,YEAR,MONTH,DAY,HOUR
0,1402,Sunday,C11,Part One,LINCOLN ST,808.0,42.357791,-71.0603,2017,7,4,15


In [68]:
# Append the df DataFrame to the existing dataset using pd.concat()
new_df = pd.concat([df, data_df], ignore_index=True)

# Write the new DataFrame to a CSV file
new_df.tail()

Unnamed: 0,OFFENSE_CODE,DISTRICT,SHOOTING,YEAR,MONTH,DAY_OF_WEEK,HOUR,UCR_PART,STREET,DAY,LATITUDE,LONGITUDE,REPORTING_AREA_STR
9996,613,A1,,2018,7,Sunday,18,Part One,FANEUIL HALL MARKETPLACE,29,42.35998,-71.054741,94.0
9997,619,A1,,2018,7,Sunday,19,Part One,FANEUIL HALL SQ,29,42.360205,-71.056208,93.0
9998,611,C11,,2018,7,Sunday,10,Part One,PARK ST,29,42.297866,-71.063325,360.0
9999,423,C6,,2018,7,Sunday,18,Part One,DORCHESTER AVE,29,42.331384,-71.056975,194.0
10000,1402,C11,,2017,7,Sunday,15,Part One,LINCOLN ST,4,42.357791,-71.0603,808.0


In [69]:
# Open the file containing the pickled data
with open('../artifacts/proprocessor.pkl', 'rb') as file:
    # Load the pickled data
    preprocessor = pickle.load(file)
new_arr = preprocessor.fit_transform(new_df)

In [71]:
new_arr

array([[  0.26942167,  -0.26918745,   0.04812944, ...,   1.76259019,
          1.48336224,   2.8332212 ],
       [  0.26458504,  -0.26470431,   0.04812944, ...,   1.45364581,
          2.4722704 ,   1.02991419],
       [  0.26835864,  -0.26539201,   0.04812944, ...,   0.53038428,
          0.49445408,   0.22206328],
       ...,
       [  0.26373525,  -0.26487582,   0.04812944, ...,   2.24832933,
          1.48336224,   1.08734435],
       [  0.26691587,  -0.26451579,   0.04812944, ...,   0.86968736,
          1.48336224,   0.39435376],
       [  0.26942164,  -0.26470429, -13.70451473, ...,   1.76259019,
          1.48336224,   2.8332212 ]])

In [50]:
last_row = new_arr[-1,:]
last_row_reshape = torch.Tensor(last_row).unsqueeze(0).unsqueeze(0)
with torch.no_grad():
    single_output = model(last_row_reshape)

# Print the output tensor
print(single_output)

tensor([[  3.8547,  -5.9225,  -1.2503,   7.7098,  -3.2436,  -9.6152,  -5.9390,
          -0.8899,  -3.3463,  -4.9685,  -3.8018,  -5.6151,  -9.4909,  -4.8398,
          -8.2295,  -4.7098,  -4.6182,  -0.9474,  -9.9689,  -4.5876,  -3.7327,
          -2.2778,  -7.0621,  -0.5442,   5.5222,   5.4248,  -0.5601,   5.5840,
           5.7775,   2.3004,   1.0684, -11.1502,   4.7657,   2.9800,   2.9866,
           5.3273,  -8.7284,  -9.9238,   1.7176,   0.7949,   1.8774,  -6.6867,
           4.9114,  10.4737,   4.4057,  -7.4115,  -8.2091,   5.5767,  -4.5788,
           4.4305,  -3.2109,  -6.0184,  -7.4692,   4.5329,  -7.8900,   1.5733,
          -9.7571,   2.1985,  -4.6501,  -4.4089,  -4.3160,  -4.7460,  -5.1413,
          -4.3496]])


In [54]:
single_output_arr = single_output.numpy()
single_output_arr.shape

(1, 64)

In [57]:
# Get the indices of the top 5 values from the array
top5_indices = np.argsort(single_output_arr)[0, -5:]

# Print the indices of the top 5 values
print(top5_indices)

[47 27 28  3 43]


In [58]:
target_variable

array([27, 54, 53, ..., 27, 27,  0])

In [60]:
new_df = pd.read_csv('crime.csv',encoding='latin-1')
new_df.head()

Unnamed: 0,INCIDENT_NUMBER,OFFENSE_CODE,OFFENSE_CODE_GROUP,OFFENSE_DESCRIPTION,DISTRICT,REPORTING_AREA,SHOOTING,OCCURRED_ON_DATE,YEAR,MONTH,DAY_OF_WEEK,HOUR,UCR_PART,STREET,Lat,Long,Location
0,I182070945,619,Larceny,LARCENY ALL OTHERS,D14,808,,2018-09-02 13:00:00,2018,9,Sunday,13,Part One,LINCOLN ST,42.357791,-71.139371,"(42.35779134, -71.13937053)"
1,I182070943,1402,Vandalism,VANDALISM,C11,347,,2018-08-21 00:00:00,2018,8,Tuesday,0,Part Two,HECLA ST,42.306821,-71.0603,"(42.30682138, -71.06030035)"
2,I182070941,3410,Towed,TOWED MOTOR VEHICLE,D4,151,,2018-09-03 19:27:00,2018,9,Monday,19,Part Three,CAZENOVE ST,42.346589,-71.072429,"(42.34658879, -71.07242943)"
3,I182070940,3114,Investigate Property,INVESTIGATE PROPERTY,D4,272,,2018-09-03 21:16:00,2018,9,Monday,21,Part Three,NEWCOMB ST,42.334182,-71.078664,"(42.33418175, -71.07866441)"
4,I182070938,3114,Investigate Property,INVESTIGATE PROPERTY,B3,421,,2018-09-03 21:05:00,2018,9,Monday,21,Part Three,DELHI ST,42.275365,-71.090361,"(42.27536542, -71.09036101)"


In [64]:
list(new_df['OFFENSE_CODE_GROUP'].unique())

['Larceny',
 'Vandalism',
 'Towed',
 'Investigate Property',
 'Motor Vehicle Accident Response',
 'Auto Theft',
 'Verbal Disputes',
 'Robbery',
 'Fire Related Reports',
 'Other',
 'Property Lost',
 'Medical Assistance',
 'Assembly or Gathering Violations',
 'Larceny From Motor Vehicle',
 'Residential Burglary',
 'Simple Assault',
 'Restraining Order Violations',
 'Violations',
 'Harassment',
 'Ballistics',
 'Property Found',
 'Police Service Incidents',
 'Drug Violation',
 'Warrant Arrests',
 'Disorderly Conduct',
 'Property Related Damage',
 'Missing Person Reported',
 'Investigate Person',
 'Fraud',
 'Aggravated Assault',
 'License Plate Related Incidents',
 'Firearm Violations',
 'Other Burglary',
 'Arson',
 'Bomb Hoax',
 'Harbor Related Incidents',
 'Counterfeiting',
 'Liquor Violation',
 'Firearm Discovery',
 'Landlord/Tenant Disputes',
 'Missing Person Located',
 'Auto Theft Recovery',
 'Service',
 'Operating Under the Influence',
 'Confidence Games',
 'Search Warrants',
 'Licens