## Library Dependencies

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Apply seaborn's default theme so any figures drawn later share one look.
sns.set()

In [2]:
# Directory that receives the preprocessed CSV exported by this notebook.
preprocessed_dir = '../Clean Dataset/Sub Crimes By Neighborhood/Preprocessed_data'

# exist_ok=True makes this a no-op when the directory already exists, which
# removes the separate existence check (and the gap between check and create).
os.makedirs(preprocessed_dir, exist_ok=True)

In [3]:
# The crime type selects which '<crime type>_By_Neighborhood.csv' extract is
# read and is embedded in the exported file name. Surrounding whitespace is
# stripped so a stray space typed at the prompt does not end up in those paths.
crime_type = input('Enter Crime Type: ').strip()

Enter Crime Type: Shooting


## Importing the Baltimore crime dataset for the selected crime type

In [4]:
# Resolve the extract that belongs to the requested crime type, then load it.
crime_csv_path = '../Clean Dataset/Sub Crimes By Neighborhood/%s_By_Neighborhood.csv' % crime_type
data = pd.read_csv(crime_csv_path)
data.head()

Unnamed: 0,CrimeDate,CrimeTime,CrimeCode,Location,Description,Inside/Outside,Weapon,Post,District,Neighborhood,Longitude,Latitude,Location 1,Premise,vri_name1,Total Incidents
0,06/06/2020,13:19:00,9S,1800 W NORTH AV,SHOOTING,Outside,FIREARM,734.0,WESTERN,SANDTOWN-WINCHESTER,-76.64658,39.309955,,STREET,Western,1
1,06/05/2020,17:29:36,9S,500 E COLD SPRING LA,SHOOTING,Outside,FIREARM,832.0,NORTHERN,VIOLETVILLE,-76.609528,39.346229,,STREET,,1
2,06/04/2020,03:02:00,9S,2200 AIKEN ST,SHOOTING,Outside,FIREARM,312.0,EASTERN,EAST BALTIMORE MIDWA,-76.599404,39.315165,,STREET,Eastern 1,1
3,06/04/2020,18:23:00,9S,3100 KENTUCKY AV,SHOOTING,Outside,FIREARM,432.0,NORTHEAST,BELAIR-EDISON,-76.569833,39.323288,,STREET,Northeastern,1
4,06/03/2020,21:14:27,9S,2100 MARYLAND AV,SHOOTING,Outside,FIREARM,232.0,NORTHERN,BAYVIEW,-76.618108,39.313225,,PARKING LOT,,1


In [5]:
# Summary statistics of the raw extract (numeric columns).
data.describe()

Unnamed: 0,Longitude,Latitude,Location 1,Total Incidents
count,3882.0,3882.0,0.0,3883.0
mean,-76.626196,39.304772,,1.0
std,0.039636,0.027746,,0.0
min,-76.709874,39.222026,,1.0
25%,-76.656083,39.290222,,1.0
50%,-76.632024,39.304318,,1.0
75%,-76.595747,39.320336,,1.0
max,-76.530183,39.372106,,1.0


In [6]:
# Columns kept for the rest of the notebook: the incident date and the
# police district the incident is attributed to.
columns_in_use = ['CrimeDate', 'District']
columns_in_use

['CrimeDate', 'District']

In [7]:
# Narrow the frame down to the selected columns (all rows are kept).
data = data.loc[:, columns_in_use]
data.head()

Unnamed: 0,CrimeDate,District
0,06/06/2020,WESTERN
1,06/05/2020,NORTHERN
2,06/04/2020,EASTERN
3,06/04/2020,NORTHEAST
4,06/03/2020,NORTHERN


In [8]:
# Summary of the two retained columns.
data.describe()

Unnamed: 0,CrimeDate,District
count,3883,3883
unique,1695,9
top,10/12/2019,WESTERN
freq,12,639


In [9]:
null_count = 0

# Go through the retained columns one by one, report how many rows are missing
# a value in that column, and remove those rows before checking the next column.
for i in columns_in_use:
    rows_with_null_values = data.index[data[i].isnull()]
    dropped_for_column = len(rows_with_null_values)
    print ('Number of Rows with null values in Column %s = %s'%(i, dropped_for_column))
    data = data.drop(index = rows_with_null_values)
    null_count = null_count + dropped_for_column

# Report the running total and the shape of what is left.
remaining_rows, remaining_columns = data.shape
print ('Number of rows droped = ', null_count)
print ('Number of rows after droping null values = ', remaining_rows)
print ('Data Frame Shape: \n\t Number of Rows = %s \n\t Number of Columns = %s '%(remaining_rows, remaining_columns))

Number of Rows with null values in Column CrimeDate = 0
Number of Rows with null values in Column District = 0
Number of rows droped =  0
Number of rows after droping null values =  3883
Data Frame Shape: 
	 Number of Rows = 3883 
	 Number of Columns = 2 


## Dropping Rows with Unknown Police Districts

In [10]:
# Rows whose district is the literal placeholder 'UNKNOWN' are removed too.
rows_with_unknowon_district = data[data['District'] == 'UNKNOWN'].index
# The column is named explicitly rather than through a loop variable left over
# from another cell, and the message states what is being counted.
print ('Number of Rows with UNKNOWN values in Column %s = %s'%('District', len(rows_with_unknowon_district)))
data = data.drop(rows_with_unknowon_district, axis = 0)

# null_count is a running total across the row-dropping cells.
null_count += len(rows_with_unknowon_district)
print ('Number of rows droped = ', null_count)
print ('Number of rows after droping null values = ', data.shape[0])
print ('Data Frame Shape: \n\t Number of Rows = %s \n\t Number of Columns = %s '%(data.shape[0], data.shape[1]))

Number of Rows with null values in Column District = 0
Number of rows droped =  0
Number of rows after droping null values =  3883
Data Frame Shape: 
	 Number of Rows = 3883 
	 Number of Columns = 2 


In [11]:
# Turn the date strings into pandas datetimes so the .dt accessors are available.
data = data.assign(CrimeDate = pd.to_datetime(data['CrimeDate']))
data.head()

Unnamed: 0,CrimeDate,District
0,2020-06-06,WESTERN
1,2020-06-05,NORTHERN
2,2020-06-04,EASTERN
3,2020-06-04,NORTHEAST
4,2020-06-03,NORTHERN


## Creating Different Time features

In [12]:
# ISO-8601 calendar fields of every incident date. Series.dt.week no longer
# exists in pandas 2.x; dt.isocalendar() is its replacement.
iso_calendar = data['CrimeDate'].dt.isocalendar()

day_of_week = data['CrimeDate'].dt.dayofweek
# isocalendar() yields nullable UInt32 columns; cast to plain integers.
week = iso_calendar['week'].astype('int64')
month = data['CrimeDate'].dt.month
# The ISO year is used so that (Year, Week) always names one real week.
# Pairing the ISO week with the calendar year mislabels days around New Year:
# 2014-12-29 would become "2014, week 1" and 2016-01-01 "2016, week 53".
# NOTE: for those few boundary days Year differs from the calendar year by one.
year = iso_calendar['year'].astype('int64')

data['Day'] = day_of_week
data['Week'] = week
data['Month'] = month
data['Year'] = year

data.head()

Unnamed: 0,CrimeDate,District,Day,Week,Month,Year
0,2020-06-06,WESTERN,5,23,6,2020
1,2020-06-05,NORTHERN,4,23,6,2020
2,2020-06-04,EASTERN,3,23,6,2020
3,2020-06-04,NORTHEAST,3,23,6,2020
4,2020-06-03,NORTHERN,2,23,6,2020


## Consider data from 2014 and beyond

In [13]:
# Boolean mask of the incidents dated 2014 or later; everything else is dropped.
from_2014_onwards = data['Year'] >= 2014
data = data[from_2014_onwards]

In [14]:
# Summary of the derived time features after the year filter.
data.describe()

Unnamed: 0,Day,Week,Month,Year
count,3883.0,3883.0,3883.0,3883.0
mean,2.978882,27.143446,6.652331,2016.990214
std,2.037785,14.025599,3.224338,1.750957
min,0.0,1.0,1.0,2014.0
25%,1.0,16.0,4.0,2016.0
50%,3.0,27.0,7.0,2017.0
75%,5.0,38.0,9.0,2019.0
max,6.0,53.0,12.0,2020.0


## Creating Time Series Dataset

In [15]:
def ts_data_creation(group_by_parameter, data):
    """Count incidents per police district, year and week (or month).

    Parameters
    ----------
    group_by_parameter : str
        Either 'Week' or 'Month'; names the column of ``data`` that is
        combined with 'Year' to form the time bucket.
    data : pandas.DataFrame
        One row per incident, with the columns 'District', 'Year' and the
        column named by ``group_by_parameter``.

    Returns
    -------
    dict
        Maps each district to a DataFrame with the columns 'Year',
        'PoliceDistrict', '<Week|Month>_Number', 'Year_<Week|Month>_Number'
        and 'NoOfCrimes', ordered by year and then by week/month. Only
        buckets with at least one incident appear. Rows whose district is
        missing (NaN) are not counted.

    Raises
    ------
    AttributeError
        If ``group_by_parameter`` is neither 'Week' nor 'Month'.
    """
    if group_by_parameter not in ('Week', 'Month'):
        raise AttributeError(
            "group_by_parameter must be 'Week' or 'Month', got %r" % (group_by_parameter,)
        )

    parameter_name = '%s_Number' % group_by_parameter
    columns = ['Year', 'PoliceDistrict', parameter_name, 'Year_%s' % parameter_name, 'NoOfCrimes']

    ts_data = {}
    # A single pass splits the frame per district instead of re-filtering the
    # whole frame once for every district.
    for district, district_rows in data.groupby('District'):
        # size() counts the rows in each (year, week/month) bucket; groupby
        # sorts the keys, so the records come out in chronological order.
        bucket_sizes = district_rows.groupby(['Year', group_by_parameter]).size()
        records = [
            [year_num, district, parameter_num, "%s_%s" % (year_num, parameter_num), total_crimes]
            for (year_num, parameter_num), total_crimes in bucket_sizes.items()
        ]
        ts_data[district] = pd.DataFrame(records, columns=columns)

    return ts_data

In [16]:
# Weekly incident counts, one DataFrame per police district (dict keyed by district).
ts_data_week = ts_data_creation('Week', data)
ts_data_week

{'CENTRAL':      Year PoliceDistrict  Week_Number Year_Week_Number  NoOfCrimes
 0    2014        CENTRAL            1           2014_1           1
 1    2014        CENTRAL            3           2014_3           1
 2    2014        CENTRAL            4           2014_4           1
 3    2014        CENTRAL            9           2014_9           1
 4    2014        CENTRAL           18          2014_18           1
 ..    ...            ...          ...              ...         ...
 149  2020        CENTRAL           18          2020_18           1
 150  2020        CENTRAL           19          2020_19           1
 151  2020        CENTRAL           20          2020_20           1
 152  2020        CENTRAL           21          2020_21           1
 153  2020        CENTRAL           22          2020_22           2
 
 [154 rows x 5 columns],
 'SOUTHERN':      Year PoliceDistrict  Week_Number Year_Week_Number  NoOfCrimes
 0    2014       SOUTHERN            4           2014_4           

## Data Preprocessing
Add the missing weeks to the dataset: if a district has no recorded incident in a given week, `NoOfCrimes` for that week is 0; otherwise it is the actual number of incidents.

Create a dictionary that maps each year to the list of its weeks. The last year (2020) is incomplete, so it only contains the weeks up to the latest entry in the dataset.

In [17]:
from isoweek import Week

# First and last year of the weekly series. The last year is treated as
# incomplete and is derived from the data below, so changing this pair is
# enough to move the window.
year_range = [2014, 2020]
first_year, final_year = year_range

# year -> list of [year, week number, "<year>_<week>"] rows
year_week_dictionary = {}

# Years before the final one get every week up to the number reported by
# Week.last_week_of_year (index 1 of its result is used as the week number).
for year in range(first_year, final_year):
    last_week = Week.last_week_of_year(year)[1]
    year_week_dictionary[year] = [
        [year, week, "%s_%s"%(year, week)] for week in range(1, last_week + 1)
    ]

# The final year only runs up to the latest week that has an incident.
# If that year has no rows at all it contributes no weeks instead of
# failing on max() of an empty sequence.
final_year_weeks = data.loc[data['Year'] == final_year, 'Week']
last_week_number = int(final_year_weeks.max()) if len(final_year_weeks) else 0

year_week_dictionary[final_year] = [
    [final_year, week, "%s_%s"%(final_year, week)] for week in range(1, last_week_number + 1)
]

Create a DataFrame listing every year and week number for each district.

In [18]:
# The year/week scaffold is the same for every district, so it is assembled
# once from all rows of year_week_dictionary (in the dictionary's year order)
# rather than re-concatenated year by year for each district. Building it in
# one go also gives it a unique 0..n-1 index.
scaffold_rows = [
    row
    for year_rows in year_week_dictionary.values()
    for row in year_rows
]
year_week_scaffold = pd.DataFrame(scaffold_rows, columns=['Year', 'Week_Number', 'Year_Week_Number'])

# Each district receives its own copy because columns are added per district later.
district_data_dictionary = {
    district: year_week_scaffold.copy() for district in ts_data_week
}

If a given week of a year has no entry for the district, its number of crimes is 0;
otherwise it is the recorded number of crimes for that week.

In [19]:
output_columns = ['Year','PoliceDistrict','Week_Number','Year_Week_Number','NoOfCrimes']

for district in district_data_dictionary:
    target = district_data_dictionary[district]
    available_data = ts_data_week[district]

    # Lookup table "<year>_<week>" -> incident count for this district. It
    # replaces scanning the whole column for every scaffold row. Should a label
    # occur more than once, the first row wins.
    crimes_by_week = (
        available_data
        .drop_duplicates('Year_Week_Number')
        .set_index('Year_Week_Number')['NoOfCrimes']
    )

    # Counts whose label is not part of the scaffold cannot be placed anywhere;
    # say so instead of losing them without a trace.
    unmatched = ~crimes_by_week.index.isin(target['Year_Week_Number'])
    if unmatched.any():
        print ('%s: %s incident(s) in %s week(s) are outside the year/week scaffold and are not exported'
               % (district, crimes_by_week[unmatched].sum(), int(unmatched.sum())))

    # Weeks without an entry map to NaN and are turned into 0 incidents.
    # .values hands over plain positions, so no index alignment is involved.
    NoOfCrimes = target['Year_Week_Number'].map(crimes_by_week).fillna(0).astype('int64').values

    district_data_dictionary[district] = target.assign(
        NoOfCrimes = NoOfCrimes,
        PoliceDistrict = district,
    )[output_columns]

# From here on ts_data_week refers to the gap-filled frames.
ts_data_week = district_data_dictionary

In [20]:
# Stack all per-district frames with a single concat instead of growing a
# frame inside a loop. With no districts at all an empty frame is exported.
district_frames = [ts_data_week[district] for district in ts_data_week]
df = pd.concat(district_frames, axis = 0) if district_frames else pd.DataFrame()

# One CSV per crime type, written without the index column.
file_path = os.path.join('../Clean Dataset/Sub Crimes By Neighborhood/Preprocessed_data', 'Preprocessed_%s_data.csv'%crime_type)
df.to_csv(file_path, index = False)