<a href="https://colab.research.google.com/github/Sparrow0hawk/crime_sim_toolkit/blob/develop-CSS-match/data_manipulation/CrimeDes_toCSS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Map Crime Descriptions to Crime Severity Scores 



In [1]:
# get the crime seversity score xlsx file
!wget -O cssdatatool.xls https://www.ons.gov.uk/file?uri=/peoplepopulationandcommunity/crimeandjustice/datasets/crimeseverityscoreexperimentalstatistics/current/cssdatatool.xls

--2019-09-09 11:02:16--  https://www.ons.gov.uk/file?uri=/peoplepopulationandcommunity/crimeandjustice/datasets/crimeseverityscoreexperimentalstatistics/current/cssdatatool.xls
Resolving www.ons.gov.uk (www.ons.gov.uk)... 104.20.61.76, 104.20.60.76, 2606:4700:10::6814:3c4c, ...
Connecting to www.ons.gov.uk (www.ons.gov.uk)|104.20.61.76|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/vnd.ms-excel]
Saving to: ‘cssdatatool.xls’

cssdatatool.xls         [   <=>              ]   8.96M  13.7MB/s    in 0.7s    

2019-09-09 11:02:23 (13.7 MB/s) - ‘cssdatatool.xls’ saved [9394176]



In [2]:
# check the downloaded workbook is in /content
!ls

cssdatatool.xls  sample_data


In [0]:
import pandas as pd
import numpy as np

In [0]:
# load the existing crime-description table; its first CSV column is used as the index
crime_des_url = 'https://raw.githubusercontent.com/Sparrow0hawk/crime_sim_toolkit/develop-CSS-match/crime_sim_toolkit/src/simple_policeuk_perc_offence.csv'
crime_des_data = pd.read_csv(crime_des_url, index_col=0)

# open the downloaded CSS workbook so that individual sheets can be parsed later
css_workbook_path = 'cssdatatool.xls'
CSS_score_xls = pd.ExcelFile(css_workbook_path)

In [0]:
# lowercase every crime description so string matching is case-insensitive
crime_des_data['Offence_Description'] = crime_des_data['Offence_Description'].str.lower()

# store every crime code as a string so codes compare consistently
crime_des_data['Offence_Code'] = crime_des_data['Offence_Code'].astype(str)

In [0]:
# read the 'List of weights' sheet, skipping the first four rows of the sheet
weights = CSS_score_xls.parse(sheet_name='List of weights', skiprows=4)

In [7]:
# preview the first rows of the freshly parsed sheet before any clean-up
weights.head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Offence,Weight
0,,,,
1,VICTIM BASED CRIME - VIOLENCE,,,
2,,,,
3,,"1, 4.1/2/10",Homicide,7973.393423
4,,,,


In [8]:
# ignore the first column (in the preview it appears to hold only section
# headings) and keep the rest. An explicit copy is taken so that the column
# assignments and in-place dropna in the following cell operate on an
# independent frame rather than on a slice of the parsed sheet, which would
# raise SettingWithCopyWarning.
# NOTE: this cell is positional, so re-running it without re-parsing the sheet
# would drop a further column.
weights = weights.iloc[:, 1:].copy()

# visualise
weights.head()

Unnamed: 0,Unnamed: 1,Offence,Weight
0,,,
1,,,
2,,,
3,"1, 4.1/2/10",Homicide,7973.393423
4,,,


In [0]:
# give the three remaining columns meaningful names
weights.columns = ['Offence_cd', 'Offence_Description', 'Weight']

# lowercase the descriptions and cast the codes to strings, then drop rows
# that still contain NAs.
# NOTE: the string cast happens before the NA drop, so a missing code is
# presumably turned into the text 'nan' and no longer counts as NA; the drop
# is then effectively driven by the description and weight columns.
weights = weights.assign(
    Offence_Description=weights['Offence_Description'].str.lower(),
    Offence_cd=weights['Offence_cd'].astype(str),
).dropna()

In [10]:
# preview the cleaned weights table (renamed columns, lowercase descriptions, NA rows removed)
weights.head()

Unnamed: 0,Offence_cd,Offence_Description,Weight
3,"1, 4.1/2/10",homicide,7973.393423
5,4.4,causing death or serious injury by dangerous d...,1022.814458
6,4.6,causing death by careless driving when under t...,1512.002797
7,4.8,causing death by careless or inconsiderate dri...,111.361322
8,4.4/6/8,causing death by dangerous or careless driving,690.720426


Start constructing a dataframe that includes weights matched either by crime code or by string match on the crime description.

In [11]:
# work on a copy of the crime description dataframe and keep one row per
# distinct description (the descriptions repeat for each local authority),
# which should leave the 132 crime descriptions
basic_frame = crime_des_data.copy().drop_duplicates(subset=['Offence_Description'])

# check shape is as expected
basic_frame.shape

(132, 8)

In [12]:
# keep only the classification, description and code columns, then visualise
useful_columns = [
    'Policeuk_Cat',
    'Offence_Group',
    'Offence_Subgroup',
    'Offence_Description',
    'Offence_Code',
]
basic_frame = basic_frame[useful_columns]

basic_frame.head()

Unnamed: 0,Policeuk_Cat,Offence_Group,Offence_Subgroup,Offence_Description,Offence_Code
0,Other crime,Fraud offences,Fraud: action fraud,fraud offences recorded by action fraud,AF
1,Other crime,Miscellaneous crimes,Miscellaneous crimes,absconding from lawful custody,80
2,Violence and Sexual Offences,Sexual offences,Other sexual offences,abuse of children through sexual exploitation,71
3,Violence and Sexual Offences,Sexual offences,Other sexual offences,abuse of position of trust of a sexual nature,73
4,Burglary,Theft offences,Non-domestic burglary,aggravated burglary business and community,31A


In [0]:
# weights whose description also appears in the crime description data,
# indexed by that description
known_descriptions = crime_des_data.Offence_Description.unique().tolist()
weight_by_description = weights[weights.Offence_Description.isin(known_descriptions)].set_index('Offence_Description')['Weight']

# attach the Weight column to basic_frame wherever the description strings match exactly
weights_frame = basic_frame.set_index('Offence_Description').join(weight_by_description)

# move Offence_Description back to a column and restore a numeric index
weights_frame = weights_frame.reset_index()

In [14]:
# visualise
weights_frame.head()

Unnamed: 0,Offence_Description,Policeuk_Cat,Offence_Group,Offence_Subgroup,Offence_Code,Weight
0,absconding from lawful custody,Other crime,Miscellaneous crimes,Miscellaneous crimes,80,192.375992
1,abuse of children through sexual exploitation,Violence and Sexual Offences,Sexual offences,Other sexual offences,71,939.264396
2,abuse of position of trust of a sexual nature,Violence and Sexual Offences,Sexual offences,Other sexual offences,73,240.649045
3,aggravated burglary business and community,Burglary,Theft offences,Non-domestic burglary,31A,
4,aggravated burglary in a building other than a...,Burglary,Theft offences,Non-domestic burglary,31,


In [0]:
# weights whose offence code also appears in the crime description data,
# indexed by that code
known_codes = crime_des_data.Offence_Code.unique().tolist()
weight_by_code = weights[weights.Offence_cd.isin(known_codes)].set_index('Offence_cd')['Weight']

# add a second weight column matched on offence code; the incoming column
# clashes with the existing 'Weight', so it receives the '_cd' suffix
weights_frame = weights_frame.set_index('Offence_Code').join(weight_by_code, rsuffix='_cd')

weights_frame = weights_frame.reset_index()

# rename columns (positional: the code column becomes 'Offence_code')
weights_frame.columns = ['Offence_code','Offence_Description','Policeuk_Cat','Offence_Group','Offence_Subgroup','Weight','Weight_cd']

In [16]:
# combine the two candidate weights into one column: prefer the weight matched
# on the description and fall back to the weight matched on the offence code.
# Adding the two columns together would double the weight of any offence that
# matched on BOTH its description and its code.
weights_frame['Weight'] = weights_frame['Weight'].fillna(weights_frame['Weight_cd'])

# drop the Weight_cd column (now redundant as we've combined weights)
weights_frame = weights_frame.drop(columns='Weight_cd')

# fill remaining NaNs with 0 so unmatched offences can be found with `Weight == 0`
weights_frame = weights_frame.fillna(0)

# check
weights_frame.head()

Unnamed: 0,Offence_code,Offence_Description,Policeuk_Cat,Offence_Group,Offence_Subgroup,Weight
0,1/4.1/4.10/4.2,homicide,Violence and Sexual Offences,Violence against the person,Homicide,7973.393423
1,104,assault without injury on a constable,Violence and Sexual Offences,Violence against the person,Violence without injury,15.087613
2,105A,assault without injury,Violence and Sexual Offences,Violence against the person,Violence without injury,27.021083
3,105B,racially or religiously aggravated assault wit...,Violence and Sexual Offences,Violence against the person,Violence without injury,61.526472
4,106,modern slavery,Violence and Sexual Offences,Violence against the person,Violence without injury,2138.128205


In [17]:
# crime categories that received no weight from either join (Weight is still 0)
weights_frame.loc[weights_frame['Weight'] == 0]

Unnamed: 0,Offence_code,Offence_Description,Policeuk_Cat,Offence_Group,Offence_Subgroup,Weight
33,28A,burglary in a dwelling(outcome only),Burglary,Theft offences,Domestic burglary,0.0
43,30A,burglary in a building other than a dwelling (...,Burglary,Theft offences,Non-domestic burglary,0.0
131,AF,fraud offences recorded by action fraud,Other crime,Fraud offences,Fraud: action fraud,0.0
132,CIFAS,fraud offences recorded by cifas,Other crime,Fraud offences,Fraud: CIFAS,0.0
133,FFA UK,fraud offences recorded by financial fraud act...,Other crime,Fraud offences,Fraud: Financial Fraud Action UK,0.0
134,UK Finance,fraud offences recorded by uk finance,Other crime,Fraud offences,Fraud: UK Finance,0.0


In [18]:
# the two burglary categories are unmatched because of shared crime codes,
# which could be exploded into individual rows

# for now we'll hack our way to setting these values

is_burglary = weights_frame['Offence_Description'].str.contains('burg')
has_no_weight = weights_frame['Weight'] == 0
weights_frame[is_burglary & has_no_weight]

Unnamed: 0,Offence_code,Offence_Description,Policeuk_Cat,Offence_Group,Offence_Subgroup,Weight
33,28A,burglary in a dwelling(outcome only),Burglary,Theft offences,Domestic burglary,0.0
43,30A,burglary in a building other than a dwelling (...,Burglary,Theft offences,Non-domestic burglary,0.0


In [0]:
# set the weight for offence code 28A from the first weights row whose code
# string contains '28A' (the original author notes this cell generates a warning)
weight_28a = weights.loc[weights['Offence_cd'].str.contains('28A'), 'Weight'].iloc[0]

weights_frame.loc[weights_frame['Offence_code'] == '28A', 'Weight'] = weight_28a

In [0]:
# set the weight for offence code 30A from the first weights row whose code
# string contains '30A'. The lookup must search for '30A': searching for '28A'
# here would give 30A the 28A weight.
# If no weights row contains '30A', .iloc[0] raises IndexError, which is
# preferable to silently assigning another offence's weight.
weight_30a = weights.loc[weights['Offence_cd'].str.contains('30A'), 'Weight'].iloc[0]

weights_frame.loc[weights_frame['Offence_code'] == '30A', 'Weight'] = weight_30a

In [21]:
# crime categories still without a weight after the burglary codes were set
weights_frame.loc[weights_frame['Weight'] == 0]

Unnamed: 0,Offence_code,Offence_Description,Policeuk_Cat,Offence_Group,Offence_Subgroup,Weight
131,AF,fraud offences recorded by action fraud,Other crime,Fraud offences,Fraud: action fraud,0.0
132,CIFAS,fraud offences recorded by cifas,Other crime,Fraud offences,Fraud: CIFAS,0.0
133,FFA UK,fraud offences recorded by financial fraud act...,Other crime,Fraud offences,Fraud: Financial Fraud Action UK,0.0
134,UK Finance,fraud offences recorded by uk finance,Other crime,Fraud offences,Fraud: UK Finance,0.0


In [22]:
# for now, use the mean weight over every weights row whose description mentions 'fraud'
mentions_fraud = weights['Offence_Description'].str.contains('fraud')

weights.loc[mentions_fraud, 'Weight'].mean()

116.91071432167321

In [0]:
# assign the mean fraud weight to every row that still has a weight of 0
# (this applies to ALL zero-weight rows; the preceding check shows only
# fraud categories remain)
fraud_mean_weight = weights.loc[weights['Offence_Description'].str.contains('fraud'), 'Weight'].mean()

weights_frame.loc[weights_frame['Weight'] == 0, 'Weight'] = fraud_mean_weight

In [24]:
# confirm that no crime category is left with a weight of 0
weights_frame.loc[weights_frame['Weight'] == 0]

Unnamed: 0,Offence_code,Offence_Description,Policeuk_Cat,Offence_Group,Offence_Subgroup,Weight


In [0]:
# put the columns into their final order, with the weight last
final_columns = [
    'Policeuk_Cat',
    'Offence_Group',
    'Offence_Subgroup',
    'Offence_Description',
    'Offence_code',
    'Weight',
]
weights_frame = weights_frame[final_columns]

In [26]:
# display the final table of crime descriptions with their weights
weights_frame

Unnamed: 0,Policeuk_Cat,Offence_Group,Offence_Subgroup,Offence_Description,Offence_code,Weight
0,Violence and Sexual Offences,Violence against the person,Homicide,homicide,1/4.1/4.10/4.2,7973.393423
1,Violence and Sexual Offences,Violence against the person,Violence without injury,assault without injury on a constable,104,15.087613
2,Violence and Sexual Offences,Violence against the person,Violence without injury,assault without injury,105A,27.021083
3,Violence and Sexual Offences,Violence against the person,Violence without injury,racially or religiously aggravated assault wit...,105B,61.526472
4,Violence and Sexual Offences,Violence against the person,Violence without injury,modern slavery,106,2138.128205
5,Possession of Weapons,Possession of weapons offences,Possession of weapons offences,possession of firearms with intent,10A,1233.120179
6,Possession of Weapons,Possession of weapons offences,Possession of weapons offences,possession of firearms offences,10B,824.404789
7,Possession of Weapons,Possession of weapons offences,Possession of weapons offences,possession of other weapons,10C,104.252194
8,Possession of Weapons,Possession of weapons offences,Possession of weapons offences,possession of article with blade or point,10D,106.076478
9,Violence and Sexual Offences,Violence against the person,Violence without injury,cruelty to children/young persons,11A,277.432663


In [0]:
# write the final table to CSV in the working directory; to_csv is called with
# its defaults, so the DataFrame index is written as the first column
weights_frame.to_csv('crime_des_CSSweights.csv')