In [59]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [60]:
raw_df_19_20 = pd.read_csv("raw_data/2019-20-fullyr-data_sa_crime.csv") # returns 95702 entries
raw_df_18_19 = pd.read_csv("raw_data/2018-19-data_sa_crime.csv")        # returns 94937 entries
raw_df_17_18 = pd.read_csv("raw_data/2017-18-data_sa_crime.csv")        # returns 90655 entries
raw_df_16_17 = pd.read_csv("raw_data/2016-17-data_sa_crime.csv")        # returns 90682 entries
raw_df_15_16 = pd.read_csv("raw_data/2015-16-data_sa_crime.csv")        # returns 93167 entries

In [61]:
# combining dataframes
frames = [raw_df_15_16 , raw_df_16_17 , raw_df_17_18 , raw_df_18_19 , raw_df_19_20]
crime_combined = pd.concat(frames)

In [62]:
# look for missing values
crime_combined.count()

Reported Date                  465143
Suburb - Incident              464056
Postcode - Incident            463212
Offence Level 1 Description    465143
Offence Level 2 Description    465143
Offence Level 3 Description    465143
Offence count                  465143
dtype: int64

In [63]:
# drop missing values and save to clean_df
crime_cleaned = crime_combined.dropna(how='any')
crime_cleaned.count()

Reported Date                  463192
Suburb - Incident              463192
Postcode - Incident            463192
Offence Level 1 Description    463192
Offence Level 2 Description    463192
Offence Level 3 Description    463192
Offence count                  463192
dtype: int64

In [64]:
# check for null values
crime_cleaned.isnull().sum()

Reported Date                  0
Suburb - Incident              0
Postcode - Incident            0
Offence Level 1 Description    0
Offence Level 2 Description    0
Offence Level 3 Description    0
Offence count                  0
dtype: int64

In [65]:
# check for duplicates
crime_cleaned.duplicated().sum()

0

In [66]:
crime_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 463192 entries, 0 to 95699
Data columns (total 7 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Reported Date                463192 non-null  object 
 1   Suburb - Incident            463192 non-null  object 
 2   Postcode - Incident          463192 non-null  object 
 3   Offence Level 1 Description  463192 non-null  object 
 4   Offence Level 2 Description  463192 non-null  object 
 5   Offence Level 3 Description  463192 non-null  object 
 6   Offence count                463192 non-null  float64
dtypes: float64(1), object(6)
memory usage: 28.3+ MB


In [67]:
postcodes_df = pd.read_csv("raw_data/australian_postcodes.csv")
postcodes_df.head()

Unnamed: 0,id,postcode,locality,state,long,lat,dc,type,status,sa3,...,SA3_NAME_2016,SA4_CODE_2016,SA4_NAME_2016,RA_2011,RA_2016,MMM_2015,MMM_2019,ced,altitude,chargezone
0,230,200,ANU,ACT,149.119,-35.2777,,,,,...,North Canberra,801.0,Australian Capital Territory,1.0,1.0,1.0,1.0,,,N2
1,21820,200,Australian National University,ACT,149.1189,-35.2777,,,Added 19-Jan-2020,,...,North Canberra,801.0,Australian Capital Territory,1.0,1.0,1.0,1.0,,,N2
2,232,800,DARWIN,NT,130.83668,-12.458684,,,Updated 6-Feb-2020,70101.0,...,Darwin City,701.0,Darwin,3.0,3.0,2.0,2.0,,,NT1
3,233,801,DARWIN,NT,130.83668,-12.458684,,,Updated 25-Mar-2020 SA3,70101.0,...,Darwin City,701.0,Darwin,3.0,3.0,2.0,2.0,,,NT1
4,234,804,PARAP,NT,130.873315,-12.428017,,,Updated 25-Mar-2020 SA3,70102.0,...,Darwin City,701.0,Darwin,3.0,3.0,2.0,2.0,,,NT1


In [68]:
postcodes_df.columns

Index(['id', 'postcode', 'locality', 'state', 'long', 'lat', 'dc', 'type',
       'status', 'sa3', 'sa3name', 'sa4', 'sa4name', 'region', 'Lat_precise',
       'Long_precise', 'SA1_MAINCODE_2011', 'SA1_MAINCODE_2016',
       'SA2_MAINCODE_2016', 'SA2_NAME_2016', 'SA3_CODE_2016', 'SA3_NAME_2016',
       'SA4_CODE_2016', 'SA4_NAME_2016', 'RA_2011', 'RA_2016', 'MMM_2015',
       'MMM_2019', 'ced', 'altitude', 'chargezone'],
      dtype='object')

In [69]:
sa_postcodes = postcodes_df[postcodes_df["state"]== "SA"]

In [70]:
sa_postcodes_clean = sa_postcodes[["locality", "long", "lat"]]

In [71]:
sa_postcodes_clean[sa_postcodes_clean["locality"]=="ABERFOYLE PARK"]

Unnamed: 0,locality,long,lat
14084,ABERFOYLE PARK,138.603897,-35.061705


In [72]:
crime_cleaned["Suburb - Incident"]

0         ABERFOYLE PARK
1               ADELAIDE
2               ADELAIDE
3               ADELAIDE
4               ADELAIDE
              ...       
95695    WESTBOURNE PARK
95696     WEST HINDMARSH
95697         WEST LAKES
95698     WHYALLA NORRIE
95699    WOODVILLE NORTH
Name: Suburb - Incident, Length: 463192, dtype: object

In [73]:
crime_cleaned = crime_cleaned.rename(columns={
    "Suburb - Incident": "locality"
})

In [74]:
clean_df.head()

Unnamed: 0,Reported Date,locality,Postcode - Incident,Offence Level 1 Description,Offence Level 2 Description,Offence Level 3 Description,Offence count
0,1/07/2015,ABERFOYLE PARK,5159,OFFENCES AGAINST PROPERTY,SERIOUS CRIMINAL TRESPASS,SCT - Non Residence,2.0
1,1/07/2015,ADELAIDE,5000,OFFENCES AGAINST PROPERTY,FRAUD DECEPTION AND RELATED OFFENCES,Obtain benefit by deception,1.0
2,1/07/2015,ADELAIDE,5000,OFFENCES AGAINST PROPERTY,PROPERTY DAMAGE AND ENVIRONMENTAL,Other property damage and environmental,2.0
3,1/07/2015,ADELAIDE,5000,OFFENCES AGAINST PROPERTY,THEFT AND RELATED OFFENCES,Other theft,9.0
4,1/07/2015,ADELAIDE,5000,OFFENCES AGAINST PROPERTY,THEFT AND RELATED OFFENCES,Theft from motor vehicle,1.0


In [75]:
df = pd.merge(crime_cleaned, sa_postcodes_clean, on="locality", how="left")

In [77]:
df["long"].isnull().sum()

4032

In [80]:
df[df["lat"].isnull()]

Unnamed: 0,Reported Date,locality,Postcode - Incident,Offence Level 1 Description,Offence Level 2 Description,Offence Level 3 Description,Offence count,long,lat
194,1/07/2015,NOT DISCLOSED,NOT DISCLOSED,OFFENCES AGAINST THE PERSON,SEXUAL ASSAULT AND RELATED OFFENCES,Aggravated sexual assault,5.0,,
477,2/07/2015,NOT DISCLOSED,NOT DISCLOSED,OFFENCES AGAINST THE PERSON,SEXUAL ASSAULT AND RELATED OFFENCES,Aggravated sexual assault,4.0,,
478,2/07/2015,NOT DISCLOSED,NOT DISCLOSED,OFFENCES AGAINST THE PERSON,SEXUAL ASSAULT AND RELATED OFFENCES,Non-assaultive sexual offences,1.0,,
786,3/07/2015,NOT DISCLOSED,NOT DISCLOSED,OFFENCES AGAINST THE PERSON,SEXUAL ASSAULT AND RELATED OFFENCES,Aggravated sexual assault,1.0,,
787,3/07/2015,NOT DISCLOSED,NOT DISCLOSED,OFFENCES AGAINST THE PERSON,SEXUAL ASSAULT AND RELATED OFFENCES,Non-aggravated sexual assault,1.0,,
...,...,...,...,...,...,...,...,...,...
544488,29/06/2020,NOT DISCLOSED,NOT DISCLOSED,OFFENCES AGAINST THE PERSON,SEXUAL ASSAULT AND RELATED OFFENCES,Aggravated sexual assault,3.0,,
544489,29/06/2020,NOT DISCLOSED,NOT DISCLOSED,OFFENCES AGAINST THE PERSON,SEXUAL ASSAULT AND RELATED OFFENCES,Non-aggravated sexual assault,2.0,,
544733,30/06/2020,NOT DISCLOSED,NOT DISCLOSED,OFFENCES AGAINST THE PERSON,SEXUAL ASSAULT AND RELATED OFFENCES,Aggravated sexual assault,2.0,,
544734,30/06/2020,NOT DISCLOSED,NOT DISCLOSED,OFFENCES AGAINST THE PERSON,SEXUAL ASSAULT AND RELATED OFFENCES,Non-aggravated sexual assault,1.0,,
