In [2]:
import pandas as pd
import numpy as np
import sqlite3

In [3]:
raw_df_19_20 = pd.read_csv("2019-20-fullyr-data_sa_crime.csv") # returns 95702 entries  C:\Users\Laura\Desktop\Project-2-Visualisations\Data\raw_data\2019-20-fullyr-data_sa_crime.csv
raw_df_18_19 = pd.read_csv("2018-19-data_sa_crime.csv")        # returns 94937 entries
raw_df_17_18 = pd.read_csv("2017-18-data_sa_crime.csv")        # returns 90655 entries
raw_df_16_17 = pd.read_csv("2016-17-data_sa_crime.csv")        # returns 90682 entries
raw_df_15_16 = pd.read_csv("2015-16-data_sa_crime.csv")        # returns 93167 entries
raw_df_14_15 = pd.read_csv("2014-15-data_sa_crime.csv")        # returns 93167 entries

In [4]:
# combining dataframes
frames = [raw_df_14_15, raw_df_15_16 , raw_df_16_17 , raw_df_17_18 , raw_df_18_19 , raw_df_19_20]
raw_combined_crime = pd.concat(frames)

In [5]:
# look for missing values
raw_combined_crime.count()

Reported Date                  555414
Suburb - Incident              554240
Postcode - Incident            553205
Offence Level 1 Description    555414
Offence Level 2 Description    555414
Offence Level 3 Description    555414
Offence count                  555414
dtype: int64

In [6]:
# drop missing values
clean_crime = raw_combined_crime.dropna(how='any')

In [7]:
# rename cols
clean_crime = clean_crime.rename(columns={
    "Reported Date": "date",
    "Suburb - Incident":"locality",
    "Postcode - Incident": "postcode",
    "Offence Level 1 Description": "offence1",
    "Offence Level 2 Description": "offence2",
    "Offence Level 3 Description": "offence3",
    "Offence count": "count",})

In [8]:
# check for null values
clean_crime.isnull().sum()

date        0
locality    0
postcode    0
offence1    0
offence2    0
offence3    0
count       0
dtype: int64

In [9]:
# check for duplicates
clean_crime.duplicated().sum()

0

In [10]:
clean_crime.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 553181 entries, 0 to 95699
Data columns (total 7 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   date      553181 non-null  object 
 1   locality  553181 non-null  object 
 2   postcode  553181 non-null  object 
 3   offence1  553181 non-null  object 
 4   offence2  553181 non-null  object 
 5   offence3  553181 non-null  object 
 6   count     553181 non-null  float64
dtypes: float64(1), object(6)
memory usage: 33.8+ MB


In [11]:
postcodes_df = pd.read_csv("australian_postcodes.csv")
postcodes_df.head()

Unnamed: 0,id,postcode,locality,state,long,lat,dc,type,status,sa3,...,SA3_NAME_2016,SA4_CODE_2016,SA4_NAME_2016,RA_2011,RA_2016,MMM_2015,MMM_2019,ced,altitude,chargezone
0,230,200,ANU,ACT,149.119,-35.2777,,,,,...,North Canberra,801.0,Australian Capital Territory,1.0,1.0,1.0,1.0,,,N2
1,21820,200,Australian National University,ACT,149.1189,-35.2777,,,Added 19-Jan-2020,,...,North Canberra,801.0,Australian Capital Territory,1.0,1.0,1.0,1.0,,,N2
2,232,800,DARWIN,NT,130.83668,-12.458684,,,Updated 6-Feb-2020,70101.0,...,Darwin City,701.0,Darwin,3.0,3.0,2.0,2.0,,,NT1
3,233,801,DARWIN,NT,130.83668,-12.458684,,,Updated 25-Mar-2020 SA3,70101.0,...,Darwin City,701.0,Darwin,3.0,3.0,2.0,2.0,,,NT1
4,234,804,PARAP,NT,130.873315,-12.428017,,,Updated 25-Mar-2020 SA3,70102.0,...,Darwin City,701.0,Darwin,3.0,3.0,2.0,2.0,,,NT1


In [12]:
sa_postcodes = postcodes_df[postcodes_df["state"]=="SA"]
sa_geo_info = sa_postcodes[["locality", "lat","long"]]

In [13]:
sa_geo_info

Unnamed: 0,locality,lat,long
305,AMATA,-26.127662,131.173880
308,ANANGU PITJANTJATJARA YANKUNYTJATJARA,-27.077283,131.205697
314,AYERS RANGE SOUTH,-26.190601,133.268032
322,DE ROSE HILL,-26.556097,133.311196
326,ERNABELLA,-21.949513,131.298809
...,...,...,...
15671,ADELAIDE,-35.120097,139.273782
15672,ADELAIDE,-35.120097,139.273782
15673,REGENCY PARK,-34.860017,138.565906
15674,ADELAIDE AIRPORT,-34.945146,138.530183


In [14]:
# dropping any duplicates for locality
sa_geo_info_cleaned = sa_geo_info.drop_duplicates(subset="locality")

In [15]:
sa_geo_info_cleaned

Unnamed: 0,locality,lat,long
305,AMATA,-26.127662,131.173880
308,ANANGU PITJANTJATJARA YANKUNYTJATJARA,-27.077283,131.205697
314,AYERS RANGE SOUTH,-26.190601,133.268032
322,DE ROSE HILL,-26.556097,133.311196
326,ERNABELLA,-21.949513,131.298809
...,...,...,...
15667,SIMPSON DESERT,-26.678030,137.432565
15668,TODMORDEN,-27.283873,134.827062
15669,WITJIRA,-26.338510,135.675034
15674,ADELAIDE AIRPORT,-34.945146,138.530183


In [16]:
# merging crime data with geo data
df = pd.merge(clean_crime, sa_geo_info_cleaned, on="locality", how="left")

In [18]:
df.head()

Unnamed: 0,date,locality,postcode,offence1,offence2,offence3,count,lat,long
0,1/07/2014,ADELAIDE,5000,OFFENCES AGAINST PROPERTY,FRAUD DECEPTION AND RELATED OFFENCES,Obtain benefit by deception,2.0,-34.937459,138.608637
1,1/07/2014,ADELAIDE,5000,OFFENCES AGAINST PROPERTY,FRAUD DECEPTION AND RELATED OFFENCES,"Other fraud, deception and related offences",1.0,-34.937459,138.608637
2,1/07/2014,ADELAIDE,5000,OFFENCES AGAINST PROPERTY,PROPERTY DAMAGE AND ENVIRONMENTAL,Graffiti,1.0,-34.937459,138.608637
3,1/07/2014,ADELAIDE,5000,OFFENCES AGAINST PROPERTY,PROPERTY DAMAGE AND ENVIRONMENTAL,Other property damage and environmental,2.0,-34.937459,138.608637
4,1/07/2014,ADELAIDE,5000,OFFENCES AGAINST PROPERTY,SERIOUS CRIMINAL TRESPASS,Other unlawful entry with intent,7.0,-34.937459,138.608637


In [63]:
database = "../sa_crime_new.sqlite"
conn = sqlite3.connect(database)

In [64]:
df.to_sql(name="crime_data", con=conn, if_exists="fail")


In [1]:
df

NameError: name 'df' is not defined