In [4]:
from pyspark.sql import SparkSession

In [3]:
# Create the Spark session used by the rest of the notebook, or reuse one
# that is already running.
spark = (
    SparkSession.builder
    .appName("Analysing London Crime Report")
    .getOrCreate()
)

In [5]:
# Load the London crime CSV, taking column names from the header row.
# No schema is supplied or inferred here, so the columns come back as strings.
data = (
    spark.read
    .option("header", "true")
    .csv("../datasets/london_crime_by_lsoa.csv")
)
        


In [7]:
# Print the column names, types and nullability of the loaded DataFrame.
data.printSchema()

root
 |-- lsoa_code: string (nullable = true)
 |-- borough: string (nullable = true)
 |-- major_category: string (nullable = true)
 |-- minor_category: string (nullable = true)
 |-- value: string (nullable = true)
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)



In [8]:
# Total number of rows in the loaded dataset (triggers a full scan of the CSV).
data.count()

13490604

In [9]:
# Preview 10 rows to get a feel for the values in each column.
data.limit(10).show()

+---------+----------+--------------------+--------------------+-----+----+-----+
|lsoa_code|   borough|      major_category|      minor_category|value|year|month|
+---------+----------+--------------------+--------------------+-----+----+-----+
|E01001116|   Croydon|            Burglary|Burglary in Other...|    0|2016|   11|
|E01001646| Greenwich|Violence Against ...|      Other violence|    0|2016|   11|
|E01000677|   Bromley|Violence Against ...|      Other violence|    0|2015|    5|
|E01003774| Redbridge|            Burglary|Burglary in Other...|    0|2016|    3|
|E01004563|Wandsworth|             Robbery|   Personal Property|    0|2008|    6|
|E01001320|    Ealing|  Theft and Handling|         Other Theft|    0|2012|    5|
|E01001342|    Ealing|Violence Against ...|    Offensive Weapon|    0|2010|    7|
|E01002633|  Hounslow|             Robbery|   Personal Property|    0|2013|    4|
|E01003496|    Newham|     Criminal Damage|Criminal Damage T...|    0|2013|    9|
|E01004177|    S

In [10]:
# Spark DataFrames are immutable: dropna() returns a NEW DataFrame and leaves
# `data` unchanged. Bind the result so that rows containing nulls are actually
# removed before the row count is re-checked in the next cell.
data = data.dropna()
# Keep the DataFrame as the cell's last expression so its schema is displayed.
data

DataFrame[lsoa_code: string, borough: string, major_category: string, minor_category: string, value: string, year: string, month: string]

In [11]:
# Count rows again for comparison with the total reported earlier.
data.count()

13490604

In [12]:
# drop() returns a new DataFrame and does not modify `data`, so keep the result
# under its own name instead of discarding it. `data` is deliberately not
# rebound: later cells still display the lsoa_code column.
data_without_lsoa = data.drop("lsoa_code")
# Display the resulting schema (same output the bare expression produced).
data_without_lsoa

DataFrame[borough: string, major_category: string, minor_category: string, value: string, year: string, month: string]

In [13]:
# One row per unique borough name found in the dataset.
total_borough = data.select(data.borough).distinct()

In [14]:
# List the distinct boroughs; show() prints only the first 20 rows by default.
total_borough.show()

+--------------------+
|             borough|
+--------------------+
|             Croydon|
|          Wandsworth|
|              Bexley|
|             Lambeth|
|Barking and Dagenham|
|              Camden|
|           Greenwich|
|              Newham|
|       Tower Hamlets|
|            Hounslow|
|              Barnet|
|              Harrow|
|Kensington and Ch...|
|           Islington|
|               Brent|
|            Haringey|
|             Bromley|
|              Merton|
|         Westminster|
|             Hackney|
+--------------------+
only showing top 20 rows



In [15]:
# Number of distinct boroughs in the dataset.
total_borough.count()

33

In [16]:
# Keep only the records whose borough is exactly "Hackney".
hackney_data = data.where(data.borough == "Hackney")

In [17]:
# Preview the Hackney-only rows (show() prints at most 20 rows by default).
hackney_data.show()

+---------+-------+--------------------+--------------------+-----+----+-----+
|lsoa_code|borough|      major_category|      minor_category|value|year|month|
+---------+-------+--------------------+--------------------+-----+----+-----+
|E01001786|Hackney|     Criminal Damage|Criminal Damage T...|    0|2011|    6|
|E01001794|Hackney|Violence Against ...|          Harassment|    1|2013|    2|
|E01001787|Hackney|     Criminal Damage|Other Criminal Da...|    0|2011|    7|
|E01001738|Hackney|Violence Against ...|        Wounding/GBH|    0|2013|   12|
|E01001807|Hackney|  Theft and Handling|  Other Theft Person|    0|2016|    8|
|E01001733|Hackney|            Burglary|Burglary in a Dwe...|    2|2008|    5|
|E01001806|Hackney|             Robbery|   Business Property|    0|2016|    7|
|E01001734|Hackney|  Theft and Handling|Theft/Taking of P...|    0|2009|   12|
|E01001750|Hackney|               Drugs|    Drug Trafficking|    0|2014|    4|
|E01001828|Hackney|  Theft and Handling|Handling Sto

In [19]:
# Restrict to records from 2015 and 2016. `year` is a string column, so the
# candidate values are given as strings.
data_2015_2016 = data.where(data.year.isin("2015", "2016"))

In [20]:
# Show a ~10% random sample of the 2015-2016 records. A fixed seed makes the
# sample reproducible, so the same rows are drawn each time the notebook is
# re-run against the same data.
data_2015_2016.sample(fraction=0.1, seed=42).show()

+---------+--------------------+--------------------+--------------------+-----+----+-----+
|lsoa_code|             borough|      major_category|      minor_category|value|year|month|
+---------+--------------------+--------------------+--------------------+-----+----+-----+
|E01001646|           Greenwich|Violence Against ...|      Other violence|    0|2016|   11|
|E01004177|              Sutton|  Theft and Handling|Theft/Taking of P...|    1|2016|    8|
|E01004436|      Waltham Forest|Other Notifiable ...|      Going Equipped|    0|2015|    2|
|E01001206|              Ealing|             Robbery|   Personal Property|    0|2015|    7|
|E01002276|            Havering|            Burglary|Burglary in a Dwe...|    1|2016|    8|
|E01001005|             Croydon|  Theft and Handling|  Other Theft Person|    0|2015|    3|
|E01003165|             Lambeth|Violence Against ...|      Other violence|    0|2016|    7|
|E01003084|             Lambeth|               Drugs|    Drug Trafficking|    0|

In [22]:
# Records from 2014 onwards. `year` is loaded as a string, so cast it to an
# integer and compare numerically rather than relying on lexicographic string
# ordering, which is only correct while every value has exactly four digits.
data_2014 = data.filter(data["year"].cast("int") >= 2014)

In [23]:
# Preview the first 5 rows of the 2014-and-later subset.
data_2014.show(5)

+---------+---------+--------------------+--------------------+-----+----+-----+
|lsoa_code|  borough|      major_category|      minor_category|value|year|month|
+---------+---------+--------------------+--------------------+-----+----+-----+
|E01001116|  Croydon|            Burglary|Burglary in Other...|    0|2016|   11|
|E01001646|Greenwich|Violence Against ...|      Other violence|    0|2016|   11|
|E01000677|  Bromley|Violence Against ...|      Other violence|    0|2015|    5|
|E01003774|Redbridge|            Burglary|Burglary in Other...|    0|2016|    3|
|E01004177|   Sutton|  Theft and Handling|Theft/Taking of P...|    1|2016|    8|
+---------+---------+--------------------+--------------------+-----+----+-----+
only showing top 5 rows

