# Tugas Pengenalan Apache Spark

### Initialization
***

In [5]:
import findspark

# Initialise findspark before pyspark is imported in the following cell.
# NOTE(review): findspark.init() is documented to locate the local Spark
# installation and make `pyspark` importable — confirm Spark is discoverable
# (e.g. via SPARK_HOME) on the machine running this notebook.
findspark.init()

In [6]:
from pyspark.sql import SparkSession

# Create the notebook's SparkSession, or reuse one that already exists.
# The chain is wrapped in parentheses so that no backslash continuations
# are needed.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .getOrCreate()
)

In [7]:
# Print the session object as a quick check that it was created.
print(spark)

<pyspark.sql.session.SparkSession object at 0x00000000066B6C50>


### Loading datasets
***

In [29]:
# Load the crash dataset from CSV: the first row supplies the column names
# and the column types are inferred from the data.
# NOTE(review): the path below is an absolute path on one Windows machine;
# it has to be adjusted before the notebook can run anywhere else.
df1 = (
    spark.read
    .format("csv")
    .options(header=True, inferSchema=True)
    .load("C:/Users/Symefa/Desktop/Big-Data/datasets/crash.csv")
)

# Number of rows that were loaded.
df1.count()

895916

### Schema Fix
***

In [42]:
# Show the column names and the types that inferSchema=True chose at load time.
df1.printSchema()

root
 |-- Year: integer (nullable = true)
 |-- Crash Descriptor: string (nullable = true)
 |-- Time: string (nullable = true)
 |-- Date: timestamp (nullable = true)
 |-- Day of Week: string (nullable = true)
 |-- Police Report: string (nullable = true)
 |-- Lighting Conditions: string (nullable = true)
 |-- Municipality: string (nullable = true)
 |-- Collision Type Descriptor: string (nullable = true)
 |-- County Name: string (nullable = true)
 |-- Road Descriptor: string (nullable = true)
 |-- Weather Conditions: string (nullable = true)
 |-- Traffic Control Device: string (nullable = true)
 |-- Road Surface Conditions: string (nullable = true)
 |-- DOT Reference Marker Location: string (nullable = true)
 |-- Pedestrian Bicyclist Action: string (nullable = true)
 |-- Event Descriptor: string (nullable = true)
 |-- Number of Vehicles Involved: integer (nullable = true)



### Registering the DataFrame as a Spark SQL View
***

In [43]:
# Register df1 under the name 'crashes' so that spark.sql(...) queries can
# refer to it as a table.
df1.createOrReplaceTempView('crashes')

In [44]:
# List the distinct county names found in the 'crashes' view.
spark.sql("SELECT DISTINCT `County Name` FROM crashes").show()

+------------+
| County Name|
+------------+
|      FULTON|
|ST. LAWRENCE|
| CATTARAUGUS|
|     STEUBEN|
|       YATES|
|       KINGS|
|      OSWEGO|
|     MADISON|
|   JEFFERSON|
|  CHAUTAUQUA|
| SCHENECTADY|
|      WARREN|
|    ROCKLAND|
|       TIOGA|
|    ALLEGANY|
|      MONROE|
|      SENECA|
|    ONONDAGA|
|       LEWIS|
|      QUEENS|
+------------+
only showing top 20 rows



### Data Mining Process
***

* Banyak Kecelakaan Per County

In [40]:
# Accidents per county, busiest county first.
# The SQL text is assembled from adjacent string literals (joined by Python
# at compile time) instead of backslash continuations inside one literal.
spark.sql(
    "SELECT `County Name` AS County, COUNT(`County Name`) AS `Number of Accident`"
    "          FROM crashes "
    "          GROUP BY `County Name` "
    "          ORDER BY COUNT(`County Name`) DESC "
    "          "
).show()

+-----------+------------------+
|     County|Number of Accident|
+-----------+------------------+
|     NASSAU|             98984|
|    SUFFOLK|             95967|
|     QUEENS|             59875|
|      KINGS|             58611|
|       ERIE|             51708|
|WESTCHESTER|             44336|
|     MONROE|             42098|
|   NEW YORK|             35752|
|      BRONX|             32411|
|   ONONDAGA|             29183|
|     ORANGE|             28860|
|     ALBANY|             24543|
|   ROCKLAND|             19754|
|   DUTCHESS|             17181|
|   SARATOGA|             14138|
|     ONEIDA|             13215|
|     ULSTER|             12909|
|     BROOME|             11794|
|   RICHMOND|             11486|
|    NIAGARA|             11170|
+-----------+------------------+
only showing top 20 rows



* Banyak Kecelakaan per-Jam

In [None]:
# Accidents per hour of day, busiest hour first.
#
# The previous query was not valid Spark SQL: PARTITION BY is not a clause of
# a plain SELECT, and COUNT(...) was used in ORDER BY without any GROUP BY.
# Counting per hour requires grouping on the hour component of `Time`.
#
# NOTE(review): `Time` was inferred as a string column. This assumes values
# look like 'H:mm' / 'HH:mm' on a 24-hour clock, so the text before the first
# ':' is the hour — confirm against the data (e.g. with
# SELECT DISTINCT `Time` FROM crashes) before relying on the result.
spark.sql("""
    SELECT CAST(split(`Time`, ':')[0] AS INT) AS `Hour`,
           COUNT(*) AS `Number of Accident`
    FROM crashes
    GROUP BY CAST(split(`Time`, ':')[0] AS INT)
    ORDER BY `Number of Accident` DESC
""").show(24)  # 24 rows so that every hour of the day is visible