In [1]:
import pyspark

from pyspark.sql.functions import *

In [2]:
# Reuse the active SparkSession if one exists; otherwise create one with default settings.
spark = (
    pyspark.sql.SparkSession
    .builder
    .getOrCreate()
)
# Bare last expression so the notebook displays the session.
spark

# Acquisition

- This exercise uses the case.csv, dept.csv, and source.csv files from the San Antonio 311 call dataset.

1. Read the case, department, and source data into their own spark dataframes.

In [19]:
from pyspark.sql.types import StructType, StructField, StringType

# Explicit schema with two nullable string columns: source_id and source_username.
schema = StructType(
    [
        StructField(field_name, StringType())
        for field_name in ("source_id", "source_username")
    ]
)
schema

StructType(List(StructField(source_id,StringType,true),StructField(source_username,StringType,true)))

In [20]:
# Load the 311 case records: the first row supplies the column names and
# Spark infers each column's type from the data.
case = spark.read.options(header=True, inferSchema=True).csv('case.csv')
# vertical=True prints each record as one column/value pair per line.
case.show(vertical=True)

-RECORD 0------------------------------------
 case_id              | 1014127332           
 case_opened_date     | 1/1/18 0:42          
 case_closed_date     | 1/1/18 12:29         
 SLA_due_date         | 9/26/20 0:42         
 case_late            | NO                   
 num_days_late        | -998.5087616         
 case_closed          | YES                  
 dept_division        | Field Operations     
 service_request_type | Stray Animal         
 SLA_days             | 999.0                
 case_status          | Closed               
 source_id            | svcCRMLS             
 request_address      | 2315  EL PASO ST,... 
 council_district     | 5                    
-RECORD 1------------------------------------
 case_id              | 1014127333           
 case_opened_date     | 1/1/18 0:46          
 case_closed_date     | 1/3/18 8:11          
 SLA_due_date         | 1/5/18 8:30          
 case_late            | NO                   
 num_days_late        | -2.0126041

In [21]:
# Print the column names and inferred types of the case dataframe.
case.printSchema()

root
 |-- case_id: integer (nullable = true)
 |-- case_opened_date: string (nullable = true)
 |-- case_closed_date: string (nullable = true)
 |-- SLA_due_date: string (nullable = true)
 |-- case_late: string (nullable = true)
 |-- num_days_late: double (nullable = true)
 |-- case_closed: string (nullable = true)
 |-- dept_division: string (nullable = true)
 |-- service_request_type: string (nullable = true)
 |-- SLA_days: double (nullable = true)
 |-- case_status: string (nullable = true)
 |-- source_id: string (nullable = true)
 |-- request_address: string (nullable = true)
 |-- council_district: integer (nullable = true)



In [28]:
# Load the department lookup table: the first row supplies the column names
# and Spark infers each column's type from the data.
dept = spark.read.options(header=True, inferSchema=True).csv('dept.csv')
# vertical=True prints each record as one column/value pair per line.
dept.show(vertical=True)

-RECORD 0--------------------------------------
 dept_division          | 311 Call Center      
 dept_name              | Customer Service     
 standardized_dept_name | Customer Service     
 dept_subject_to_SLA    | YES                  
-RECORD 1--------------------------------------
 dept_division          | Brush                
 dept_name              | Solid Waste Manag... 
 standardized_dept_name | Solid Waste          
 dept_subject_to_SLA    | YES                  
-RECORD 2--------------------------------------
 dept_division          | Clean and Green      
 dept_name              | Parks and Recreation 
 standardized_dept_name | Parks & Recreation   
 dept_subject_to_SLA    | YES                  
-RECORD 3--------------------------------------
 dept_division          | Clean and Green N... 
 dept_name              | Parks and Recreation 
 standardized_dept_name | Parks & Recreation   
 dept_subject_to_SLA    | YES                  
-RECORD 4-------------------------------

In [14]:
# Print the column names and inferred types of the dept dataframe.
dept.printSchema()

root
 |-- dept_division: string (nullable = true)
 |-- dept_name: string (nullable = true)
 |-- standardized_dept_name: string (nullable = true)
 |-- dept_subject_to_SLA: string (nullable = true)



In [26]:
# Load the source lookup table with the explicit `schema` declared earlier in
# the notebook (source_id and source_username, both strings) instead of
# inferSchema. This skips the extra pass over the file that inference needs
# and guarantees source_id stays a string, the same type as case.source_id,
# regardless of the values in the file.
source = spark.read.csv('source.csv', header=True, schema=schema)
# vertical=True prints each record as one column/value pair per line.
source.show(vertical=True)

-RECORD 0-------------------------------
 source_id       | 100137               
 source_username | Merlene Blodgett     
-RECORD 1-------------------------------
 source_id       | 103582               
 source_username | Carmen Cura          
-RECORD 2-------------------------------
 source_id       | 106463               
 source_username | Richard Sanchez      
-RECORD 3-------------------------------
 source_id       | 119403               
 source_username | Betty De Hoyos       
-RECORD 4-------------------------------
 source_id       | 119555               
 source_username | Socorro Quiara       
-RECORD 5-------------------------------
 source_id       | 119868               
 source_username | Michelle San Miguel  
-RECORD 6-------------------------------
 source_id       | 120752               
 source_username | Eva T. Kleiber       
-RECORD 7-------------------------------
 source_id       | 124405               
 source_username | Lori Lara            
-RECORD 8-------

In [43]:
# Print the column names and types of the source dataframe.
source.printSchema()

root
 |-- source_id: string (nullable = true)
 |-- source_username: string (nullable = true)



- Find the shape (row count, column count) of all three Spark dataframes

In [36]:
# Shape of the case dataframe as (row count, column count).
(case.count(), len(case.columns))

(65534, 14)

In [37]:
# Shape of the dept dataframe as (row count, column count).
(dept.count(), len(dept.columns))

(39, 4)

In [38]:
# Shape of the source dataframe as (row count, column count).
(source.count(), len(source.columns))

(140, 2)

## 2. Let's see how writing to the local disk works in spark:

- Write the code necessary to store the source data in both `csv` and `json` format, store these as sources_csv and sources_json


- Inspect your folder structure. What do you notice?

In [41]:
# Write the source dataframe as JSON under data/sources_json, replacing any
# previous output at that path.
source.write.mode('overwrite').json('data/sources_json')

In [42]:
# Write the source dataframe as CSV under data/sources_csv, replacing any
# previous output at that path.
source.write.mode('overwrite').csv('data/sources_csv')

## 3. Inspect the data in your dataframes. 

- Are the data types appropriate? 

- Write the code necessary to cast the values to the appropriate types.

**Task:** confirm 'case_late' & 'case_closed' contain only YES and NO, then convert them to booleans

In [45]:
# Count every (case_late, case_closed) combination to see which values the
# two columns actually contain.
(
    case
    .groupBy('case_late', 'case_closed')
    .count()
    .show()
)

+---------+-----------+-----+
|case_late|case_closed|count|
+---------+-----------+-----+
|       NO|        YES|58268|
|      YES|        YES| 6414|
|       NO|         NO|   20|
|      YES|         NO|  832|
+---------+-----------+-----+



In [53]:
# Replace the YES/NO text flags with booleans: each column becomes the result
# of comparing its value with "YES".
case = (
    case
    .withColumn('case_late', expr('case_late == "YES"'))
    .withColumn("case_closed", expr('case_closed == "YES"'))
)

In [54]:
# Preview the first three rows of the two converted flag columns.
case.select(['case_late', 'case_closed']).show(n=3)

+---------+-----------+
|case_late|case_closed|
+---------+-----------+
|    false|       true|
|    false|       true|
|    false|       true|
+---------+-----------+
only showing top 3 rows



In [55]:
# Re-print the schema to check the types of case_late and case_closed after the conversion.
case.printSchema()

root
 |-- case_id: integer (nullable = true)
 |-- case_opened_date: string (nullable = true)
 |-- case_closed_date: string (nullable = true)
 |-- SLA_due_date: string (nullable = true)
 |-- case_late: boolean (nullable = true)
 |-- num_days_late: double (nullable = true)
 |-- case_closed: boolean (nullable = true)
 |-- dept_division: string (nullable = true)
 |-- service_request_type: string (nullable = true)
 |-- SLA_days: double (nullable = true)
 |-- case_status: string (nullable = true)
 |-- source_id: string (nullable = true)
 |-- request_address: string (nullable = true)
 |-- council_district: integer (nullable = true)



**Task:** convert 'case_id' and 'council_district' to strings

In [65]:
# Count rows per case_id and show the first five groups.
(
    case
    .groupBy('case_id')
    .count()
    .show(n=5)
)

+----------+-----+
|   case_id|count|
+----------+-----+
|1014127358|    1|
|1014127828|    1|
|1014128265|    1|
|1014128399|    1|
|1014128427|    1|
+----------+-----+
only showing top 5 rows



In [67]:
# Count rows per council_district value.
(
    case
    .groupBy('council_district')
    .count()
    .show()
)

+----------------+-----+
|council_district|count|
+----------------+-----+
|               7| 5228|
|               3| 7573|
|               8| 3250|
|               0|  334|
|               5| 8685|
|               6| 5354|
|               9| 3158|
|               1| 9677|
|              10| 4496|
|               4| 8672|
|               2| 9107|
+----------------+-----+



In [68]:
# Store case_id as a string rather than a number.
case = case.withColumn(
    'case_id',
    case['case_id'].cast('string'),
)

In [58]:
# Store council_district as a string rather than a number.
case = case.withColumn(
    'council_district',
    case['council_district'].cast('string'),
)

In [69]:
# Re-print the schema to check the types of case_id and council_district after the casts.
case.printSchema()

root
 |-- case_id: string (nullable = true)
 |-- case_opened_date: string (nullable = true)
 |-- case_closed_date: string (nullable = true)
 |-- SLA_due_date: string (nullable = true)
 |-- case_late: boolean (nullable = true)
 |-- num_days_late: double (nullable = true)
 |-- case_closed: boolean (nullable = true)
 |-- dept_division: string (nullable = true)
 |-- service_request_type: string (nullable = true)
 |-- SLA_days: double (nullable = true)
 |-- case_status: string (nullable = true)
 |-- source_id: string (nullable = true)
 |-- request_address: string (nullable = true)
 |-- council_district: string (nullable = true)



**Task:** rename the 'SLA_due_date' column to 'case_due_date'

In [70]:
# Rename SLA_due_date to case_due_date; the column's values are unchanged.
case = case.withColumnRenamed(
    existing='SLA_due_date',
    new='case_due_date',
)

In [71]:
# Re-print the schema to check that the column is now named case_due_date.
case.printSchema()

root
 |-- case_id: string (nullable = true)
 |-- case_opened_date: string (nullable = true)
 |-- case_closed_date: string (nullable = true)
 |-- case_due_date: string (nullable = true)
 |-- case_late: boolean (nullable = true)
 |-- num_days_late: double (nullable = true)
 |-- case_closed: boolean (nullable = true)
 |-- dept_division: string (nullable = true)
 |-- service_request_type: string (nullable = true)
 |-- SLA_days: double (nullable = true)
 |-- case_status: string (nullable = true)
 |-- source_id: string (nullable = true)
 |-- request_address: string (nullable = true)
 |-- council_district: string (nullable = true)



**Task:** handle the 3 columns that have dates in them:

'case_opened_date', 'case_closed_date', 'case_due_date'

In [72]:
# The three columns that hold date/time text, e.g. "1/1/18 0:42".
date_columns = ['case_opened_date', 'case_closed_date', 'case_due_date']
# Pattern for that text: month/day/two-digit year, then hour:minute.
fmt = 'M/d/yy H:mm'

print('--- Before handling dates')
case.select(*date_columns).show(5)

# Parse each text column into a timestamp, replacing the column in place.
for date_column in date_columns:
    case = case.withColumn(date_column, to_timestamp(date_column, fmt))

print('--- After handling dates')
case.select(*date_columns).show(5)

--- Before handling dates
+----------------+----------------+-------------+
|case_opened_date|case_closed_date|case_due_date|
+----------------+----------------+-------------+
|     1/1/18 0:42|    1/1/18 12:29| 9/26/20 0:42|
|     1/1/18 0:46|     1/3/18 8:11|  1/5/18 8:30|
|     1/1/18 0:48|     1/2/18 7:57|  1/5/18 8:30|
|     1/1/18 1:29|     1/2/18 8:13| 1/17/18 8:30|
|     1/1/18 1:34|    1/1/18 13:29|  1/1/18 4:34|
+----------------+----------------+-------------+
only showing top 5 rows

--- After handling dates
+-------------------+-------------------+-------------------+
|   case_opened_date|   case_closed_date|      case_due_date|
+-------------------+-------------------+-------------------+
|2018-01-01 00:42:00|2018-01-01 12:29:00|2020-09-26 00:42:00|
|2018-01-01 00:46:00|2018-01-03 08:11:00|2018-01-05 08:30:00|
|2018-01-01 00:48:00|2018-01-02 07:57:00|2018-01-05 08:30:00|
|2018-01-01 01:29:00|2018-01-02 08:13:00|2018-01-17 08:30:00|
|2018-01-01 01:34:00|2018-01-01 13:29:00