In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import pandas as pd

# ------------- #
# Local Imports #
# ------------- #

# importing sys
import sys

# adding 00_helper_files to the system path
sys.path.insert(0, '/Users/qmcbt/codeup-data-science/00_helper_files')

# env containing sensitive access credentials
import env
from env import user, password, host
from env import get_db_url

In [5]:
spark = SparkSession.builder.getOrCreate()

In [6]:
from env import host, user, password

def get_db_url(database, host=host, user=user, password=password):
    return f'mysql+pymysql://{user}:{password}@{host}/{database}'

# Reading Data

In [7]:
query = """SELECT * FROM source"""
url = get_db_url("311_data")
source_df = pd.read_sql(query, url)
source_df = spark.createDataFrame(source_df)
source_df.show(4)

                                                                                

+-----+---------+----------------+
|index|source_id| source_username|
+-----+---------+----------------+
|    0|   100137|Merlene Blodgett|
|    1|   103582|     Carmen Cura|
|    2|   106463| Richard Sanchez|
|    3|   119403|  Betty De Hoyos|
+-----+---------+----------------+
only showing top 4 rows



In [8]:
# The .schema attribute shows the data types that Spark has inferred from the source
source_df.schema

StructType([StructField('index', LongType(), True), StructField('source_id', StringType(), True), StructField('source_username', StringType(), True)])

# Writing Data

In [9]:
# for demo purposes
from pydataset import data

mpg = spark.createDataFrame(data("mpg"))

mpg.write.json("data/mpg_json", mode="overwrite")

# like much else in spark, there's multiple ways we could do this:
(
    mpg.write.format("csv")
    .mode("overwrite")
    .option("header", "true")
    .save("data/mpg_csv")
)

                                                                                

# Data Preparation

In [11]:
query = """SELECT * FROM cases"""
url = get_db_url("311_data")
df = pd.read_sql(query, url)
df = spark.createDataFrame(df)

In [12]:
df.show(2, vertical=True)

23/01/19 16:01:36 WARN TaskSetManager: Stage 3 contains a task of very large size (18865 KiB). The maximum recommended task size is 1000 KiB.


[Stage 3:>                                                          (0 + 0) / 1][Stage 3:>                                                          (0 + 1) / 1]

23/01/19 16:01:41 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 3 (TID 17): Attempting to kill Python Worker


                                                                                

-RECORD 0------------------------------------
 case_id              | 1014127332           
 case_opened_date     | 1/1/18 0:42          
 case_closed_date     | 1/1/18 12:29         
 SLA_due_date         | 9/26/20 0:42         
 case_late            | NO                   
 num_days_late        | -998.5087616         
 case_closed          | YES                  
 dept_division        | Field Operations     
 service_request_type | Stray Animal         
 SLA_days             | 999.0                
 case_status          | Closed               
 source_id            | svcCRMLS             
 request_address      | 2315  EL PASO ST,... 
 council_district     | 5                    
-RECORD 1------------------------------------
 case_id              | 1014127333           
 case_opened_date     | 1/1/18 0:46          
 case_closed_date     | 1/3/18 8:11          
 SLA_due_date         | 1/5/18 8:30          
 case_late            | NO                   
 num_days_late        | -2.0126041

# Rename Columns


In [13]:
df = df.withColumnRenamed("SLA_due_date", "case_due_date")


# Correct Data Types

In [14]:
# demonstrating we only have yes/no in each field
df.groupBy("case_closed", "case_late").count().show()

23/01/19 16:01:42 WARN TaskSetManager: Stage 4 contains a task of very large size (18865 KiB). The maximum recommended task size is 1000 KiB.


[Stage 4:>                                                          (0 + 8) / 8]

+-----------+---------+------+
|case_closed|case_late| count|
+-----------+---------+------+
|         NO|      YES|  6525|
|        YES|      YES| 87978|
|         NO|       NO| 11585|
|        YES|       NO|735616|
+-----------+---------+------+



                                                                                

In [15]:
df = df.withColumn("case_closed", expr('case_closed == "YES"')).withColumn(
    "case_late", expr('case_late == "YES"')
)

df.select("case_closed", "case_late").show(5)

23/01/19 16:01:48 WARN TaskSetManager: Stage 7 contains a task of very large size (18865 KiB). The maximum recommended task size is 1000 KiB.


[Stage 7:>                                                          (0 + 1) / 1]

23/01/19 16:01:52 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 7 (TID 27): Attempting to kill Python Worker
+-----------+---------+
|case_closed|case_late|
+-----------+---------+
|       true|    false|
|       true|    false|
|       true|    false|
|       true|    false|
|       true|     true|
+-----------+---------+
only showing top 5 rows



                                                                                

In [16]:
df.groupBy("council_district").count().show()

23/01/19 16:01:53 WARN TaskSetManager: Stage 8 contains a task of very large size (18865 KiB). The maximum recommended task size is 1000 KiB.




+----------------+------+
|council_district| count|
+----------------+------+
|               0|  3830|
|               7| 72445|
|               6| 74095|
|               9| 40916|
|               5|114609|
|               1|119309|
|              10| 62926|
|               3|102706|
|               8| 42345|
|               2|114745|
|               4| 93778|
+----------------+------+



                                                                                

In [17]:
df = df.withColumn("council_district", col("council_district").cast("string"))

In [18]:
print("--- Before handling dates")
df.select("case_opened_date", "case_closed_date", "case_due_date").show(5)

fmt = "M/d/yy H:mm"
df = (
    df.withColumn("case_opened_date", to_timestamp("case_opened_date", fmt))
    .withColumn("case_closed_date", to_timestamp("case_opened_date", fmt))
    .withColumn("case_due_date", to_timestamp("case_opened_date", fmt))
)

print("--- After")
df.select("case_opened_date", "case_closed_date", "case_due_date").show(5)

--- Before handling dates
23/01/19 16:01:57 WARN TaskSetManager: Stage 11 contains a task of very large size (18865 KiB). The maximum recommended task size is 1000 KiB.


[Stage 11:>                                                         (0 + 1) / 1]

23/01/19 16:02:01 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 11 (TID 37): Attempting to kill Python Worker
+----------------+----------------+-------------+
|case_opened_date|case_closed_date|case_due_date|
+----------------+----------------+-------------+
|     1/1/18 0:42|    1/1/18 12:29| 9/26/20 0:42|
|     1/1/18 0:46|     1/3/18 8:11|  1/5/18 8:30|
|     1/1/18 0:48|     1/2/18 7:57|  1/5/18 8:30|
|     1/1/18 1:29|     1/2/18 8:13| 1/17/18 8:30|
|     1/1/18 1:34|    1/1/18 13:29|  1/1/18 4:34|
+----------------+----------------+-------------+
only showing top 5 rows

--- After


                                                                                

23/01/19 16:02:01 WARN TaskSetManager: Stage 12 contains a task of very large size (18865 KiB). The maximum recommended task size is 1000 KiB.


[Stage 12:>                                                         (0 + 1) / 1]

23/01/19 16:02:05 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 12 (TID 38): Attempting to kill Python Worker
+-------------------+-------------------+-------------------+
|   case_opened_date|   case_closed_date|      case_due_date|
+-------------------+-------------------+-------------------+
|2018-01-01 00:42:00|2018-01-01 00:42:00|2018-01-01 00:42:00|
|2018-01-01 00:46:00|2018-01-01 00:46:00|2018-01-01 00:46:00|
|2018-01-01 00:48:00|2018-01-01 00:48:00|2018-01-01 00:48:00|
|2018-01-01 01:29:00|2018-01-01 01:29:00|2018-01-01 01:29:00|
|2018-01-01 01:34:00|2018-01-01 01:34:00|2018-01-01 01:34:00|
+-------------------+-------------------+-------------------+
only showing top 5 rows



                                                                                

# Data Transformations

In [19]:
print("--- Before")
df.select("request_address").show(5)

df = df.withColumn("request_address", trim(lower(df.request_address)))

print("--- After")
df.select("request_address").show(5)

--- Before
23/01/19 16:02:05 WARN TaskSetManager: Stage 13 contains a task of very large size (18865 KiB). The maximum recommended task size is 1000 KiB.


[Stage 13:>                                                         (0 + 1) / 1]

23/01/19 16:02:09 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 13 (TID 39): Attempting to kill Python Worker
+--------------------+
|     request_address|
+--------------------+
|2315  EL PASO ST,...|
|2215  GOLIAD RD, ...|
|102  PALFREY ST W...|
|114  LA GARDE ST,...|
|734  CLEARVIEW DR...|
+--------------------+
only showing top 5 rows

--- After


                                                                                

23/01/19 16:02:10 WARN TaskSetManager: Stage 14 contains a task of very large size (18865 KiB). The maximum recommended task size is 1000 KiB.


[Stage 14:>                                                         (0 + 1) / 1]

23/01/19 16:02:14 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 14 (TID 40): Attempting to kill Python Worker
+--------------------+
|     request_address|
+--------------------+
|2315  el paso st,...|
|2215  goliad rd, ...|
|102  palfrey st w...|
|114  la garde st,...|
|734  clearview dr...|
+--------------------+
only showing top 5 rows



                                                                                

In [20]:
df = df.withColumn(
    "num_weeks_late", expr("num_days_late / 7 AS num_weeks_late")
)

df.select("num_days_late", "num_weeks_late").show(5)

23/01/19 16:02:15 WARN TaskSetManager: Stage 15 contains a task of very large size (18865 KiB). The maximum recommended task size is 1000 KiB.


[Stage 15:>                                                         (0 + 1) / 1]

23/01/19 16:02:19 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 15 (TID 41): Attempting to kill Python Worker
+-------------+--------------------+
|num_days_late|      num_weeks_late|
+-------------+--------------------+
| -998.5087616|        -142.6441088|
| -2.012604167|        -0.287514881|
| -3.022337963|-0.43176256614285713|
| -15.01148148| -2.1444973542857144|
|  0.372164352|0.053166335999999995|
+-------------+--------------------+
only showing top 5 rows



                                                                                

In [21]:
df = df.withColumn("council_district", col("council_district").cast("int"))

# '%03d' means at least 3 digits, pad with 0s
#
# In order to use the format_string function the way we are, we'll need to
# convert council_district back to an integer temporarily, but the final output
# will be a string.
df = df.withColumn(
    "council_district",
    format_string("%03d", col("council_district").cast("int")),
)

df.select("council_district").show(5)


23/01/19 16:02:20 WARN TaskSetManager: Stage 16 contains a task of very large size (18865 KiB). The maximum recommended task size is 1000 KiB.


[Stage 16:>                                                         (0 + 1) / 1]

23/01/19 16:02:24 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 16 (TID 42): Attempting to kill Python Worker
+----------------+
|council_district|
+----------------+
|             005|
|             003|
|             003|
|             003|
|             007|
+----------------+
only showing top 5 rows



                                                                                

# New Features

In [22]:
df = df.withColumn("zipcode", regexp_extract("request_address", r"\d+$", 0))

df.select("zipcode").show(5)

23/01/19 16:02:24 WARN TaskSetManager: Stage 17 contains a task of very large size (18865 KiB). The maximum recommended task size is 1000 KiB.


[Stage 17:>                                                         (0 + 1) / 1]

23/01/19 16:02:28 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 17 (TID 43): Attempting to kill Python Worker
+-------+
|zipcode|
+-------+
|  78207|
|  78223|
|  78223|
|  78223|
|  78228|
+-------+
only showing top 5 rows



                                                                                

In [23]:
df = (
    df.withColumn(
        "case_age", datediff(current_timestamp(), "case_opened_date")
    )
    .withColumn(
        "days_to_closed", datediff("case_closed_date", "case_opened_date")
    )
    .withColumn(
        "case_lifetime",
        when(expr("! case_closed"), col("case_age")).otherwise(
            col("days_to_closed")
        ),
    )
)

df.select(
    "case_closed",
    "case_opened_date",
    "case_closed_date",
    "case_age",
    "days_to_closed",
    "case_lifetime",
).where(expr("case_closed")).show(5)

df.select(
    "case_closed",
    "case_opened_date",
    "case_closed_date",
    "case_age",
    "days_to_closed",
    "case_lifetime",
).where(expr("! case_closed")).show(5)


23/01/19 16:02:29 WARN TaskSetManager: Stage 18 contains a task of very large size (18865 KiB). The maximum recommended task size is 1000 KiB.


[Stage 18:>                                                         (0 + 1) / 1]

23/01/19 16:02:33 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 18 (TID 44): Attempting to kill Python Worker
+-----------+-------------------+-------------------+--------+--------------+-------------+
|case_closed|   case_opened_date|   case_closed_date|case_age|days_to_closed|case_lifetime|
+-----------+-------------------+-------------------+--------+--------------+-------------+
|       true|2018-01-01 00:42:00|2018-01-01 00:42:00|    1844|             0|            0|
|       true|2018-01-01 00:46:00|2018-01-01 00:46:00|    1844|             0|            0|
|       true|2018-01-01 00:48:00|2018-01-01 00:48:00|    1844|             0|            0|
|       true|2018-01-01 01:29:00|2018-01-01 01:29:00|    1844|             0|            0|
|       true|2018-01-01 01:34:00|2018-01-01 01:34:00|    1844|             0|            0|
+-----------+-------------------+-------------------+--------+--------------+-------------+
only showing top 5 rows



                                                                                

23/01/19 16:02:33 WARN TaskSetManager: Stage 19 contains a task of very large size (18865 KiB). The maximum recommended task size is 1000 KiB.


[Stage 19:>                                                         (0 + 1) / 1]

23/01/19 16:02:37 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 19 (TID 45): Attempting to kill Python Worker
+-----------+-------------------+-------------------+--------+--------------+-------------+
|case_closed|   case_opened_date|   case_closed_date|case_age|days_to_closed|case_lifetime|
+-----------+-------------------+-------------------+--------+--------------+-------------+
|      false|2018-01-02 09:39:00|2018-01-02 09:39:00|    1843|             0|         1843|
|      false|2018-01-02 10:49:00|2018-01-02 10:49:00|    1843|             0|         1843|
|      false|2018-01-02 13:45:00|2018-01-02 13:45:00|    1843|             0|         1843|
|      false|2018-01-02 14:09:00|2018-01-02 14:09:00|    1843|             0|         1843|
|      false|2018-01-02 14:34:00|2018-01-02 14:34:00|    1843|             0|         1843|
+-----------+-------------------+-------------------+--------+--------------+-------------+
only showing top 5 rows



                                                                                

# Joining Department Data

In [24]:
query = """SELECT * FROM dept"""
dept = pd.read_sql(query, url)
dept = spark.createDataFrame(dept)
dept.show(4)

+--------------------+--------------------+----------------------+-------------------+
|       dept_division|           dept_name|standardized_dept_name|dept_subject_to_SLA|
+--------------------+--------------------+----------------------+-------------------+
|     311 Call Center|    Customer Service|      Customer Service|                YES|
|               Brush|Solid Waste Manag...|           Solid Waste|                YES|
|     Clean and Green|Parks and Recreation|    Parks & Recreation|                YES|
|Clean and Green N...|Parks and Recreation|    Parks & Recreation|                YES|
+--------------------+--------------------+----------------------+-------------------+
only showing top 4 rows



In [25]:
df = (
    df
    # left join on dept_division
    .join(dept, "dept_division", "left")
    # drop all the columns except for standardized name, as it has much fewer unique values
    .drop(dept.dept_division)
    .drop(dept.dept_name)
    .drop(df.dept_division)
    .withColumnRenamed("standardized_dept_name", "department")
    # convert to a boolean
    .withColumn("dept_subject_to_SLA", col("dept_subject_to_SLA") == "YES")
)

df.show(2, vertical=True)


23/01/19 16:02:41 WARN TaskSetManager: Stage 22 contains a task of very large size (18865 KiB). The maximum recommended task size is 1000 KiB.


[Stage 22:>                 (0 + 8) / 8][Stage 23:>                 (0 + 0) / 8]

23/01/19 16:02:46 WARN PythonRunner: Detected deadlock while completing task 4.0 in stage 22 (TID 52): Attempting to kill Python Worker
23/01/19 16:02:46 WARN PythonRunner: Detected deadlock while completing task 7.0 in stage 22 (TID 55): Attempting to kill Python Worker
23/01/19 16:02:46 WARN PythonRunner: Detected deadlock while completing task 3.0 in stage 22 (TID 51): Attempting to kill Python Worker
23/01/19 16:02:46 WARN PythonRunner: Detected deadlock while completing task 1.0 in stage 22 (TID 49): Attempting to kill Python Worker
23/01/19 16:02:46 WARN PythonRunner: Detected deadlock while completing task 2.0 in stage 22 (TID 50): Attempting to kill Python Worker
23/01/19 16:02:46 WARN PythonRunner: Detected deadlock while completing task 6.0 in stage 22 (TID 54): Attempting to kill Python Worker
23/01/19 16:02:46 WARN PythonRunner: Detected deadlock while completing task 5.0 in stage 22 (TID 53): Attempting to kill Python Worker
23/01/19 16:02:46 WARN PythonRunner: Detected de

                                                                                

-RECORD 0------------------------------------
 case_id              | 1014127332           
 case_opened_date     | 2018-01-01 00:42:00  
 case_closed_date     | 2018-01-01 00:42:00  
 case_due_date        | 2018-01-01 00:42:00  
 case_late            | false                
 num_days_late        | -998.5087616         
 case_closed          | true                 
 service_request_type | Stray Animal         
 SLA_days             | 999.0                
 case_status          | Closed               
 source_id            | svcCRMLS             
 request_address      | 2315  el paso st,... 
 council_district     | 005                  
 num_weeks_late       | -142.6441088         
 zipcode              | 78207                
 case_age             | 1844                 
 days_to_closed       | 0                    
 case_lifetime        | 0                    
 department           | Animal Care Services 
 dept_subject_to_SLA  | true                 
-RECORD 1-------------------------

# Train Test Split

In [26]:
train, test = df.randomSplit([0.8, 0.2])
#
train, validate, test = df.randomSplit([0.6, 0.2, 0.2])