In [1]:
import pyspark
import pyspark.sql.functions as F
from pyspark.sql.types import IntegerType
from pyspark.ml.feature import RFormula
from pyspark.ml.classification import LogisticRegression


In [2]:
spark = pyspark.sql.SparkSession.builder.getOrCreate()


In [3]:
df = spark.read.csv('full_311', header=True)

In [4]:
df = (df
 .withColumn('SLA_days', df.SLA_days.cast(('float'))))

Use the .randomSplit method to split the 311 data into training and test sets.


In [5]:
train, test = df.randomSplit([.7, .3], seed=123)


In [6]:
train.count(), test.count()

(598416, 256853)

Create a classification model to predict whether a case will be late or not (i.e. predict case_late). Experiment with different combinations of features and different classification algorithms.


In [7]:
df = (df
 .withColumn('is_case_late', F.when(df.case_late == 'false', 0).otherwise(1))
 .drop('case_late'))


In [8]:
df.printSchema()

root
 |-- source_id: string (nullable = true)
 |-- dept_division: string (nullable = true)
 |-- case_id: string (nullable = true)
 |-- case_opened_date: string (nullable = true)
 |-- case_closed_date: string (nullable = true)
 |-- SLA_due_date: string (nullable = true)
 |-- num_days_late: string (nullable = true)
 |-- case_closed: string (nullable = true)
 |-- service_request_type: string (nullable = true)
 |-- SLA_days: float (nullable = true)
 |-- case_status: string (nullable = true)
 |-- request_address: string (nullable = true)
 |-- council_district: string (nullable = true)
 |-- year: string (nullable = true)
 |-- num_hours_late: string (nullable = true)
 |-- dept_name: string (nullable = true)
 |-- standardized_dept_name: string (nullable = true)
 |-- dept_subject_to_SLA: string (nullable = true)
 |-- source_username: string (nullable = true)
 |-- is_case_late: integer (nullable = false)



In [9]:
rf = RFormula(formula = 'is_case_late ~ dept_division')
df2 = rf.fit(df).transform(df).select('features', 'label')

lr = LogisticRegression()
lr_fit = lr.fit(df2)
lr_fit
training_summary = lr_fit.summary

training_summary.areaUnderROC



0.6704161114262225

In [10]:
rf = RFormula(formula = 'is_case_late ~ service_request_type')
df2 = rf.fit(df).transform(df).select('features', 'label')

lr = LogisticRegression()
lr_fit = lr.fit(df2)
lr_fit
training_summary = lr_fit.summary

training_summary.areaUnderROC


0.8160478084367516

In [11]:
rf = RFormula(formula = 'is_case_late ~ council_district')
df2 = rf.fit(df).transform(df).select('features', 'label')

lr = LogisticRegression()
lr_fit = lr.fit(df2)
lr_fit
training_summary = lr_fit.summary

training_summary.areaUnderROC


0.5381361179973588

In [12]:
df.where(df.SLA_days.isNull()).count()

34

In [13]:
df = df.dropna(subset=('SLA_days'))

In [14]:
rf = RFormula(formula = 'is_case_late ~ SLA_days')
df2 = rf.fit(df).transform(df).select('features', 'label')

lr = LogisticRegression()
lr_fit = lr.fit(df2)
lr_fit
training_summary = lr_fit.summary

training_summary.areaUnderROC

0.6024671326821686

In [15]:
rf = RFormula(formula = 'is_case_late ~ service_request_type + council_district + dept_division + SLA_days')
df2 = rf.fit(df).transform(df).select('features', 'label')

lr = LogisticRegression()
lr_fit = lr.fit(df2)
lr_fit
training_summary = lr_fit.summary

training_summary.areaUnderROC


0.8225516320753564