In [0]:
# File location and type
file_location = "/FileStore/tables/baseball-1.csv"
file_type = "csv"

# CSV options
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

# The applied options are for CSV files. For other file types, these will be ignored.
df = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .load(file_location)

display(df)

attendance_binary,previous_attendance,previous_away_team_errors,previous_away_team_hits,previous_away_team_runs,game_type,previous_game_type,previous_home_team_errors,previous_home_team_hits,previous_home_team_runs,game_day,previous_game_day,temperature,wind_speed,sky,previous_game_duration,previous_homewin
0,43683,2,6,2,Night Game,Day Game,0,6,6,Wednesday,Monday,55,24,Overcast,2.933333333,1
0,45785,0,7,2,Night Game,Day Game,0,10,3,Wednesday,Monday,48,7,Unknown,2.8,1
0,48282,0,8,4,Night Game,Day Game,2,4,3,Wednesday,Monday,65,10,Cloudy,3.383333333,0
0,21830,0,9,6,Day Game,Night Game,0,15,11,Wednesday,Tuesday,77,0,In Dome,3.233333333,1
0,49289,2,4,2,Night Game,Day Game,1,1,3,Tuesday,Monday,81,12,Cloudy,2.633333333,1
0,15116,1,7,5,Night Game,Night Game,0,8,3,Tuesday,Monday,72,0,In Dome,2.966666667,0
0,44317,0,17,15,Night Game,Day Game,2,4,0,Tuesday,Monday,70,6,Unknown,3.166666667,0
0,39500,0,5,1,Night Game,Day Game,1,9,4,Tuesday,Sunday,40,7,Sunny,3.033333333,1
0,35067,1,7,4,Night Game,Night Game,2,7,3,Tuesday,Monday,70,8,Cloudy,2.933333333,0
0,44318,0,15,12,Night Game,Day Game,1,8,3,Tuesday,Monday,64,0,In Dome,3.583333333,0


In [0]:
from pyspark.ml.stat import Correlation
from pyspark.ml.linalg import DenseMatrix, Vectors
from pyspark.sql.functions import *
from pyspark.ml.feature import VectorAssembler,StringIndexer,StandardScaler
from pyspark.ml import Pipeline

In [0]:
#use string indexer to convert categorical columns to hold numerical data
game_type_indexer = StringIndexer(inputCol='game_type',outputCol='game_type_index',handleInvalid='keep')
previous_game_type_indexer = StringIndexer(inputCol='previous_game_type',outputCol='previous_game_type_index',handleInvalid='keep')

game_day_indexer = StringIndexer(inputCol='game_day',outputCol='game_day_index',handleInvalid='keep')
previous_game_day_indexer = StringIndexer(inputCol='previous_game_day',outputCol='previous_game_day_index',handleInvalid='keep')

sky_indexer = StringIndexer(inputCol='sky',outputCol='sky_index',handleInvalid='keep')


In [0]:
# Vector assembler is used to create a vector of input features
assembler = VectorAssembler(inputCols=['game_type_index','previous_game_type_index','game_day_index','previous_game_day_index','sky_index','previous_attendance','previous_away_team_errors','previous_away_team_hits','previous_away_team_runs','previous_home_team_errors','previous_home_team_hits','previous_home_team_runs'],
                            outputCol="features")

In [0]:
# used seed to obtain precise results every time we run the code
train_data,test_data=df.randomSplit([0.8,0.2], seed=12345)

In [0]:
from pyspark.ml.classification import DecisionTreeClassifier

In [0]:
dt_model = DecisionTreeClassifier(labelCol='attendance_binary',maxBins=5000)

In [0]:
pipe = Pipeline(stages=[game_type_indexer,previous_game_type_indexer,game_day_indexer,previous_game_day_indexer,sky_indexer,assembler,dt_model])

In [0]:
fit_model=pipe.fit(train_data)

In [0]:
results = fit_model.transform(test_data)

In [0]:
results.select(['attendance_binary','prediction']).show()

+-----------------+----------+
|attendance_binary|prediction|
+-----------------+----------+
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
+-----------------+----------+
only showing top 20 rows



In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [0]:
ACC_evaluator = MulticlassClassificationEvaluator(
    labelCol="attendance_binary", predictionCol="prediction", metricName="accuracy")

In [0]:
accuracy = ACC_evaluator.evaluate(results)
print("The accuracy of the decision tree classifier is {}".format(accuracy))

The accuracy of the decision tree classifier is 0.814968814968815


In [0]:
from sklearn.metrics import confusion_matrix

In [0]:
y_true = results.select("attendance_binary")
y_true = y_true.toPandas()

y_pred = results.select("prediction")
y_pred = y_pred.toPandas()

cnf_matrix = confusion_matrix(y_true, y_pred)
print("Below is the confusion matrix \n {}".format(cnf_matrix))

Below is the confusion matrix 
 [[171  41]
 [ 48 221]]


In [0]:
from pyspark.ml.classification import LogisticRegression

In [0]:
lr_model =LogisticRegression(labelCol='attendance_binary')

In [0]:
pipe = Pipeline(stages=[game_type_indexer,previous_game_type_indexer,game_day_indexer,previous_game_day_indexer,sky_indexer,assembler,lr_model])

In [0]:
fit_model=pipe.fit(train_data)

In [0]:
results = fit_model.transform(test_data)

In [0]:
results.select(['attendance_binary','prediction']).show()

+-----------------+----------+
|attendance_binary|prediction|
+-----------------+----------+
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
+-----------------+----------+
only showing top 20 rows



In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [0]:
ACC_evaluator = MulticlassClassificationEvaluator(
    labelCol="attendance_binary", predictionCol="prediction", metricName="accuracy")

In [0]:
accuracy = ACC_evaluator.evaluate(results)
print("The accuracy of the  Logistic Regression is is {}".format(accuracy))

The accuracy of the  Logistic Regression is is 0.841995841995842


In [0]:
y_true = results.select("attendance_binary")
y_true = y_true.toPandas()

y_pred = results.select("prediction")
y_pred = y_pred.toPandas()

cnf_matrix = confusion_matrix(y_true, y_pred)
print("Below is the confusion matrix \n {}".format(cnf_matrix))

Below is the confusion matrix 
 [[177  35]
 [ 41 228]]


In [0]:
from pyspark.ml.classification import RandomForestClassifier

In [0]:
rf_model = RandomForestClassifier(labelCol='attendance_binary',numTrees=50)

In [0]:
pipe = Pipeline(stages=[game_type_indexer,previous_game_type_indexer,game_day_indexer,previous_game_day_indexer,sky_indexer,assembler,rf_model])

In [0]:
fit_model=pipe.fit(train_data)

In [0]:
results = fit_model.transform(test_data)

In [0]:
results.select(['attendance_binary','prediction']).show()

+-----------------+----------+
|attendance_binary|prediction|
+-----------------+----------+
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
+-----------------+----------+
only showing top 20 rows



In [0]:
ACC_evaluator = MulticlassClassificationEvaluator(
    labelCol="attendance_binary", predictionCol="prediction", metricName="accuracy")

In [0]:
accuracy = ACC_evaluator.evaluate(results)
print("The accuracy of the decision tree classifier is {}".format(accuracy))

The accuracy of the decision tree classifier is 0.8523908523908524


In [0]:
y_true = results.select("attendance_binary")
y_true = y_true.toPandas()

y_pred = results.select("prediction")
y_pred = y_pred.toPandas()

cnf_matrix = confusion_matrix(y_true, y_pred)
print("Below is the confusion matrix \n {}".format(cnf_matrix))

Below is the confusion matrix 
 [[177  35]
 [ 36 233]]


In [0]:
from pyspark.ml.classification import LinearSVC

In [0]:

assembler = VectorAssembler(inputCols=['game_type_index','previous_game_type_index','game_day_index','previous_game_day_index','sky_index','previous_attendance','previous_away_team_errors','previous_away_team_hits','previous_away_team_runs','previous_home_team_errors','previous_home_team_hits','previous_home_team_runs'],
                            outputCol="unscaled_features")

In [0]:
scaler = StandardScaler(inputCol="unscaled_features",outputCol="features")

In [0]:
svc_model = LinearSVC(labelCol='attendance_binary')

In [0]:
pipe = Pipeline(stages=[game_type_indexer,previous_game_type_indexer,game_day_indexer,previous_game_day_indexer,sky_indexer,assembler,scaler,svc_model])

In [0]:
fit_model=pipe.fit(train_data)

In [0]:
results = fit_model.transform(test_data)

In [0]:
results.select(['attendance_binary','prediction']).show()

+-----------------+----------+
|attendance_binary|prediction|
+-----------------+----------+
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
|                0|       0.0|
+-----------------+----------+
only showing top 20 rows



In [0]:
ACC_evaluator = MulticlassClassificationEvaluator(
    labelCol="attendance_binary", predictionCol="prediction", metricName="accuracy")

In [0]:
accuracy = ACC_evaluator.evaluate(results)

In [0]:
print("The accuracy of the model is {}".format(accuracy))

The accuracy of the model is 0.83991683991684


In [0]:
y_true = results.select("attendance_binary")
y_true = y_true.toPandas()

y_pred = results.select("prediction")
y_pred = y_pred.toPandas()

cnf_matrix = confusion_matrix(y_true, y_pred)
print("Below is the confusion matrix: \n {}".format(cnf_matrix))

Below is the confusion matrix: 
 [[177  35]
 [ 42 227]]
