In [5]:
import findspark
findspark.init('/home/ubuntu/spark-3.2.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('classification').getOrCreate()

from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml.classification import LinearSVC, NaiveBayes, RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics

In [6]:
OOSR = spark.read.csv('OOSR/OOSR.csv', inferSchema=True, header=True)

In [7]:
# Encode
indexer_age_group = StringIndexer(inputCol='Age_group', outputCol='Age_group_index')
indexer_gender = StringIndexer(inputCol='Gender', outputCol='Gender_index')
indexer_region = StringIndexer(inputCol='Region', outputCol='Region_index')

encoder = OneHotEncoder(inputCols=['Age_group_index', 'Gender_index', 'Region_index'],
                        outputCols=['Age_group_encoded', 'Gender_encoded', 'Region_encoded'])

assembler = VectorAssembler(inputCols=['Age_group_encoded', 'Gender_encoded', 'Region_encoded'], outputCol='features')

scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")

In [8]:
svc = LinearSVC(labelCol="High_OOSR", featuresCol="scaledFeatures")
nb = NaiveBayes(labelCol="High_OOSR", featuresCol="scaledFeatures")
rf = RandomForestClassifier(labelCol="High_OOSR", featuresCol="scaledFeatures")

pipeline_svc = Pipeline(stages=[indexer_age_group, indexer_gender, indexer_region, encoder, assembler, scaler, svc])
pipeline_nb = Pipeline(stages=[indexer_age_group, indexer_gender, indexer_region, encoder, assembler, scaler, nb])
pipeline_rf = Pipeline(stages=[indexer_age_group, indexer_gender, indexer_region, encoder, assembler, scaler, rf])

evaluator = BinaryClassificationEvaluator(labelCol="High_OOSR")

# Cross-validation
paramGrid_svc = ParamGridBuilder().addGrid(svc.maxIter, [10, 20, 30]) \
    .addGrid(svc.regParam, [0.01, 0.1, 1.0]).build()

paramGrid_nb = ParamGridBuilder().addGrid(nb.smoothing, [0.0, 0.2, 0.5, 1.0]).build()

paramGrid_rf = ParamGridBuilder().addGrid(rf.numTrees, [10, 20, 30]) \
    .addGrid(rf.maxDepth, [5, 10, 15]).build()

crossval_svc = CrossValidator(estimator=pipeline_svc,
                              estimatorParamMaps=paramGrid_svc,
                              evaluator=evaluator,
                              numFolds=5)

crossval_nb = CrossValidator(estimator=pipeline_nb,
                             estimatorParamMaps=paramGrid_nb,
                             evaluator=evaluator,
                             numFolds=5)

crossval_rf = CrossValidator(estimator=pipeline_rf,
                             estimatorParamMaps=paramGrid_rf,
                             evaluator=evaluator,
                             numFolds=5)

svc_model = crossval_svc.fit(OOSR)
nb_model = crossval_nb.fit(OOSR)
rf_model = crossval_rf.fit(OOSR)

# Evaluate the model
svc_avg_precision = evaluator.evaluate(svc_model.transform(OOSR))
nb_avg_precision = evaluator.evaluate(nb_model.transform(OOSR))
rf_avg_precision = evaluator.evaluate(rf_model.transform(OOSR))

print("Average precision (SVC):", svc_avg_precision)
print("Average precision (Naive Bayes):", nb_avg_precision)
print("Average precision (Random Forest):", rf_avg_precision)

24/05/23 21:36:07 WARN CacheManager: Asked to cache already cached data.
24/05/23 21:36:07 WARN CacheManager: Asked to cache already cached data.


Average precision (SVC): 0.858876340168627
Average precision (Naive Bayes): 0.8283881544706985
Average precision (Random Forest): 0.5


In [9]:
# Select SVC
svc = LinearSVC(labelCol="High_OOSR", featuresCol="scaledFeatures")
pipeline_svc = Pipeline(stages=[indexer_age_group, indexer_gender, indexer_region, encoder, assembler, scaler, svc])

svc_model = crossval_svc.fit(OOSR)

In [10]:
# Encode
indexer_age_group = StringIndexer(inputCol='Age_group', outputCol='Age_group_index')
indexer_gender = StringIndexer(inputCol='Gender', outputCol='Gender_index')
indexer_region = StringIndexer(inputCol='Region', outputCol='Region_index')

encoder = OneHotEncoder(inputCols=['Age_group_index', 'Gender_index', 'Region_index'],
                        outputCols=['Age_group_encoded', 'Gender_encoded', 'Region_encoded'])

assembler = VectorAssembler(inputCols=['Age_group_encoded', 'Gender_encoded', 'Region_encoded'], outputCol='features')

scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")

svc = LinearSVC(labelCol="High_OOSR", featuresCol="scaledFeatures")
pipeline_svc = Pipeline(stages=[indexer_age_group, indexer_gender, indexer_region, encoder, assembler, scaler, svc])

# Divide the data set into a training set and a test set
train_data, test_data = OOSR.randomSplit([0.7, 0.3], seed=722)

evaluator = BinaryClassificationEvaluator(labelCol="High_OOSR")

paramGrid_svc = ParamGridBuilder().addGrid(svc.maxIter, [10, 20, 30]) \
    .addGrid(svc.regParam, [0.01, 0.1, 1.0]).build()
crossval_svc = CrossValidator(estimator=pipeline_svc,
                              estimatorParamMaps=paramGrid_svc,
                              evaluator=evaluator,
                              numFolds=5)

svc_model = crossval_svc.fit(train_data)

In [11]:
predictions = svc_model.transform(test_data)
predictions_and_labels = predictions.select("High_OOSR", "prediction")

# Calculate the accuracy
accuracy_evaluator = MulticlassClassificationEvaluator(labelCol="High_OOSR", predictionCol="prediction", 
                                                       metricName="accuracy")
accuracy = accuracy_evaluator.evaluate(predictions_and_labels)
print("Accuracy on test data: {:.2f}%".format(accuracy * 100))


Accuracy on test data: 93.52%


In [13]:
from pyspark.sql.functions import col

# Confusion matrix
prediction_rdd = predictions_and_labels.select(col("prediction").cast("double"), 
                                               col("High_OOSR").cast("double")).rdd
metrics = MulticlassMetrics(prediction_rdd)
confusion_matrix = metrics.confusionMatrix().toArray()
print("Confusion Matrix:\n", confusion_matrix)

Confusion Matrix:
 [[462.   0.]
 [ 32.   0.]]
