In [23]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import udf
from pyspark.ml.linalg import VectorUDT, DenseVector
from pyspark.sql.types import StructType, StructField, FloatType, IntegerType, StringType, ArrayType

# from pyspark.ml.classification import XGBoostClassifier
# from pyspark.ml import Pipeline

from xgboost.spark import SparkXGBClassifier

import numpy as np

In [24]:
import os
import sys
# Make Spark use the same interpreter as this kernel, both for the worker
# processes (PYSPARK_PYTHON) and for the driver (PYSPARK_DRIVER_PYTHON).
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)

In [25]:
# Reuse the active Spark session if there is one; otherwise start a new one
# registered under the application name "XGBoost".
spark = (
    SparkSession.builder
    .appName("XGBoost")
    .getOrCreate()
)

In [26]:
# Load the preprocessed CSV (first row is the header, column types are
# inferred) and rename the target column to "label", the name the classifier
# and evaluator below are configured to read.
data = (
    spark.read
    .csv("../data/preprocessed_data.csv", header=True, inferSchema=True, sep=',')
    .withColumnRenamed("sentiment", "label")
)

In [27]:
def parser(x):
    """Parse a stringified embedding such as ``"[0.1 -0.2 0.3]"`` into floats.

    Parameters
    ----------
    x : str or None
        Text form of a 1-D numeric array: numbers separated by whitespace,
        optionally wrapped in square brackets.

    Returns
    -------
    list of float or None
        The parsed numbers in order, or ``None`` when ``x`` is ``None`` or
        holds no numbers at all (e.g. ``"[]"``).

    Raises
    ------
    ValueError
        If any token cannot be converted with ``float``.
    """
    if x is None:
        return None
    # Trim outer whitespace *before* removing the brackets; otherwise a value
    # with a leading space or trailing newline keeps its "[" / "]" and the
    # float conversion below fails.
    body = x.strip().strip('[]')
    # split() without an argument breaks on any run of whitespace (spaces,
    # tabs, newlines from wrapped array text) and never yields empty tokens.
    result = [float(token) for token in body.split()]
    return result or None

# Expose the string parser as a Spark UDF that returns array<float>.
parse_embedding_udf = udf(parser, returnType=ArrayType(FloatType()))

# Derive "parsed_embeddings" from the raw "embeddings" column.
data = data.withColumn(
    colName="parsed_embeddings",
    col=parse_embedding_udf(data["embeddings"]),
)

In [30]:
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.functions import udf

# UDF to convert an array<float> column into an ML dense vector.
# `parser` yields null for missing or empty embeddings, so pass nulls through
# unchanged instead of handing None to Vectors.dense.
# NOTE(review): rows whose vector is null still need to be dropped or imputed
# before assembling features — confirm how downstream stages should treat them.
vector_udf = udf(
    lambda a: Vectors.dense(a) if a is not None else None,
    VectorUDT(),
)
data = data.withColumn("parsed_embeddings_vector", vector_udf(data["parsed_embeddings"]))

In [32]:
# Pick the feature columns by position: everything except the first column
# and the last four, plus the final column.
# NOTE(review): this presumably relies on the cells above having run exactly
# once, in order, so that the final column is "parsed_embeddings_vector" —
# confirm against data.columns before trusting the selection.
feature_cols = [*data.columns[1:-4], *data.columns[-1:]]

# Combine the selected columns into one "features" vector column.
assembler = VectorAssembler(outputCol="features", inputCols=feature_cols)
data = assembler.transform(data)

In [33]:
# Random 70% / 30% train/test split; the fixed seed is meant to keep the split
# repeatable across runs.
train_data, test_data = data.randomSplit(weights=[0.7, 0.3], seed=42)

In [46]:
# Configure the Spark XGBoost classifier: it reads the assembled "features"
# vector and the "label" column, and writes its output to "prediction".
xgboost = SparkXGBClassifier(
    label_col="label",
    features_col="features",
    prediction_col="prediction",
    max_depth=6,  # maximum depth of each tree
    eval_metric="logloss",  # metric reported during training
)

In [47]:
# Train the classifier on the training split.
model = xgboost.fit(dataset=train_data)

# Attach the model's prediction columns to the held-out split. This only
# defines the transformation; Spark computes it when the result is consumed.
predictions = model.transform(dataset=test_data)

2023-10-27 13:33:10,256 INFO XGBoost-PySpark: _fit Running xgboost-2.0.1 on 1 workers with
	booster params: {'objective': 'binary:logistic', 'device': 'cpu', 'max_depth': 6, 'eval_metric': 'logloss', 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
[13:33:13] task 0 got new rank 0                                    (0 + 1) / 1]
2023-10-27 13:33:17,573 INFO XGBoost-PySpark: _fit Finished xgboost training!   


In [48]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Area under the ROC curve on the test predictions. The column names and the
# metric are the evaluator's defaults, written out here for readability.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="label",
    metricName="areaUnderROC",
)
auc = evaluator.evaluate(predictions)
print(f"AUC: {auc}")

INFO:XGBoost-PySpark:Do the inference on the CPUs
2023-10-27 13:33:22,097 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2023-10-27 13:33:22,304 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2023-10-27 13:33:22,774 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2023-10-27 13:33:22,784 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2023-10-27 13:33:22,804 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2023-10-27 13:33:22,813 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2023-10-27 13:33:22,883 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
                                                                                

AUC: 0.8380035280295774


In [50]:
# Persist the fitted model. A plain save() fails when the target path already
# exists, which breaks re-running the notebook; overwrite() replaces whatever
# a previous run stored at this path.
model_path = "../models/pyspark_XGB_model"
model.write().overwrite().save(model_path)