In [6]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import RandomForestClassifier
from pyspark.sql.functions import udf
from pyspark.ml.linalg import VectorUDT, DenseVector
from pyspark.sql.types import StructType, StructField, FloatType, IntegerType, StringType, ArrayType

import numpy as np

In [7]:
import os
import sys
# Make PySpark's worker and driver processes use the interpreter that is
# running this kernel.
os.environ.update({
    'PYSPARK_PYTHON': sys.executable,
    'PYSPARK_DRIVER_PYTHON': sys.executable,
})

In [8]:
# Reuse the active Spark session if one exists; otherwise create one under
# the application name "RandomForest".
spark = (
    SparkSession.builder
    .appName("RandomForest")
    .getOrCreate()
)

In [9]:
# Read the preprocessed CSV (header row, inferred column types) and rename
# the "sentiment" column to "label", the name the classifier is given below.
data = (
    spark.read
    .csv("../data/preprocessed_data.csv", sep=',', header=True, inferSchema=True)
    .withColumnRenamed("sentiment", "label")
)

In [10]:
def parser(x):
    """Parse a bracketed, whitespace-separated list of numbers into floats.

    Args:
        x: String such as ``"[0.1 0.2 0.3]"``, or ``None``.

    Returns:
        A list of floats, or ``None`` when ``x`` is ``None`` or contains no
        numbers.

    Raises:
        ValueError: If a token cannot be converted with ``float``.
    """
    if x is None:
        return None
    # Trim outer whitespace first so the brackets really sit at the ends, then
    # split on any run of whitespace: values separated by tabs or wrapped over
    # several lines would otherwise reach float() glued together.
    tokens = x.strip().strip('[]').split()
    result = [float(token) for token in tokens]
    return result if result else None

# Expose parser as a Spark UDF returning array<float> and apply it to the raw
# "embeddings" column.
parse_embedding_udf = udf(parser, returnType=ArrayType(FloatType()))
data = data.withColumn(
    "parsed_embeddings",
    parse_embedding_udf(data["embeddings"]),
)

In [13]:
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.functions import udf

# UDF to convert array to Vector.
# parser() returns None for missing or empty embeddings, so nulls are passed
# through here instead of being handed to Vectors.dense, which cannot build a
# valid vector from None.
# NOTE(review): rows that stay null still have to be dropped or handled before
# assembly; VectorAssembler's default handleInvalid="error" rejects them.
vector_udf = udf(
    lambda a: Vectors.dense(a) if a is not None else None,
    VectorUDT(),
)
data = data.withColumn("parsed_embeddings_vector", vector_udf(data["parsed_embeddings"]))

In [15]:
# Features are chosen by position: every column after the first, up to but not
# including the last four, plus the final column (the vector column added by
# the preceding cell when the notebook is run top to bottom).
# NOTE(review): this depends on the CSV column order; confirm the slice
# matches the intended feature set.
candidate_cols = data.columns[1:-4] + data.columns[-1:]

# Guard the positional slice: the target must never be part of the features
# (it would leak the answer to the model), and the raw string / parsed array
# embedding columns are not valid VectorAssembler inputs.
non_feature_cols = {"label", "embeddings", "parsed_embeddings"}
feature_cols = [name for name in candidate_cols if name not in non_feature_cols]

assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
data = assembler.transform(data)

                                                                                

In [16]:
# 70/30 train/test split, seeded with 42.
train_data, test_data = data.randomSplit(weights=[0.7, 0.3], seed=42)

In [17]:
from pyspark.ml.classification import RandomForestClassifier

# Random forest with 100 trees, reading "features" and predicting "label",
# fitted on the training split.
rf = RandomForestClassifier(
    numTrees=100,
    featuresCol="features",
    labelCol="label",
)
rf_model = rf.fit(train_data)



CodeCache: size=131072Kb used=33121Kb max_used=33132Kb free=97950Kb
 bounds [0x00000001081d8000, 0x000000010a258000, 0x00000001101d8000]
 total_blobs=12208 nmethods=11201 adapters=920
 compilation: disabled (not enough contiguous free space left)


                                                                                

In [18]:
# Apply the fitted model to the test split.
predictions = rf_model.transform(dataset=test_data)

In [21]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Evaluate the test predictions with the evaluator's default columns and
# metric (area under the ROC curve), written out explicitly.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="label",
    metricName="areaUnderROC",
)
auc = evaluator.evaluate(predictions)
print(f"AUC: {auc}")

                                                                                

Accuracy: 0.7889012392609659


In [22]:
model_path = "../models/pyspark_RF_model"
# A plain save() raises when the target path already exists, so the notebook
# would fail on every run after the first. Going through the writer with
# overwrite() replaces the previous export instead.
rf_model.write().overwrite().save(model_path)

                                                                                

In [23]:
# Stop the Spark session created earlier in this notebook; cells that use
# `spark` cannot be re-run after this without creating a new session.
spark.stop()