In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import udf
from pyspark.ml.linalg import VectorUDT, DenseVector
from pyspark.sql.types import StructType, StructField, FloatType, IntegerType, StringType, ArrayType

import numpy as np

In [2]:
import os
import sys
# Point both the Spark workers and the Spark driver at the interpreter that is
# running this kernel, so they all use the same Python executable.
os.environ.update({
    'PYSPARK_PYTHON': sys.executable,
    'PYSPARK_DRIVER_PYTHON': sys.executable,
})

In [4]:
# Obtain the SparkSession for this notebook: getOrCreate() returns the active
# session if one already exists, otherwise it builds a new one with this app name.
spark = (
    SparkSession.builder
    .appName("LogisticRegression")
    .getOrCreate()
)

23/10/27 00:39:04 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [5]:
# Load the preprocessed CSV (header row, inferred column types, comma-separated)
# and rename the "sentiment" column to "label", the name used by the classifier
# and evaluator further down.
data = (
    spark.read
    .csv("data/preprocessed_data.csv", sep=',', header=True, inferSchema=True)
    .withColumnRenamed("sentiment", "label")
)

                                                                                

In [6]:
def parser(x):
    """Parse a stringified numeric array such as "[ 9.66e-02 -1.5e-01 ...]" into floats.

    Tokens may be separated by any whitespace (spaces, tabs, newlines) and/or
    commas, and the string may carry surrounding whitespace around the brackets.

    Args:
        x: The string form of the array, or None.

    Returns:
        A list of floats, or None when ``x`` is None or contains no numbers.

    Raises:
        ValueError: If a token cannot be converted to float.
    """
    if x is None:
        return None
    # Trim outer whitespace first so the brackets are really at the ends, then
    # treat commas like whitespace and split on any run of whitespace; this
    # covers multi-line and comma-separated array strings as well as the
    # space-separated form.
    tokens = x.strip().strip('[]').replace(',', ' ').split()
    result = [float(token) for token in tokens]
    return result if result else None

# Expose `parser` as a Spark UDF returning array<float>, then use it to derive
# the "parsed_embeddings" column from the string column "embeddings".
parse_embedding_udf = udf(parser, returnType=ArrayType(FloatType()))
data = data.withColumn(
    "parsed_embeddings",
    parse_embedding_udf(data["embeddings"]),
)

In [7]:
# Sanity check: print each column's name, type and nullability after parsing.
data.printSchema()

root
 |-- label: integer (nullable = true)
 |-- word count: integer (nullable = true)
 |-- count_word: integer (nullable = true)
 |-- count_unique_word: integer (nullable = true)
 |-- count_letters: integer (nullable = true)
 |-- count_punctuations: integer (nullable = true)
 |-- count_words_upper: integer (nullable = true)
 |-- count_words_title: integer (nullable = true)
 |-- count_stopwords: integer (nullable = true)
 |-- mean_word_len: double (nullable = true)
 |-- word_unique_percent: double (nullable = true)
 |-- punct_percent: double (nullable = true)
 |-- reviews_pre: string (nullable = true)
 |-- embeddings: string (nullable = true)
 |-- parsed_embeddings: array (nullable = true)
 |    |-- element: float (containsNull = true)



In [8]:
# Preview the first rows of the DataFrame (long cell values are truncated in the output).
data.show()

+-----+----------+----------+-----------------+-------------+------------------+-----------------+-----------------+---------------+------------------+-------------------+------------------+--------------------+--------------------+--------------------+
|label|word count|count_word|count_unique_word|count_letters|count_punctuations|count_words_upper|count_words_title|count_stopwords|     mean_word_len|word_unique_percent|     punct_percent|         reviews_pre|          embeddings|   parsed_embeddings|
+-----+----------+----------+-----------------+-------------+------------------+-----------------+-----------------+---------------+------------------+-------------------+------------------+--------------------+--------------------+--------------------+
|    1|       307|       307|              208|         1761|                78|                8|               36|            133| 4.739413680781759|  67.75244299674267|25.407166123778502|one reviewer ment...|[ 9.66224596e-02 ...|[0.096

                                                                                

In [9]:
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.functions import udf

# UDF to convert array to Vector.
# `parser` yields None for missing or empty embeddings, so null arrays are passed
# through as null instead of being handed to Vectors.dense; what happens to such
# rows afterwards is governed by VectorAssembler's `handleInvalid` setting.
vector_udf = udf(lambda a: Vectors.dense(a) if a is not None else None, VectorUDT())
data = data.withColumn("parsed_embeddings_vector", vector_udf(data["parsed_embeddings"]))

In [10]:
# List the DataFrame's column names in order.
data.columns

['label',
 'word count',
 'count_word',
 'count_unique_word',
 'count_letters',
 'count_punctuations',
 'count_words_upper',
 'count_words_title',
 'count_stopwords',
 'mean_word_len',
 'word_unique_percent',
 'punct_percent',
 'reviews_pre',
 'embeddings',
 'parsed_embeddings',
 'parsed_embeddings_vector']

In [11]:
# Select feature columns by name rather than by position, so that adding or
# reordering columns upstream cannot silently change the model inputs.
# Excluded: the target, the raw text, the string/array forms of the embeddings,
# and the assembler's own output column (present if this cell is re-run).
non_feature_cols = {"label", "reviews_pre", "embeddings", "parsed_embeddings", "features"}
feature_cols = [c for c in data.columns if c not in non_feature_cols]

# Concatenate the numeric columns and the embedding vector into one "features"
# vector. Dropping a stale "features" column first (a no-op when it is absent)
# keeps this cell re-runnable.
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
data = assembler.transform(data.drop("features"))

                                                                                

In [12]:
# Randomly split the rows into train/test sets with weights 0.7/0.3; the fixed seed
# is passed so the split can be repeated.
train_data, test_data = data.randomSplit([0.7, 0.3], seed=42)

In [13]:
from pyspark.ml.classification import LogisticRegression

# Configure a logistic regression that reads the assembled "features" vector and
# the "label" column, then fit it on the training split.
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
)
lr_model = lr.fit(train_data)

23/10/27 00:39:29 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS


CodeCache: size=131072Kb used=35751Kb max_used=35758Kb free=95320Kb
 bounds [0x00000001069d8000, 0x0000000108cf8000, 0x000000010e9d8000]
 total_blobs=14145 nmethods=12968 adapters=1089
 compilation: disabled (not enough contiguous free space left)


In [14]:
# Apply the fitted model to the held-out test split.
predictions = lr_model.transform(test_data)

In [15]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Score the test predictions with area under the ROC curve. The arguments below
# are the evaluator's defaults, spelled out so the reported metric is unambiguous.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="label",
    metricName="areaUnderROC",
)
auc = evaluator.evaluate(predictions)
print(f"AUC: {auc}")

                                                                                

AUC: 0.8322094951718549


In [17]:
# Persist the fitted model. A plain save() raises if the target path already
# exists, which breaks a second run of the notebook; overwrite() replaces the
# previously saved model instead.
model_path = "models/pyspark_LR_model"
lr_model.write().overwrite().save(model_path)

                                                                                

In [18]:
# Stop the SparkSession now that the model has been saved.
spark.stop()