In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f

spark = (
    SparkSession.builder.appName("ModelTraining")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

schema = "polarity FLOAT, id LONG, date_time TIMESTAMP, query STRING, user STRING, text STRING"
IN_PATH = "/home/jovyan/data-sets/sentiment-140-training-data/CLEAN"
OUT_PATH = "/home/jovyan/data-sets/sentiment-140-training-data/MODEL"

spark_reader = spark.read.schema(schema)

df = spark_reader.parquet(IN_PATH)
df = (
    df
    # Remove all numbers
    .withColumn("text", f.regexp_replace(f.col("text"), "[^a-zA-Z']", " "))
    # Remove all double/multiple spaces
    .withColumn("text", f.regexp_replace(f.col("text"), " +", " "))
    # Remove leading and trailing whitespaces
    .withColumn("text", f.trim(f.col("text")))
    # Ensure we don't end up with empty rows
    .filter("text != ''")
)

data = df.select("text", "polarity").coalesce(3).cache()

In [2]:
(training_data, validation_data, test_data) = data.randomSplit([0.98, 0.01, 0.01], seed=2020)

In [3]:
%%time
from pyspark.ml.feature import (
    StopWordsRemover,
    Tokenizer,
    HashingTF,
    IDF,
)
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

tokenizer = Tokenizer(inputCol="text", outputCol="words1")
stopwords_remover = StopWordsRemover(
    inputCol="words1",
    outputCol="words2",
    stopWords=StopWordsRemover.loadDefaultStopWords("english")
)
hashing_tf = HashingTF(
    inputCol="words2",
    outputCol="term_frequency",
)
idf = IDF(
    inputCol="term_frequency",
    outputCol="features",
    minDocFreq=5,
)
lr = LogisticRegression(labelCol="polarity")

semantic_analysis_pipeline = Pipeline(
    stages=[tokenizer, stopwords_remover, hashing_tf, idf, lr]
)

#semantic_analysis_model = semantic_analysis_pipeline.fit(training_data)

CPU times: user 158 ms, sys: 36.2 ms, total: 195 ms
Wall time: 464 ms


In [13]:
%%time 
trained_df = semantic_analysis_model.transform(training_data)
val_df = semantic_analysis_model.transform(validation_data)
test_df = semantic_analysis_model.transform(test_data)

trained_df.show()
val_df.show()
test_df.show()


+--------------------+--------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|                text|polarity|              words1|              words2|      term_frequency|            features|       rawPrediction|         probability|prediction|
+--------------------+--------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|  ' ' cute hug facey|     0.0|[', ', cute, hug,...|[', ', cute, hug,...|(262144,[23837,92...|(262144,[23837,92...|[8.11128118607166...|[0.48221068785989...|       4.0|
|' ' i didn't mean...|     0.0|[', ', i, didn't,...|[', ', mean, anyt...|(262144,[991,3710...|(262144,[991,3710...|[9.44175669178636...|[0.92914747544691...|       0.0|
|' ' it's a monkey...|     4.0|[', ', it's, a, m...|[', ', monkey, re...|(262144,[36200,60...|(262144,[36200,60...|[9.04815777628548...|[0.85752678521703..

In [6]:
%%time
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="polarity", metricName="accuracy")

accuracy_val = evaluator.evaluate(val_df)
accuracy_test = evaluator.evaluate(test_df)
print("Validation Data:")
print(f"Accuracy: {accuracy_val*100:.5f}%")
print("Testing Data:")
print(f"Accuracy: {accuracy_test*100:.5f}%")

CPU times: user 2.1 ms, sys: 1.41 ms, total: 3.52 ms
Wall time: 16.5 ms


In [9]:
%%time
final_model = semantic_analysis_pipeline.fit(data)

accuracy_test = evaluator.evaluate(final_model.transform(test_data))
print(f"Accuracy: {accuracy_test*100:.5f}%")

Accuracy: 79.44201%
CPU times: user 147 ms, sys: 83.2 ms, total: 231 ms
Wall time: 3min 28s


In [10]:
final_model.save(OUT_PATH)