In [None]:
from util.preprocessing import load_and_preprocess_data

# Load both splits through the project's preprocessing helper.
train_df = load_and_preprocess_data("Twitter_data/traindata7.csv")
test_df = load_and_preprocess_data("Twitter_data/testdata7.csv")

# Preview the first 10 rows of each split under its own heading.
for heading, frame in (("Train data:", train_df), ("\nTest data:", test_df)):
    print(heading)
    frame.show(10)

Downloading required NLTK data...
Train data:
+--------------------+---------+
|              Phrase|Sentiment|
+--------------------+---------+
|wishin i could go...|        0|
|@ verizon i'm hav...|        0|
|please don't beli...|        0|
|please sort out a...|        0|
|feature fix the e...|        0|
|disrespectful. an...|        0|
|i think the game ...|        0|
|fuck hell, you th...|        0|
|@ jukinmedia yout...|        0|
|omgggg guy very b...|        0|
+--------------------+---------+
only showing top 10 rows


Test data:
+--------------------+---------+
|              Phrase|Sentiment|
+--------------------+---------+
|cold war black op...|        0|
|so add a fucking ...|        0|
|this be the bad @...|        0|
|'s liberal regres...|        0|
|so when i try to ...|        0|
|my first run a an...|        0|
|i'm still not buy...|        0|
|fuck verizon. the...|        0|
|news: pubg mobile...|        0|
|4 hey rhandlerr r...|        0|
+--------------------+---

In [None]:
# Create the Spark entry points: `ss` is the SparkSession and `spark` is its
# SparkContext (note: `spark` is NOT the session object here).
# NOTE(review): DataFrames are already loaded in the previous cell, so a session
# presumably exists by now and getOrCreate() may simply return it -- confirm
# that the master/appName settings below actually take effect.
from pyspark.sql import SparkSession

ss = (
    SparkSession.builder
    .master("local[1]")
    .appName("SparkByExamples.com")
    .getOrCreate()
)
spark = ss.sparkContext

In [None]:
# All Spark ML names used by the notebook, grouped by module.
# LinearSVC and OneVsRest are classifiers: they live in pyspark.ml.classification,
# not pyspark.ml.feature. Importing them from `feature` raises ImportError, which
# aborts this cell before the remaining imports run on a fresh kernel.
from pyspark.ml import Pipeline
from pyspark.ml.classification import LinearSVC, LogisticRegression, OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import CountVectorizer, IDF, Tokenizer, NGram, HashingTF, VectorAssembler, Word2Vec
from pyspark.sql.functions import col

Baseline: TFIDF, Logistic regression

In [None]:
# Tokenize: "Phrase" -> "words", applied to both splits with the same transformer.
tokenizer = Tokenizer(inputCol="Phrase", outputCol="words")
train_words, test_words = (tokenizer.transform(split) for split in (train_df, test_df))

# HashingTF: hash the tokens into a fixed 10000-slot term-frequency vector.
# NOTE(review): Spark's docs advise a power of two for numFeatures so terms map
# evenly onto columns -- consider revisiting 10000.
hashingTF = HashingTF(
    inputCol="words",
    outputCol="rawFeatures",
    numFeatures=10000,
)
train_featurized, test_featurized = (
    hashingTF.transform(split) for split in (train_words, test_words)
)

# IDF: the weights are fitted on the training split only, then the fitted model
# rescales both splits so the test data never influences the statistics.
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(train_featurized)
train_rescaled, test_rescaled = (
    idfModel.transform(split) for split in (train_featurized, test_featurized)
)

In [None]:
# Baseline classifier: logistic regression on the TF-IDF "features" column,
# capped at 20 optimizer iterations.
lr = LogisticRegression(
    maxIter=20,
    labelCol="Sentiment",
    featuresCol="features",
)
lr_model = lr.fit(train_rescaled)

In [None]:
# Score the test split with the baseline model and preview 10 rows of
# text / true label / predicted label.
predictions = lr_model.transform(test_rescaled)
preview = predictions.select("Phrase", "Sentiment", "prediction")
preview.show(10)

+--------------------+---------+----------+
|              Phrase|Sentiment|prediction|
+--------------------+---------+----------+
|cold war black op...|        0|       0.0|
|so add a fucking ...|        0|       2.0|
|this be the bad @...|        0|       1.0|
|'s liberal regres...|        0|       3.0|
|so when i try to ...|        0|       0.0|
|my first run a an...|        0|       3.0|
|i'm still not buy...|        0|       1.0|
|fuck verizon. the...|        0|       2.0|
|news: pubg mobile...|        0|       0.0|
|4 hey rhandlerr r...|        0|       0.0|
+--------------------+---------+----------+
only showing top 10 rows



In [None]:
# Accuracy of the baseline predictions against the "Sentiment" labels.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="Sentiment",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)
print(f"Test Accuracy: {accuracy:.4f}")


Test Accuracy: 0.4106


In [None]:
#lr_model.save("/model_baseline")

Word2Vec, SVM (One vs Rest)

In [None]:
# Tokenize: "Phrase" -> "words" for both splits.
tokenizer = Tokenizer(inputCol="Phrase", outputCol="words")
train_words = tokenizer.transform(train_df)
test_words = tokenizer.transform(test_df)

# Word2Vec
# Embeddings are learned from the training split only and then applied to both
# splits. Training is stochastic, so the seed is pinned: without it the vectors
# (and the accuracy reported below) can change from one kernel restart to the next.
word2vec = Word2Vec(
    vectorSize=100,
    minCount=1,  # keep every token, including ones seen only once
    seed=42,
    inputCol="words",
    outputCol="features",
)
w2v_model = word2vec.fit(train_words)
train_w2v = w2v_model.transform(train_words)
test_w2v = w2v_model.transform(test_words)

In [None]:
from pyspark.ml.classification import LinearSVC, OneVsRest

# LinearSVC (SVM): a binary classifier, used here as the base learner.
svm = LinearSVC(
    maxIter=20,
    labelCol="Sentiment",
    featuresCol="features",
)

# OneVsRest wraps the binary SVM so it can handle the multiclass "Sentiment"
# label; trained on the Word2Vec features.
ovr = OneVsRest(
    classifier=svm,
    featuresCol="features",
    labelCol="Sentiment",
)
ovr_model = ovr.fit(train_w2v)

In [None]:
# Predict on the Word2Vec test features, then report accuracy against "Sentiment".
predictions = ovr_model.transform(test_w2v)

evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="Sentiment",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)
print(f"Test Accuracy: {accuracy:.4f}")

Test Accuracy: 0.2947


In [None]:
#ovr_model.save("/model_w2v_svm")

TFIDF, SVM (One vs Rest)

In [None]:
# Rebuild the TF-IDF features for the SVM experiment.

# Tokenize: "Phrase" -> "words".
tokenizer = Tokenizer(inputCol="Phrase", outputCol="words")
train_words, test_words = (tokenizer.transform(split) for split in (train_df, test_df))

# HashingTF: hashed term-frequency vectors with 10000 slots.
hashingTF = HashingTF(
    inputCol="words",
    outputCol="rawFeatures",
    numFeatures=10000,
)
train_featurized, test_featurized = (
    hashingTF.transform(split) for split in (train_words, test_words)
)

# IDF: fitted on the training split only, then applied to both splits.
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(train_featurized)
train_rescaled, test_rescaled = (
    idfModel.transform(split) for split in (train_featurized, test_featurized)
)

In [None]:
# LinearSVC (SVM): binary base learner, capped at 20 iterations.
svm = LinearSVC(
    maxIter=20,
    labelCol="Sentiment",
    featuresCol="features",
)

# OneVsRest turns the binary SVM into a multiclass model; trained on the
# TF-IDF features this time.
ovr = OneVsRest(
    classifier=svm,
    featuresCol="features",
    labelCol="Sentiment",
)
ovr_model = ovr.fit(train_rescaled)

In [None]:
# Predict on the TF-IDF test features, then report accuracy against "Sentiment".
predictions = ovr_model.transform(test_rescaled)

evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="Sentiment",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)
print(f"Test Accuracy: {accuracy:.4f}")

Test Accuracy: 0.4358


In [None]:
#ovr_model.save("/model_tfidf_svm")