In [None]:
!pip install pyspark



In [None]:
from util.preprocessing import load_and_preprocess_data

# Load both splits through the project's preprocessing helper.
train_df = load_and_preprocess_data("/Twitter_data/traindata7.csv")
test_df = load_and_preprocess_data("/Twitter_data/testdata7.csv")

# Preview the first 10 rows of each split under its own heading.
for heading, frame in (("Train data:", train_df), ("\nTest data:", test_df)):
    print(heading)
    frame.show(10)

Train data:
+--------------------+---------+
|              Phrase|Sentiment|
+--------------------+---------+
|wishin i could go...|        0|
|@ verizon i'm hav...|        0|
|please don't beli...|        0|
|please sort out a...|        0|
|feature fix the e...|        0|
|disrespectful. an...|        0|
|i think the game ...|        0|
|fuck hell, you th...|        0|
|@ jukinmedia yout...|        0|
|omgggg guy very b...|        0|
+--------------------+---------+
only showing top 10 rows


Test data:
+--------------------+---------+
|              Phrase|Sentiment|
+--------------------+---------+
|cold war black op...|        0|
|so add a fucking ...|        0|
|this be the bad @...|        0|
|'s liberal regres...|        0|
|so when i try to ...|        0|
|my first run a an...|        0|
|i'm still not buy...|        0|
|fuck verizon. the...|        0|
|news: pubg mobile...|        0|
|4 hey rhandlerr r...|        0|
+--------------------+---------+
only showing top 10 rows



In [None]:
# Create the Spark entry points: a SparkSession and its SparkContext.
from pyspark.sql import SparkSession

# getOrCreate() hands back an already-running session if one exists.
ss = (
    SparkSession.builder
    .master("local[1]")
    .appName("SparkByExamples.com")
    .getOrCreate()
)

# NOTE: despite its name, `spark` holds the SparkContext, not the session.
spark = ss.sparkContext

TF-IDF Logistic Regression

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

# Feature stages: Phrase -> words -> hashed term counts -> IDF-weighted features.
tokenizer = Tokenizer(inputCol="Phrase", outputCol="words")
hashingTF = HashingTF(
    inputCol="words",
    outputCol="rawFeatures",
    numFeatures=10000,
)
idf = IDF(inputCol="rawFeatures", outputCol="features")

# Classifier trained on the "features" column against the "Sentiment" label.
lr = LogisticRegression(
    featuresCol="features",
    labelCol="Sentiment",
    maxIter=20,
)

# Chain every stage and fit the whole pipeline on the training split.
pipeline = Pipeline(stages=[tokenizer, hashingTF, idf, lr])
pipeline_model = pipeline.fit(train_df)

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Run the fitted pipeline over the test split.
predictions = pipeline_model.transform(test_df)

# Compare the "prediction" column with the "Sentiment" label using accuracy.
evaluator = MulticlassClassificationEvaluator(
    labelCol="Sentiment",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)
print(f"Test Accuracy: {accuracy:.4f}")

Test Accuracy: 0.4106


In [None]:
# Persist the fitted pipeline. A plain save() raises if the target directory
# already exists, which breaks re-running the notebook; overwrite() makes
# this cell safe to execute repeatedly.
pipeline_model.write().overwrite().save("pipeline_models/baseline")

In [None]:
from pyspark.ml import PipelineModel
from util.preprocessing import clean_tweets

# Build a one-row DataFrame holding the raw text to classify.
columns = ["Phrase"]
new_data = [("I really hate this thing!",)]
new_df = ss.createDataFrame(new_data, columns)

# New text must go through clean_tweets before it is scored.
cleaned_new_df = clean_tweets(new_df, text_column="Phrase")

# Score with the fitted pipeline and show the text next to its prediction.
predictions = pipeline_model.transform(cleaned_new_df)
predictions.select("Phrase", "prediction").show()



+--------------------+----------+
|              Phrase|prediction|
+--------------------+----------+
|i really hate thi...|       1.0|
+--------------------+----------+



TF-IDF, SVM (One-vs-Rest)

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LinearSVC, OneVsRest
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

# Feature stages: Phrase -> words -> hashed term counts -> IDF-weighted features.
tokenizer = Tokenizer(inputCol="Phrase", outputCol="words")
hashingTF = HashingTF(
    inputCol="words",
    outputCol="rawFeatures",
    numFeatures=10000,
)
idf = IDF(inputCol="rawFeatures", outputCol="features")

# LinearSVC is wrapped in OneVsRest to classify the "Sentiment" label.
svm = LinearSVC(
    featuresCol="features",
    labelCol="Sentiment",
    maxIter=20,
)
ovr = OneVsRest(
    classifier=svm,
    labelCol="Sentiment",
    featuresCol="features",
)

# Chain every stage and fit the whole pipeline on the training split.
pipeline = Pipeline(stages=[tokenizer, hashingTF, idf, ovr])
pipeline_model = pipeline.fit(train_df)

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Apply the fitted TF-IDF + SVM pipeline to the test split.
predictions = pipeline_model.transform(test_df)

# Accuracy of "prediction" measured against the "Sentiment" label.
evaluator = MulticlassClassificationEvaluator(
    labelCol="Sentiment",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)
print(f"Test Accuracy: {accuracy:.4f}")

Test Accuracy: 0.4358


In [None]:
# Persist the fitted TF-IDF + SVM pipeline. overwrite() avoids the
# "path already exists" failure when this cell is re-run.
pipeline_model.write().overwrite().save("pipeline_models/dfidf_svm")

In [None]:
# Word2Vec, SVM (One vs Rest)

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, Word2Vec
from pyspark.ml.classification import LinearSVC, OneVsRest

# Feature stages: Phrase -> words -> 100-dimensional Word2Vec features.
tokenizer = Tokenizer(inputCol="Phrase", outputCol="words")
# Word2Vec training is stochastic; fix the seed so the learned embedding
# (and therefore the reported accuracy) is reproducible across kernel restarts.
word2vec = Word2Vec(
    inputCol="words",
    outputCol="features",
    vectorSize=100,
    minCount=1,
    seed=42,
)

# LinearSVC is wrapped in OneVsRest to classify the "Sentiment" label.
svm = LinearSVC(featuresCol="features", labelCol="Sentiment", maxIter=20)
ovr = OneVsRest(classifier=svm, labelCol="Sentiment", featuresCol="features")

# Chain every stage and fit the whole pipeline on the training split.
pipeline = Pipeline(stages=[tokenizer, word2vec, ovr])

pipeline_model = pipeline.fit(train_df)

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Apply the fitted Word2Vec + SVM pipeline to the test split.
predictions = pipeline_model.transform(test_df)

# Accuracy of "prediction" measured against the "Sentiment" label.
evaluator = MulticlassClassificationEvaluator(
    labelCol="Sentiment",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)
print(f"Test Accuracy: {accuracy:.4f}")

Test Accuracy: 0.3048


In [None]:
# Persist the fitted Word2Vec + SVM pipeline. overwrite() avoids the
# "path already exists" failure when this cell is re-run.
pipeline_model.write().overwrite().save("pipeline_models/word2vec_svm")