In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("SentimentAnalysis") \
    .master("local[*]") \
    .getOrCreate()


In [2]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

schema = StructType([
    StructField("TweetID", IntegerType(), True),
    StructField("Entity", StringType(), True),
    StructField("Sentiment", StringType(), True),
    StructField("Content", StringType(), True)
])

df = spark.read.csv("./Spark/twitter_training.csv", header=False, schema=schema)
df.show()


+-------+-----------+---------+--------------------+
|TweetID|     Entity|Sentiment|             Content|
+-------+-----------+---------+--------------------+
|   2401|Borderlands| Positive|im getting on bor...|
|   2401|Borderlands| Positive|I am coming to th...|
|   2401|Borderlands| Positive|im getting on bor...|
|   2401|Borderlands| Positive|im coming on bord...|
|   2401|Borderlands| Positive|im getting on bor...|
|   2401|Borderlands| Positive|im getting into b...|
|   2402|Borderlands| Positive|So I spent a few ...|
|   2402|Borderlands| Positive|So I spent a coup...|
|   2402|Borderlands| Positive|So I spent a few ...|
|   2402|Borderlands| Positive|So I spent a few ...|
|   2402|Borderlands| Positive|2010 So I spent a...|
|   2402|Borderlands| Positive|                 was|
|   2403|Borderlands|  Neutral|Rock-Hard La Varl...|
|   2403|Borderlands|  Neutral|Rock-Hard La Varl...|
|   2403|Borderlands|  Neutral|Rock-Hard La Varl...|
|   2403|Borderlands|  Neutral|Rock-Hard La Vi

In [3]:
df_filtered = df.filter(df["Content"].isNotNull())
df = df.fillna({"Content": "default_value"})

In [4]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF

# Tokenization
tokenizer = Tokenizer(inputCol="Content", outputCol="words")
df_words = tokenizer.transform(df)

# Remove stopwords
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
df_filtered = remover.transform(df_words)

# TF-IDF
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures")
featurizedData = hashingTF.transform(df_filtered)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

rescaledData.show()

+-------+-----------+---------+--------------------+--------------------+--------------------+--------------------+--------------------+
|TweetID|     Entity|Sentiment|             Content|               words|            filtered|         rawFeatures|            features|
+-------+-----------+---------+--------------------+--------------------+--------------------+--------------------+--------------------+
|   2401|Borderlands| Positive|im getting on bor...|[im, getting, on,...|[im, getting, bor...|(262144,[31015,92...|(262144,[31015,92...|
|   2401|Borderlands| Positive|I am coming to th...|[i, am, coming, t...|[coming, borders,...|(262144,[12409,14...|(262144,[12409,14...|
|   2401|Borderlands| Positive|im getting on bor...|[im, getting, on,...|[im, getting, bor...|(262144,[31015,68...|(262144,[31015,68...|
|   2401|Borderlands| Positive|im coming on bord...|[im, coming, on, ...|[im, coming, bord...|(262144,[12409,31...|(262144,[12409,31...|
|   2401|Borderlands| Positive|im getting

In [5]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

indexer = StringIndexer(inputCol="Sentiment", outputCol="label")
df_final = indexer.fit(rescaledData).transform(rescaledData)

In [6]:
from pyspark.ml.classification import LogisticRegression

# Assume df_final is preprocessed DataFrame with features and label
lr = LogisticRegression(featuresCol='features', labelCol='label')
lrModel = lr.fit(df_final)
predictions = lrModel.transform(df_final)

# Evaluate the model
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Logistic Regression Accuracy: %f" % accuracy)


Logistic Regression Accuracy: 0.969069


In [7]:
from pyspark.ml.classification import NaiveBayes

# Define Naive Bayes Classifier model
nb = NaiveBayes(featuresCol='features', labelCol='label')

# Fit the model
nbModel = nb.fit(df_final)
predictions = nbModel.transform(df_final)

# Evaluate the model
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Naive Bayes Classifier Accuracy: %f" % accuracy)


Naive Bayes Classifier Accuracy: 0.890161
