In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, HashingTF, IDF
from pyspark.ml.classification import NaiveBayes
from pyspark.mllib.evaluation import MulticlassMetrics
import re
import time

In [3]:
# Create SparkSession
spark = SparkSession.builder \
    .appName("Modified Naive Bayes") \
    .getOrCreate()

In [4]:
# String processing like split_csv in Scala
def split_csv(line):
    columns = line.split(",")
    if len(columns) > 4:
        # Combine columns starting from position 4 onwards
        columns[3] = ",".join(columns[3:])
    return columns

# Cleaning text
def clean_text(text):
    # Remove URLs using a comprehensive regex
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)

    # Remove excessive punctuation (multiple consecutive special characters)
    text = re.sub(r'\?{2,}', ' ', text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Keep mentions and hashtags, remove other special characters
    text = re.sub(r'[^\w\s@#]', '', text)

    # Convert to lowercase
    text = text.lower()

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [5]:
# Reading data from CSV file
input_path = "/content/drive/MyDrive/Colab Notebooks/Text_Sentiment_Analysis_Spark/tweets10.csv"
rdd = spark.sparkContext.textFile(input_path) \
    .map(split_csv) \
    .map(lambda columns: (
        float(columns[1]),  # Sentiment
        clean_text(columns[3])  # Tweet text cleaned
    ))

In [6]:
print(rdd.take(10))

[(0.0, 'is so sad for my apl friend'), (0.0, 'i missed the new moon trailer'), (1.0, 'omg its already o'), (0.0, 'omgaga im sooo im gunna cry ive been at this dentist since i was suposed just get a crown put on mins'), (0.0, 'i think mi bf is cheating on me t_t'), (0.0, 'or i just worry too much'), (1.0, 'juuuuuuuuuuuuuuuuussssst chillin'), (0.0, 'sunny again work tomorrow tv tonight'), (1.0, 'handed in my uniform today i miss you already'), (1.0, 'hmmmm i wonder how she my number @')]


In [7]:
# Converting RDD to DataFrame
input_dataframe = rdd.toDF(["label", "tweet"])

In [8]:
# Showing data
input_dataframe.describe().show()

+-------+------------------+--------------------+
|summary|             label|               tweet|
+-------+------------------+--------------------+
|  count|           1000000|             1000000|
|   mean|          0.531354|                NULL|
| stddev|0.4990162078541877|                NULL|
|    min|               0.0|# @catherine i wo...|
|    max|               1.0|ž œ ª ª i hate po...|
+-------+------------------+--------------------+



In [9]:
# Showing top 10 rows
input_dataframe.show(10, truncate=False)

+-----+-----------------------------------------------------------------------------------------------------+
|label|tweet                                                                                                |
+-----+-----------------------------------------------------------------------------------------------------+
|0.0  |is so sad for my apl friend                                                                          |
|0.0  |i missed the new moon trailer                                                                        |
|1.0  |omg its already o                                                                                    |
|0.0  |omgaga im sooo im gunna cry ive been at this dentist since i was suposed just get a crown put on mins|
|0.0  |i think mi bf is cheating on me t_t                                                                  |
|0.0  |or i just worry too much                                                                             |
|1.0  |juu

In [10]:
# TF-IDF
tokenizer = Tokenizer(inputCol="tweet", outputCol="words")
words_data = tokenizer.transform(input_dataframe)

hashing_tf = HashingTF(inputCol="words", outputCol="rawFeatures")
featurized_data = hashing_tf.transform(words_data)

idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5)
idf_model = idf.fit(featurized_data)
rescaled_data = idf_model.transform(featurized_data)

In [11]:
rescaled_data.show(1000, truncate=False)

+-----+--------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [12]:
rescaled_data.filter(rescaled_data["tweet"] == "").show(10, truncate=False)
rescaled_data.filter(rescaled_data["words"].isNull()).count()


+-----+-----+-----+-----------+--------+
|label|tweet|words|rawFeatures|features|
+-----+-----+-----+-----------+--------+
+-----+-----+-----+-----------+--------+



0

In [13]:
# Splitting dataset into training data and test data
training_data, test_data = rescaled_data.randomSplit([0.75, 0.25], seed=1234)

In [14]:
training_data.show(10, truncate=False)

+-----+-------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [15]:
test_data.show(10, truncate=False)

+-----+--------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [16]:
# Training Naive Bayes model
start_time = time.time()
nb = NaiveBayes(featuresCol="features", labelCol="label")
model = nb.fit(training_data)

In [17]:
predictions = model.transform(test_data)
end_time = time.time()

In [18]:
# Calculating metrics
predictions_and_labels = predictions.select("prediction", "label").rdd.map(lambda row: (float(row[0]), float(row[1])))
metrics = MulticlassMetrics(predictions_and_labels)



In [19]:
print("Confusion Matrix:")
print(metrics.confusionMatrix().toArray())
print("Accuracy:", metrics.accuracy)
print("F1 Score:", metrics.weightedFMeasure)
print("Execution Time:", end_time - start_time, "seconds")

Confusion Matrix:
[[86547. 30349.]
 [36685. 96222.]]
Accuracy: 0.7316525422032562
F1 Score: <bound method MulticlassMetrics.weightedFMeasure of <pyspark.mllib.evaluation.MulticlassMetrics object at 0x7a1103d08f10>>
Execution Time: 57.071112394332886 seconds


In [20]:
predictions.printSchema()

root
 |-- label: double (nullable = true)
 |-- tweet: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- rawFeatures: vector (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [21]:
# Display the first few rows of predictions
predictions.select('label', 'tweet', 'prediction').show(20, truncate=False)


+-----+--------------------------------------------------------------------------------------------------------------------------------------+----------+
|label|tweet                                                                                                                                 |prediction|
+-----+--------------------------------------------------------------------------------------------------------------------------------------+----------+
|0.0  |# @rockchic aw poor you hope its not oinkflu                                                                                          |0.0       |
|0.0  |# cough # runny nose or stuffy nose # sore throat # body aches # headache more and i would have flu                                   |0.0       |
|0.0  |# todays weather sucks and its perfect close pleaseeee i hope they dont keep my on call                                               |0.0       |
|0.0  |###@ i hate the dentist i dont want to go                            