In [2]:
# Import required libraries
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, PCA
from pyspark.ml.clustering import KMeans, LDA
from pyspark.ml import Pipeline
from pyspark.sql.functions import lower, regexp_replace, col

In [3]:
spark = SparkSession \
        .builder \
        .appName("MisinfromationClustering") \
        .getOrCreate()

In [4]:
# Load data
data = spark.read.csv("boomlive_data_cleaned.csv", header=True, inferSchema=True)

data.show(5)

+----------+--------------------+--------------------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+----+-----+
|  Category|                Link|             Heading|         Sub_heading|              Author|               Date|               Claim|          Fact_check|       Claim_summary|          Claimed_by|Fact_check_summary|               Links|Year|Month|
+----------+--------------------+--------------------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+----+-----+
|fact check|https://www.booml...|bangladeshi actor...|according to loca...|    Archis Chowdhury|2024-10-10 14:16:00|video shows an am...|boom identified t...|video shows an am...|  social media users|             false|['https://archive...|2024

In [5]:
# Get the 2024 data
data_2024 = data.filter(data['year'] == 2024)

data_2024.show(5)

+----------+--------------------+--------------------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+----+-----+
|  Category|                Link|             Heading|         Sub_heading|              Author|               Date|               Claim|          Fact_check|       Claim_summary|          Claimed_by|Fact_check_summary|               Links|Year|Month|
+----------+--------------------+--------------------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+----+-----+
|fact check|https://www.booml...|bangladeshi actor...|according to loca...|    Archis Chowdhury|2024-10-10 14:16:00|video shows an am...|boom identified t...|video shows an am...|  social media users|             false|['https://archive...|2024

**1. Data Preprocessing**

In [6]:
# Select the 'Claim' column and drop rows with NaN values
claims = data_2024.select("Claim").na.drop()

claims.show(5)

# Convert text to lowercase and remove punctuation
claims = claims.withColumn("Claim", lower(regexp_replace(col("Claim"), "[^a-zA-Z\\s]", "")))

# Custom stopwords specific to misinformation data
custom_stopwords = {'claim', 'false', 'video', 'social', 'image', 'media', 'online', 'showing', 'shows', 'show', 'viral',
                    'share', 'shared'}

# Union the default Spark stop words with custom stopwords
default_stopwords = StopWordsRemover.loadDefaultStopWords("english")
all_stopwords = list(set(default_stopwords).union(custom_stopwords))

# Tokenize the claims column
tokenizer = Tokenizer(inputCol="Claim", outputCol="words")
claims_tokenized = tokenizer.transform(claims)

# Remove stopwords
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words", stopWords=all_stopwords)
claims_filtered = remover.transform(claims_tokenized)

# Show sample output
claims_filtered.select("Claim", "filtered_words").show(10, truncate=False)


+--------------------+
|               Claim|
+--------------------+
|video shows an am...|
|an image showing ...|
|video shows a hin...|
|tv actress in leb...|
|a video showing p...|
+--------------------+
only showing top 5 rows

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Claim                                                                                                                                                              

**2. TF-IDF Vectorizer**

In [7]:
# Vectorize words
vectorizer = CountVectorizer(inputCol="filtered_words", outputCol="raw_features")

# Compute TF-IDF
idf = IDF(inputCol="raw_features", outputCol="features")

**3. PCA for dimensionality Reduction**

In [8]:
# Apply PCA to reduce dimensionality
pca = PCA(k=346, inputCol="features", outputCol="pca_features")

# Apply K-means clustering
kmeans = KMeans(featuresCol="pca_features", k=4, seed=42)

In [9]:
# Build the pipeline
pipeline = Pipeline(stages=[tokenizer, remover, vectorizer, idf, pca, kmeans])

# Fit the model
model = pipeline.fit(claims)

# Transform data with clustering
clustered_data = model.transform(claims)


In [10]:
# Show clustering results
clustered_data.select("claim", "prediction").show(10)

+--------------------+----------+
|               claim|prediction|
+--------------------+----------+
|video shows an am...|         0|
|an image showing ...|         1|
|video shows a hin...|         0|
|tv actress in leb...|         1|
|a video showing p...|         1|
|a viral video cla...|         0|
|a video of a bike...|         1|
|video shows pakis...|         1|
|a viral video sho...|         0|
|a cctv video show...|         1|
+--------------------+----------+
only showing top 10 rows



In [11]:
clustered_data.groupBy("prediction").count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  319|
|         3|    1|
|         2|    4|
|         0|   92|
+----------+-----+



In [12]:
clustered_data.show(5)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|               Claim|               words|      filtered_words|        raw_features|            features|        pca_features|prediction|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|video shows an am...|[video, shows, an...|[american, woman,...|(2869,[31,42,166,...|(2869,[31,42,166,...|[-0.0883604921913...|         0|
|an image showing ...|[an, image, showi...|[beef, biryani, s...|(2869,[7,25,246,3...|(2869,[7,25,246,3...|[-0.0490773087101...|         1|
|video shows a hin...|[video, shows, a,...|[hindu, man, kill...|(2869,[14,19,42,7...|(2869,[14,19,42,7...|[-0.1213050853112...|         0|
|tv actress in leb...|[tv, actress, in,...|[tv, actress, leb...|(2869,[165,276,55...|(2869,[165,276,55...|[-0.0388893821969...|         1|
|a video showing p...|[a, v

In [13]:
# show claims in each cluster
clustered_data.filter(clustered_data['prediction'] == 0).select("claim", "filtered_words", "prediction").show(5, truncate=False)
clustered_data.filter(clustered_data['prediction'] == 1).select("claim", "filtered_words", "prediction").show(5, truncate=False)
clustered_data.filter(clustered_data['prediction'] == 2).select("claim", "filtered_words", "prediction").show(5, truncate=False)
clustered_data.filter(clustered_data['prediction'] == 3).select("claim", "filtered_words", "prediction").show(5, truncate=False)



+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+
|claim                                                                                                                                                                                                                                                                                                                    |filtered_words                                                                                                   

In [None]:
# Apply LDA for topic modeling within each cluster
lda = LDA(k=4, maxIter=5, featuresCol="pca_features")
lda_model = lda.fit(clustered_data)

# Describe topics
topics = lda_model.describeTopics(5)
topics.show(truncate=False)

LDA is taking too long to execute.

### Lets work on August Data

In [15]:
# get the August data
data_august = data_2024.filter(data['month'] == 8)
data_august.show(5)

+----------+--------------------+--------------------+--------------------+------------------+-------------------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+----+-----+
|  Category|                Link|             Heading|         Sub_heading|            Author|               Date|               Claim|          Fact_check|       Claim_summary|          Claimed_by|Fact_check_summary|               Links|Year|Month|
+----------+--------------------+--------------------+--------------------+------------------+-------------------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+----+-----+
|fact check|https://www.booml...|video of children...|boom found that t...|     Rohit Kumar ||2024-08-31 13:30:00|a viral video pur...|boom found that t...|a viral video pur...|  social media posts|             false|['https://x.com/B...|2024|    8|


In [17]:
# Select the 'Claim' column and drop rows with NaN values
claims_august = data_august.select("Claim").na.drop()

claims_august.show(5, truncate=False)

+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Claim                                                                                                                                                                                                                                                                                                                                           |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [19]:
# Convert text to lowercase and remove punctuation
claims_august = claims_august.withColumn("Claim", lower(regexp_replace(col("Claim"), "[^a-zA-Z\\s]", "")))

# Tokenize the claims column
claims_tokenized_august = tokenizer.transform(claims_august)

# Remove stopwords
claims_filtered_august = remover.transform(claims_tokenized_august)

# show sample output
claims_filtered_august.select("Claim", "filtered_words").show(10, truncate=False)

+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Claim                                                                                                                                                                                                                                                                                                                                     |filtered_words                                                                           

In [20]:
# vectorize words
claims_vectorized_august = vectorizer.fit(claims_filtered_august).transform(claims_filtered_august)

# Compute TF-IDF
claims_idf_august = idf.fit(claims_vectorized_august).transform(claims_vectorized_august)

In [24]:
# Apply PCA to reduce dimensionality with number of components = 34 
pca = PCA(k=34, inputCol="features", outputCol="pca_features")
claims_pca_august = pca.fit(claims_idf_august).transform(claims_idf_august)

# Apply K-means clustering
kmeans = KMeans(featuresCol="pca_features", k=4, seed=42)
model_august = kmeans.fit(claims_pca_august)

# Transform data with clustering
clustered_data_august = model_august.transform(claims_pca_august)

In [25]:
# Show clustering results
clustered_data_august.select("claim", "prediction").show(10)

+--------------------+----------+
|               claim|prediction|
+--------------------+----------+
|a viral video pur...|         0|
|video shows rss m...|         0|
|a viral social me...|         0|
|an image showing ...|         2|
|video posted by b...|         0|
|a second graphic ...|         0|
|video shows badla...|         0|
|a viral whatsapp ...|         0|
|a social media po...|         0|
|a video of virat ...|         0|
+--------------------+----------+
only showing top 10 rows



In [26]:
# Show the number of claims in each cluster
clustered_data_august.groupBy("prediction").count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1|    1|
|         3|    1|
|         2|    1|
|         0|   36|
+----------+-----+



In [27]:
# Apply k means clustering with k = 2
kmeans = KMeans(featuresCol="pca_features", k=2, seed=42)
model_august = kmeans.fit(claims_pca_august)

# Transform data with clustering
clustered_data_august = model_august.transform(claims_pca_august)

# Show clustering results
clustered_data_august.select("claim", "prediction").show(10)

# Show the number of claims in each cluster
clustered_data_august.groupBy("prediction").count().show()

+--------------------+----------+
|               claim|prediction|
+--------------------+----------+
|a viral video pur...|         0|
|video shows rss m...|         0|
|a viral social me...|         0|
|an image showing ...|         0|
|video posted by b...|         0|
|a second graphic ...|         0|
|video shows badla...|         0|
|a viral whatsapp ...|         0|
|a social media po...|         0|
|a video of virat ...|         0|
+--------------------+----------+
only showing top 10 rows

+----------+-----+
|prediction|count|
+----------+-----+
|         1|    1|
|         0|   38|
+----------+-----+



In [28]:
clustered_data_august.show(5)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|               Claim|               words|      filtered_words|        raw_features|            features|        pca_features|prediction|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|a viral video pur...|[a, viral, video,...|[purportedly, thr...|(403,[16,50,66,72...|(403,[16,50,66,72...|[-0.0915128380287...|         0|
|video shows rss m...|[video, shows, rs...|[rss, march, hary...|(403,[2,64,65,247...|(403,[2,64,65,247...|[-0.0052844992797...|         0|
|a viral social me...|[a, viral, social...|[post, whatsapp, ...|(403,[2,5,11,17,1...|(403,[2,5,11,17,1...|[0.38473107388179...|         0|
|an image showing ...|[an, image, showi...|[individual, cutt...|(403,[3,6,8,9,12,...|(403,[3,6,8,9,12,...|[8.83166495984445...|         0|
|video posted by b...|[vide