# Clustering with Latent Dirichlet Allocation

### Importing Libraries

In [46]:
from sklearn.datasets import fetch_20newsgroups
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from pyspark.ml.feature import StopWordsRemover, Tokenizer,CountVectorizer,IDF
import re
# Create a Spark session
spark = SparkSession.builder.appName("NewsgroupsPreprocessing").getOrCreate()
spark.sparkContext.setLogLevel("ERROR")
num_features = 8000  #vocabulary size
num_topics = 20      

### Load the 20 Newsgroups dataset

In [47]:
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
# Convert the text data to a DataFrame
df = spark.createDataFrame([(doc,) for doc in newsgroups.data], ["doc"])
df.show(1,truncate=False)

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|doc                                                                                                                                                                                                                                                                             

### Preprocessing the data for LDA to remove non-alphabetic characters

In [48]:
from pyspark.sql.functions import regexp_replace
# Define a UDF to apply regex and remove non-alphabetic characters
def clean_text(doc):
    cleaned_doc = re.sub(r"[^A-Za-z]", " ", doc)
    return " ".join(cleaned_doc.split())  # Remove extra spaces

# Register the UDF
clean_text_udf = udf(clean_text, StringType())

# Apply regex 
df_cleaned = df.withColumn("cleaned_doc", clean_text_udf("doc"))
# Replace newline characters with a space
df_cleaned = df.withColumn("cleaned_text", regexp_replace("doc", "\n", " "))
df_cleaned.select("cleaned_text").show(1,truncate=False)

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|cleaned_text                                                                                                                                                                                                                                                                                

### Tokenize,remove stop words and count vectorize

In [49]:
# Apply Tokenizer
tokenizer = Tokenizer(inputCol="cleaned_text", outputCol="tokens")
df_tokenized = tokenizer.transform(df_cleaned)

# Apply StopWordsRemover
stopwords_remover = StopWordsRemover(inputCol="tokens", outputCol="filtered_doc")
df_filtered = stopwords_remover.transform(df_tokenized).select("filtered_doc")



# Filter array elements with at least 4 characters
df_filtered = df_filtered.withColumn("filtered_array",expr("filter(filtered_doc, x -> len(x) >= 4)"))
df_filtered = df_filtered.select("filtered_array")
df_filtered.show(1,truncate=False)

# Apply CountVectorizer
count_vec = CountVectorizer(inputCol="filtered_array", outputCol="count_vec" ,vocabSize=num_features, minDF=2.0)

count_vec_model = count_vec.fit(df_filtered)


+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|filtered_array                                                                                                                                                                                                                                                                                                                                                                                                                                                                    |
+-------------------------------------------------------------

                                                                                

### Apply IDF

In [50]:
vocab = count_vec_model.vocabulary
newsgroups = count_vec_model.transform(df_filtered)
#newsgroups = newsgroups.drop('filtered_array')
    

# Apply IDF
idf = IDF(inputCol="count_vec", outputCol="features")
newsgroups = idf.fit(newsgroups).transform(newsgroups)
newsgroups = newsgroups.drop('tf_features')


                                                                                

### Apply LDA

In [53]:
from pyspark.ml.clustering import LDA
lda = LDA(k=num_topics, featuresCol="features", seed=0)
model = lda.fit(newsgroups)
transformed_data = model.transform(newsgroups)
transformed_data.show(5)

topics = model.describeTopics()
topics.show(5)

model.topicsMatrix()
    
topics_rdd = topics.rdd

topics_words = topics_rdd\
    .map(lambda row: row['termIndices'])\
    .map(lambda idx_list: [vocab[idx] for idx in idx_list])\
    .collect()

                                                                                

+--------------------+--------------------+--------------------+--------------------+
|      filtered_array|           count_vec|            features|   topicDistribution|
+--------------------+--------------------+--------------------+--------------------+
|[sure, bashers, p...|(8000,[3,10,17,20...|(8000,[3,10,17,20...|[2.16703122129341...|
|[brother, market,...|(8000,[16,25,132,...|(8000,[16,25,132,...|[4.53226407567457...|
|[finally, said, d...|(8000,[0,2,17,19,...|(8000,[0,2,17,19,...|[1.38682346408836...|
|[think!, scsi, ca...|(8000,[13,49,83,8...|(8000,[13,49,83,8...|[2.05397425634834...|
|[jasmine, drive, ...|(8000,[2,13,16,23...|(8000,[2,13,16,23...|[3.28641722820767...|
+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows

+-----+--------------------+--------------------+
|topic|         termIndices|         termWeights|
+-----+--------------------+--------------------+
|    0|[7, 756, 215, 344...|[0.11837926504713...|
|

### Showing the topics

In [54]:
for idx, topic in enumerate(topics_words):
    print ("topic: ", idx)
    print ("----------")
    for word in topic:
        print (word)  # word
    print ("----------")

topic:  0
----------
max>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'
myers:
president
health
administration
think
congress
going
jobs
white
----------
topic:  1
----------
armenian
armenians
jews
turkish
people
rights
jewish
israeli
right
government
----------
topic:  2
----------
window
available
version
server
file
files
anonymous
widget
motif
windows
----------
topic:  3
----------
vitamin
year
number
baseball
play
medical
year.
last
players
games
----------
topic:  4
----------
university
1993
greek
page
professor
objective
april
science
conference
space
----------
topic:  5
----------
israel
good
israeli
excellent
great
soldiers
playing
missing
arabs
cubs
----------
topic:  6
----------
file
data
entry
bits
code
char
source
oname,
output
files
----------
topic:  7
----------
subject:
message
theory
know
make
lines
computer
like
science
using
----------
topic:  8
----------
image
images
processing
graphics
data
batf
address
format
formats
contact:
----------
topic:  9

### Visualize the clusters (using the first two components)

In [None]:
# Visualize the clusters (using the first two components)
import matplotlib.pyplot as plt
topics = model.describeTopics(10)
topic_words = topics.select("termIndices").rdd.flatMap(lambda x: x[0]).collect()
vocab = count_vec_model.vocabulary
topic_words_list = [[vocab[i] for i in [indices]] for indices in topic_words]

plt.figure(figsize=(10, 6))
for i, words in enumerate(topic_words_list):
    plt.scatter(transformed_data.select("topicDistribution").collect()[i][0][0],
                transformed_data.select("topicDistribution").collect()[i][0][1],
                label=f"Topic {i}", s=100)
    for word in words:
        plt.annotate(word, (transformed_data.select("topicDistribution").collect()[i][0][0],
                            transformed_data.select("topicDistribution").collect()[i][0][1]),
                     textcoords="offset points", xytext=(0, 5), ha="center")

plt.title("LDA Clustering (20-Newsgroups)")
plt.xlabel("Topic 0")
plt.ylabel("Topic 1")
plt.legend()
plt.show()

                                                                                