# News Topic modeling


## I- Modules import

In [182]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession,Row
from pyspark.ml.feature import CountVectorizer,IDF
from pyspark.ml import Pipeline
from pyspark.sql.functions import col,explode,split
from pyspark.ml.clustering import LDA

## II- Spark context and session creation

In [188]:
spark = (SparkSession.builder
    .master("spark://node02:7077")
    .appName("NewsTopicModeling")
    .getOrCreate()
        )
spark

In [189]:
#spark.stop()

## III- Dataframe preparing

### 1. Load the data

This is a clustering task which is an unsipervised Machine Learning approach. We then do not need to load the cateegories.

In [190]:
# Load the data: Only the filtered descritpion
df = spark.read.parquet("input/news.parquet", header=True, inferSchema=True).select('description_filtered')

                                                                                

### 2. Partition and cache the dataframe

In [192]:
df.rdd.getNumPartitions()

9

In [193]:
# Use 4 partitions per core
num_partitions=4*40
df= df.repartition(num_partitions).cache()

In [194]:
df.rdd.getNumPartitions()



160

### 3. Preview the data

In [195]:
df.count()

                                                                                

1716608

In [196]:
df.show()

+--------------------+
|description_filtered|
+--------------------+
|republic fiji sta...|
|calif schooltime ...|
|enlist enraged ho...|
|101 effective sim...|
|united arab repub...|
|big detonation ca...|
|highpriced troika...|
|community harvest...|
|year blackfriar d...|
|11 papa yankee st...|
|lilliputian girlf...|
|finally atomic nu...|
|class opportunity...|
|dog live furbulou...|
|boulevard star ro...|
|  fuck better others|
|superhero promena...|
|film review james...|
|taiwan milepost d...|
|cecile ivor armst...|
+--------------------+
only showing top 20 rows



In [197]:
df.printSchema()

root
 |-- description_filtered: string (nullable = true)



### 4. Convert filtered descriptions to arrays

In [198]:
# Create a new DataFrame with description_filtered as arrays
df= df.withColumn('description_filtered', split(col('description_filtered'), ' '))
# Show the new DataFrame
df.show(truncate=False)

+----------------------------------------------------------------------------------------------------------+
|description_filtered                                                                                      |
+----------------------------------------------------------------------------------------------------------+
|[republic, fiji, starting, time, respondent, train, address, sexual, force, tragedy]                      |
|[calif, schooltime, fire, teacher, mock, pupil, participating, immigrant, bang]                           |
|[enlist, enraged, hoi, polloi, vow, end, alzheimer, 2020]                                                 |
|[101, effective, simple, drill, eff, deeply]                                                              |
|[united, arab, republic, yisrael, palestinian, arab]                                                      |
|[big, detonation, careen, confederate, state, america, la, police, capture, illegal, pyrotechnic]         |
|[highpriced, troik

In [199]:
#df=df.select('description_filtered')
#df.show()

## IV- Feature Engineering


### 1. Explode the filtered descriptions to get the words

In [200]:
exploded_df=df.select(explode(df.description_filtered)).alias('words')
exploded_df.show()

+-------------+
|          col|
+-------------+
|     republic|
|         fiji|
|     starting|
|         time|
|   respondent|
|        train|
|      address|
|       sexual|
|        force|
|      tragedy|
|        calif|
|   schooltime|
|         fire|
|      teacher|
|         mock|
|        pupil|
|participating|
|    immigrant|
|         bang|
|       enlist|
+-------------+
only showing top 20 rows



### 2. Get unique words in the filtered_description

In [202]:
unique_words=exploded_df.distinct()

### 3. Cache and show the unique words dataframe

In [203]:
unique_words=unique_words.cache()
unique_words.show()



+----------+
|       col|
+----------+
|      hope|
|       art|
|     oscar|
|    online|
|     crest|
|     trail|
|      iffy|
|   melodic|
| traveling|
|     still|
|    gloria|
|    poetry|
|      pant|
|    honcho|
|  medicare|
| recognize|
|    filing|
|  vladimir|
|rejuvenate|
|   alquran|
+----------+
only showing top 20 rows



                                                                                

### 4. Get the vocabulary size

In [204]:
vocabulary_size=unique_words.count()
vocabulary_size

128622

### 5. Define the CountVectorizer and IDF stages

In [205]:
# Define the HashingTF and IDF stages
vectorizer = CountVectorizer(inputCol="description_filtered", outputCol="raw_features",vocabSize=vocabulary_size, minDF=3.0)
idf = IDF(inputCol="raw_features", outputCol="features")

## V- Models set up, training and evaluation

### 1. Set up LDA model

Let us define our model
We will use 30 topics since: 30 is close to the number of categories in our news data.

In [210]:
num_topics = 30
#lda = LDA(k=num_topics, maxIter=10)
lda = LDA(featuresCol="features",seed=0,k=num_topics)
lda

LDA_b97404b1c336

### 2. Set up pipeline

We will  set up the pipeline of the following transformations for LDA

- CountVectorizer
- IDF

In [211]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import CountVectorizer, IDF
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.clustering import LDA
 
# Create pipeline for LDA
pipeline = Pipeline(stages=[vectorizer, idf, lda]) 


pipeline

Pipeline_7d0652e36abc

### 3. Split the data

First of all let us split the data into train and test set: **80%** for train and **20%** for test

In [212]:
# Split data
(train_set, test_set) = df.randomSplit([0.8, 0.2], seed=0)

### 4. Model training

In [213]:
def train_model(model):    
    return model.fit(train_set)

In [214]:
fitted_model=train_model(pipeline)
fitted_model

24/06/07 11:31:53 WARN DAGScheduler: Broadcasting large task binary with size 1986.5 KiB
24/06/07 11:31:57 WARN DAGScheduler: Broadcasting large task binary with size 1986.5 KiB
24/06/07 11:31:57 WARN DAGScheduler: Broadcasting large task binary with size 2003.4 KiB
24/06/07 11:31:58 WARN DAGScheduler: Broadcasting large task binary with size 2006.9 KiB
24/06/07 11:32:05 WARN DAGScheduler: Broadcasting large task binary with size 2008.2 KiB
24/06/07 11:32:07 WARN DAGScheduler: Broadcasting large task binary with size 2003.4 KiB
24/06/07 11:32:08 WARN DAGScheduler: Broadcasting large task binary with size 2006.9 KiB
24/06/07 11:32:13 WARN DAGScheduler: Broadcasting large task binary with size 2008.2 KiB
24/06/07 11:32:15 WARN DAGScheduler: Broadcasting large task binary with size 2003.4 KiB
24/06/07 11:32:16 WARN DAGScheduler: Broadcasting large task binary with size 2006.9 KiB
24/06/07 11:32:20 WARN DAGScheduler: Broadcasting large task binary with size 2008.2 KiB
24/06/07 11:32:22 WAR

PipelineModel_00b501ec3d9e

### 5. Visualize the topics

In [215]:
fitted_vectorizer=fitted_model.stages[0]
vocabulary= fitted_vectorizer.vocabulary
len(vocabulary)

73480

In [216]:
vocabulary[:10]

['new', 'photo', 'state', 'trump', 'day', 'nt', 'say', 'woman', 'get', 'make']

In [217]:
topics = fitted_model.stages[-1].describeTopics()   
topics.show()

+-----+--------------------+--------------------+
|topic|         termIndices|         termWeights|
+-----+--------------------+--------------------+
|    0|[52, 430, 465, 40...|[0.00755588702618...|
|    1|[439, 792, 516, 6...|[0.00715646069959...|
|    2|[256, 279, 1, 591...|[0.00689308452009...|
|    3|[56, 96, 5, 313, ...|[0.01890835473680...|
|    4|[291, 456, 703, 8...|[0.00919124056313...|
|    5|[375, 697, 782, 8...|[0.00636053853579...|
|    6|[146, 235, 198, 9...|[0.00652703106352...|
|    7|[135, 99, 155, 31...|[0.01118587124523...|
|    8|[311, 858, 0, 103...|[0.00809170294982...|
|    9|[26, 195, 231, 21...|[0.01013236656762...|
|   10|[16, 32, 395, 55,...|[0.01303652311645...|
|   11|[21, 92, 37, 3, 1...|[0.02038315645753...|
|   12|[384, 492, 237, 5...|[0.00808566906139...|
|   13|[31, 242, 440, 54...|[0.01250681082572...|
|   14|[587, 263, 750, 7...|[0.00629233566886...|
|   15|[149, 409, 333, 7...|[0.00759634333462...|
|   16|[4, 75, 192, 352,...|[0.01246747117105...|


In [218]:
topics_rdd = topics.rdd
topics_words = topics_rdd\
       .map(lambda row: row['termIndices'])\
       .map(lambda idx_list: [vocabulary[idx] for idx in idx_list])\
       .collect()
topics_words[:2]

                                                                                

[['republic',
  'monophosphate',
  'deoxyadenosine',
  'advice',
  'china',
  'street',
  'side',
  'cuba',
  'despite',
  'control'],
 ['vacation',
  'halloween',
  'everything',
  'testament',
  'meditation',
  'thanksgiving',
  'better',
  'edward',
  'everyone',
  'williams']]

In [219]:
for idx, topic in enumerate(topics_words):
    print("topic: {}".format(idx))
    print("*"*25)
    for word in topic:
       print(word)
    print("*"*25)

topic: 0
*************************
republic
monophosphate
deoxyadenosine
advice
china
street
side
cuba
despite
control
*************************
topic: 1
*************************
vacation
halloween
everything
testament
meditation
thanksgiving
better
edward
everyone
williams
*************************
topic: 2
*************************
system
prison
photo
term
future
shoot
michael
governor
base
rescue
*************************
topic: 3
*************************
marriage
sexual
nt
ceremony
history
wo
real
assault
life
proposal
*************************
topic: 4
*************************
father
dad
eat
minute
lifetime
fighting
say
economic
european
leslie
*************************
topic: 5
*************************
federal
investigation
song
july
attorney
upwards
carolina
ve
thomas
note
*************************
topic: 6
*************************
angstrom
unit
tree
lot
mask
san
pine
someone
trust
club
*************************
topic: 7
*************************
space
national
really
admin

### Get topics distributions

In [220]:
# Transform the training and test data
train_set_transformed = fitted_model.transform(train_set)
test_set_transformed = fitted_model.transform(test_set)

# Get the LDA model from the pipeline model
lda_model = fitted_model.stages[-1]

# Extract the topic distributions
train_topic_distributions = train_set_transformed.select("description_filtered", "topicDistribution")
test_topic_distributions = test_set_transformed.select("description_filtered", "topicDistribution")

In [221]:
# Show the topic distributions for the training set
train_topic_distributions.show(truncate=False)

# Show the topic distributions for the test set
test_topic_distributions.show(truncate=False)

24/06/07 11:34:20 WARN DAGScheduler: Broadcasting large task binary with size 18.7 MiB


+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|description_filtered                                                                                                            |topicDistribution                                                                       

24/06/07 11:34:20 WARN DAGScheduler: Broadcasting large task binary with size 18.7 MiB


+-----------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|description_filtered                                                                     |topicDistribution                                                                                                                                                      

### 5. Evaluate the model

To evaluate our model, we use log perplexity as evaluation metric.
The evaluation is done on both train and test split.

In [222]:
# Function to evaluate model and get best parameters
def evaluate_model(fitted_model,data_transformed=[train_set_transformed,test_set_transformed]):
    
    print('Evaluating the model on training set')
    train_lp = fitted_model.logPerplexity(data_transformed[0])

    print('Evaluating the model on test set')
    test_lp = fitted_model.logPerplexity(data_transformed[1])
    
    print("The upper bound on perplexity for train set: " + str(train_lp))
    print("The upper bound on perplexity for test set: " + str(test_lp))
    return train_lp, test_lp

In [None]:
train_lp,test_lp=evaluate_model(fitted_model.stages[-1])
train_lp,test_lp

Evaluating the model on training set


24/06/07 12:02:26 WARN DAGScheduler: Broadcasting large task binary with size 1987.3 KiB
24/06/07 13:01:45 WARN DAGScheduler: Broadcasting large task binary with size 1988.2 KiB
[Stage 157:>                                                     (1 + 40) / 160]

### 6. Interpret the results

Our model sets a log perplexity of  on the train set and ... on the test set. This implies that the model does overfit since there is no significanc edifference between the log perplexity.

In [224]:
model=fitted_model.stages[-1]
model

LocalLDAModel: uid=LDA_b97404b1c336, k=30, numFeatures=73480

### 7. Save the best model

In [225]:
model.save('output/news_topics_detection_model')

24/06/07 11:59:12 WARN TaskSetManager: Stage 153 contains a task of very large size (17584 KiB). The maximum recommended task size is 1000 KiB.

## VI- Summary

In this notebook we have studied created LDA model to perform topic detection. We use 30 as number of topics.

In [None]:
# Remove the cache and stop the spark session

#df.unpersist()
#spark.stop()