# News Topic modeling


## I- Modules import

In [None]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import  IDF, HashingTF,CountVectorizer
from pyspark.ml import  Pipeline
from math import ceil,log2
from pyspark.ml.classification import LogisticRegression,NaiveBayes,LogisticRegressionModel
from pyspark.sql.functions import col,explode,split
from pyspark.ml import Pipeline

from pyspark.ml.clustering import LDA


import numpy as np


## II- Spark context and session creation

In [None]:
# Configure the session one step at a time: cluster master first, then the
# application name, and finally create (or reuse) the session.
session_builder = SparkSession.builder.master("spark://node02:7077")
session_builder = session_builder.appName("NewsTopicModeling")
spark = session_builder.getOrCreate()
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/08 15:20:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## III- Dataframe preparing

### 1. Load the data

In [None]:
# Load the pre-processed news dataset. Parquet files carry their own schema,
# so the CSV-only `header` / `inferSchema` options are not passed here.
df = spark.read.parquet("input/news.parquet")

                                                                                

### 2. Partition and cache the dataframe

In [None]:
# Inspect how many partitions the freshly loaded DataFrame has.
df.rdd.getNumPartitions()

9

In [None]:
# Redistribute the rows over 4 * 20 = 80 partitions, then mark the result as
# cached so later actions can reuse it.
num_partitions = 4 * 20
df = df.repartition(num_partitions)
df = df.cache()

In [None]:
# Confirm the partition count after repartitioning.
df.rdd.getNumPartitions()



80

### 3. Preview the data

In [None]:
# Total number of rows in the dataset.
df.count()

                                                                                

1716608

In [None]:
# Preview the first rows (label and filtered description).
df.show()

+--------------+--------------------+
|category_label|description_filtered|
+--------------+--------------------+
|          17.0|fourth july nupti...|
|          18.0|marilyn marilyn m...|
|          19.0|protrump troll ta...|
|          20.0|huffpost story gr...|
|          17.0|oct represent upr...|
|          20.0|yoga aid maine ho...|
|          19.0|portrait postapar...|
|          19.0|remuneration care...|
|          17.0|barbie ken marria...|
|          19.0|roald dahl true g...|
|          16.0|beyonc nt need sy...|
|          19.0|solidness gold li...|
|          19.0|bront society mel...|
|          17.0|wedding donts dig...|
|          19.0|dale chihuly youn...|
|          18.0|child divorce kid...|
|          17.0|bridal marketplac...|
|          16.0|tyrese apologizes...|
|          20.0|preacher speaks m...|
|          16.0|marcus antonius a...|
+--------------+--------------------+
only showing top 20 rows



In [None]:
# Print column names and types of the loaded DataFrame.
df.printSchema()

root
 |-- category_label: double (nullable = true)
 |-- description_filtered: string (nullable = true)



### 4. Convert filtered descriptions to arrays

In [None]:
# Tokenize: replace the space-separated description string with an array of words.
tokens = split(col('description_filtered'), ' ')
df = df.withColumn('description_filtered', tokens)
# Display the tokenized rows without truncating the arrays.
df.show(truncate=False)

+--------------+--------------------------------------------------------------------------------------------------+
|category_label|description_filtered                                                                              |
+--------------+--------------------------------------------------------------------------------------------------+
|17.0          |[fourth, july, nuptials, mate, recreates, big, solar, day, every, year]                           |
|18.0          |[marilyn, marilyn, monroe, quote, dearest, honour, death, day, remembrance]                       |
|19.0          |[protrump, troll, target, megyn, kelly, new, book, amazon, river]                                 |
|20.0          |[huffpost, story, grisly, killing, show, rampant, transphobia, brazil]                            |
|17.0          |[oct, represent, upright, calendar, month, trade, hymeneal, garb]                                 |
|20.0          |[yoga, aid, maine, honey, bigger, consistency]          

## IV- Feature Engineering


### 1. Explode the filtered descriptions to get the words

In [None]:
# Produce one row per word. The alias is applied to the exploded column itself
# (aliasing the DataFrame leaves the column with its default name `col`).
exploded_df = df.select(explode(df.description_filtered).alias('words'))
exploded_df.show()

+-----------+
|        col|
+-----------+
|     fourth|
|       july|
|   nuptials|
|       mate|
|  recreates|
|        big|
|      solar|
|        day|
|      every|
|       year|
|    marilyn|
|    marilyn|
|     monroe|
|      quote|
|    dearest|
|     honour|
|      death|
|        day|
|remembrance|
|   protrump|
+-----------+
only showing top 20 rows



### 2. Get unique words in the filtered_description

In [None]:
# Keep each word once by dropping duplicate rows of the exploded DataFrame.
unique_words=exploded_df.distinct()

### 3. Cache and show the unique words dataframe

In [None]:
# Cache the distinct words because they are both shown here and counted below.
unique_words=unique_words.cache()
unique_words.show()



+---------+
|      col|
+---------+
|    1970s|
|   travel|
|      art|
|    32812|
|    oscar|
|     pant|
|    hobbs|
|   outfit|
|    salma|
|   voyage|
|   patton|
|   online|
| everyday|
|traveling|
|  letdown|
|     hope|
|   teigen|
|recognize|
|     foxy|
|    still|
+---------+
only showing top 20 rows



                                                                                

### 4. Get the vocabulary size

In [None]:
# Number of distinct words across all descriptions; used below as the
# CountVectorizer `vocabSize`.
vocabulary_size=unique_words.count()
vocabulary_size

128622

In [None]:
# Document-frequency threshold passed to CountVectorizer below.
minDF=3.0 # The minimum document frequency

### 5. Define the CountVectorizer and IDF stages

In [None]:
# Term-count stage: CountVectorizer turns each token array into a count vector,
# with the vocabulary capped at `vocabulary_size` and filtered by `minDF`.
vectorizer = CountVectorizer(
    inputCol="description_filtered",
    outputCol="raw_features",
    vocabSize=vocabulary_size,
    minDF=minDF,
)
# IDF stage: reads the raw counts and writes the weighted `features` column.
idf = IDF(
    inputCol="raw_features",
    outputCol="features",
)

## V- Models set up, training and evaluation

### 1. Set up LDA model

We instantiate an LDA model with 30 topics.

We choose 30 topics because this is close to the 32 categories in our dataset.

In [None]:
# Number of topics the LDA model should extract.
num_topics = 30
# LDA estimator reading the `features` column, with a fixed seed.
lda = LDA(
    k=num_topics,
    seed=0,
    featuresCol="features",
)
lda

LDA_aa7f50f63c85

### 2. Set up pipelines

We set up a single pipeline that chains the following stages for the LDA topic model:

- CountVectorizer
- IDF
- LDA

In [None]:
# Chain the stages in execution order: term counts -> IDF weighting -> LDA.
lda_stages = [vectorizer, idf, lda]
pipeline = Pipeline(stages=lda_stages)
pipeline

Pipeline_d0dd0490e003

### 3. Split the data

First of all let us split the data into train and test set: 80% for train and 20% for test

In [None]:
# Random 80/20 split into training and test sets, seeded for repeatability.
split_weights = [0.8, 0.2]
train_set, test_set = df.randomSplit(split_weights, seed=0)

### 4. Create a function for model training

Let us create a function which takes as argument a model that it trains and then returns the trained model.

In [None]:
def train_model(model, data=None):
    """Fit ``model`` and return the fitted result.

    Args:
        model: Any object exposing ``fit(dataset)`` (in this notebook, the
            ``Pipeline`` built above).
        data: Dataset to fit on. When omitted, the notebook-level
            ``train_set`` is used, so existing ``train_model(model)`` calls
            behave exactly as before.

    Returns:
        Whatever ``model.fit`` returns.
    """
    if data is None:
        # Looked up at call time, so the function may be defined before the
        # cell that creates `train_set` has run.
        data = train_set
    return model.fit(data)

In [None]:
# Fit the whole pipeline (CountVectorizer -> IDF -> LDA) on the training set.
fitted_model=train_model(pipeline)
fitted_model

24/06/08 15:25:37 WARN DAGScheduler: Broadcasting large task binary with size 1984.0 KiB
24/06/08 15:25:40 WARN DAGScheduler: Broadcasting large task binary with size 1984.0 KiB
24/06/08 15:25:41 WARN DAGScheduler: Broadcasting large task binary with size 2000.9 KiB
24/06/08 15:25:42 WARN DAGScheduler: Broadcasting large task binary with size 2004.4 KiB
24/06/08 15:25:48 WARN DAGScheduler: Broadcasting large task binary with size 2005.7 KiB
24/06/08 15:25:50 WARN DAGScheduler: Broadcasting large task binary with size 2000.9 KiB
24/06/08 15:25:51 WARN DAGScheduler: Broadcasting large task binary with size 2004.4 KiB
24/06/08 15:25:55 WARN DAGScheduler: Broadcasting large task binary with size 2005.7 KiB
24/06/08 15:25:57 WARN DAGScheduler: Broadcasting large task binary with size 2000.9 KiB
24/06/08 15:25:58 WARN DAGScheduler: Broadcasting large task binary with size 2004.4 KiB
24/06/08 15:26:01 WARN DAGScheduler: Broadcasting large task binary with size 2005.7 KiB
24/06/08 15:26:03 WAR

PipelineModel_1e76b8ba508c

### 5. Visualize the topics

In [None]:
# Stage 0 of the fitted pipeline is the fitted CountVectorizer; its vocabulary
# is the list that term indices point into.
fitted_vectirizer=fitted_model.stages[0]
vocabulary= fitted_vectirizer.vocabulary
# Number of terms kept in the fitted vocabulary.
len(vocabulary)

73411

In [None]:
# Peek at the first ten vocabulary entries.
vocabulary[:10]

['new', 'photo', 'state', 'trump', 'day', 'nt', 'say', 'woman', 'get', 'make']

In [None]:
# The last pipeline stage is the fitted LDA model; describeTopics() lists,
# per topic, the term indices and their weights.
topics = fitted_model.stages[-1].describeTopics()
topics.show()

# Translate each topic's term indices into the corresponding vocabulary words
# in a single pass over the rows, then bring the result back to the driver.
topics_rdd = topics.rdd
topics_words = topics_rdd.map(
    lambda row: [vocabulary[idx] for idx in row['termIndices']]
).collect()
topics_words[:2]

+-----+--------------------+--------------------+
|topic|         termIndices|         termWeights|
+-----+--------------------+--------------------+
|    0|[31, 137, 101, 31...|[0.00985414811177...|
|    1|[107, 195, 330, 3...|[0.01330611945988...|
|    2|[30, 16, 57, 216,...|[0.01347184457240...|
|    3|[379, 205, 622, 4...|[0.00632639774673...|
|    4|[133, 282, 103, 4...|[0.00819584782418...|
|    5|[673, 430, 776, 6...|[0.00602703665300...|
|    6|[100, 123, 958, 8...|[0.01240321325915...|
|    7|[230, 328, 366, 3...|[0.01068304711893...|
|    8|[389, 541, 235, 5...|[0.00606431922978...|
|    9|[26, 56, 312, 174...|[0.02147667718198...|
|   10|[21, 3, 36, 191, ...|[0.01721008314916...|
|   11|[46, 2, 352, 169,...|[0.01180906612486...|
|   12|[179, 114, 397, 5...|[0.01102264094365...|
|   13|[534, 335, 765, 7...|[0.00627261569473...|
|   14|[426, 386, 575, 7...|[0.00693743278268...|
|   15|[694, 175, 584, 1...|[0.00462890601493...|
|   16|[814, 537, 737, 1...|[0.00570331910335...|


                                                                                

[['best',
  'space',
  'national',
  'administration',
  'recipe',
  'aeronautics',
  'ever',
  'date',
  'charles',
  'herbert'],
 ['change',
  'climate',
  'pope',
  'trump',
  'francis',
  'take',
  'hope',
  'small',
  '2016',
  'movement']]

In [None]:
# Print every topic's words between star separators and, at the same time,
# build a {topic index: list of words} mapping.
topics_words_dict = {}
for idx, topic in enumerate(topics_words):
    print("topic: {}".format(idx))
    print("*"*25)
    for word in topic:
        print(word)
    # Store a fresh list holding this topic's words in their original order.
    topic_words_array = list(topic)
    topics_words_dict[idx] = topic_words_array
    print("*"*25)

topic: 0
*************************
best
space
national
administration
recipe
aeronautics
ever
date
charles
herbert
*************************
topic: 1
*************************
change
climate
pope
trump
francis
take
hope
small
2016
movement
*************************
topic: 2
*************************
coronavirus
covid19
case
update
new
2020
death
concern
report
30
*************************
topic: 3
*************************
vitamin
better
happy
become
24hour
interval
tie
shopping
day
fresh
*************************
topic: 4
*************************
law
cause
sex
problem
china
learn
stay
aid
winter
serve
*************************
topic: 5
*************************
lost
lesson
weight
final
biden
angle
joe
meditation
hour
hollywood
*************************
topic: 6
*************************
great
dog
taylor
kingdom
obama
talking
unwashed
security
track
britain
*************************
topic: 7
*************************
clinton
hillary
name
hotel
vaccine
really
monophosphate
deoxyadenosi

In [None]:
# Display the topic index -> words mapping built above.
topics_words_dict

{0: ['best',
  'space',
  'national',
  'administration',
  'recipe',
  'aeronautics',
  'ever',
  'date',
  'charles',
  'herbert'],
 1: ['change',
  'climate',
  'pope',
  'trump',
  'francis',
  'take',
  'hope',
  'small',
  '2016',
  'movement'],
 2: ['coronavirus',
  'covid19',
  'case',
  'update',
  'new',
  '2020',
  'death',
  'concern',
  'report',
  '30'],
 3: ['vitamin',
  'better',
  'happy',
  'become',
  '24hour',
  'interval',
  'tie',
  'shopping',
  'day',
  'fresh'],
 4: ['law',
  'cause',
  'sex',
  'problem',
  'china',
  'learn',
  'stay',
  'aid',
  'winter',
  'serve'],
 5: ['lost',
  'lesson',
  'weight',
  'final',
  'biden',
  'angle',
  'joe',
  'meditation',
  'hour',
  'hollywood'],
 6: ['great',
  'dog',
  'taylor',
  'kingdom',
  'obama',
  'talking',
  'unwashed',
  'security',
  'track',
  'britain'],
 7: ['clinton',
  'hillary',
  'name',
  'hotel',
  'vaccine',
  'really',
  'monophosphate',
  'deoxyadenosine',
  'town',
  'san'],
 8: ['head',
  'gr

### 6. Get topics distributions

In [None]:
# Keep a handle on the fitted LDA stage (last stage of the pipeline model).
lda_model = fitted_model.stages[-1]

# Run both splits through the fitted pipeline.
train_set_transformed = fitted_model.transform(train_set)
test_set_transformed = fitted_model.transform(test_set)

# Keep only the tokens and the per-document topic distribution.
topic_columns = ["description_filtered", "topicDistribution"]
train_topic_distributions = train_set_transformed.select(*topic_columns)
test_topic_distributions = test_set_transformed.select(*topic_columns)

In [None]:
# Print the untruncated topic distributions: training set first, then test set.
for distributions in (train_topic_distributions, test_topic_distributions):
    distributions.show(truncate=False)

24/06/08 15:27:58 WARN DAGScheduler: Broadcasting large task binary with size 18.7 MiB


+-----------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|description_filtered                                                                                             |topicDistribution                                                                                                     

24/06/08 15:27:59 WARN DAGScheduler: Broadcasting large task binary with size 18.7 MiB


+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|description_filtered                                                                                                                                                        |topic

In [None]:
# `topics_words_array` is not defined in any visible cell, so a clean
# top-to-bottom run would raise NameError here. Rebuild it from
# `topics_words_dict`.
# NOTE(review): the recorded output matches the last listed word of each
# topic — confirm this is the intended content.
topics_words_array = [words[-1] for words in topics_words_dict.values() if words]
topics_words_array

['herbert',
 'movement',
 '30',
 'fresh',
 'serve',
 'hollywood',
 'britain',
 'san',
 'car',
 'nt',
 'word',
 'together',
 'embody',
 'reality',
 '14',
 'police',
 'flight',
 'district',
 'day',
 'prison',
 'look',
 'antiophthalmic',
 'investment',
 'day',
 'class',
 'h',
 'pop',
 'love',
 'union',
 'thanksgiving']

### 8. Save the model and other important data

In [None]:
# Persist the fitted pipeline model under the output directory.
# NOTE(review): save() presumably fails if this path already exists — confirm
# before re-running the notebook.
fitted_model.save('output/news_topic_model')

24/06/08 16:15:31 WARN TaskSetManager: Stage 153 contains a task of very large size (17567 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [None]:
# Distinct-word count computed earlier (the value given to `vocabSize`).
vocabulary_size

128622

In [None]:
#len(vocabulary)

73411

In [None]:
import json

# Record the minDF setting and the counted vocabulary size used for this run.
data = dict(minDF=minDF, vocabulary_size=vocabulary_size)

# Destination file, placed inside the saved model directory.
file_name = 'output/news_topic_model/extra.json'

# Dump the settings as indented JSON.
with open(file_name, mode='w') as json_file:
    json.dump(data, fp=json_file, indent=4)

print(f"Data successfully written to {file_name}")


Data successfully written to output/news_topic_model/extra.json


In [None]:
# Destination file for the topic -> words mapping, inside the model directory.
file_name = 'output/news_topic_model/topics.json'

# Dump the mapping as indented JSON (JSON object keys are always strings, so
# the integer topic indices are written as strings).
with open(file_name, mode='w') as json_file:
    json.dump(topics_words_dict, fp=json_file, indent=4)

print(f"Data successfully written to {file_name}")

Data successfully written to output/news_topic_model/topics.json


## VII- Summary

In this notebook we built a topic model for our news descriptions using Latent Dirichlet Allocation (LDA).

 The descriptions were tokenized, vectorized with CountVectorizer, and re-weighted with IDF; an LDA model with 30 topics was then fitted on 80% of the data.

 We then inspected the top words of each topic, computed the topic distributions of the training and test sets, and saved the fitted pipeline together with its settings and the topic words.

 The next step of our work will be to ...

In [None]:
#df.unpersist()
#spark.stop()