# News Topic modeling


## I- Modules import

In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.feature import  IDF, HashingTF,CountVectorizer
from pyspark.ml import  Pipeline
from math import ceil,log2
from pyspark.ml.classification import LogisticRegression,NaiveBayes,LogisticRegressionModel
from pyspark.sql.functions import col,explode,split

import numpy as np
from pyspark.ml.clustering import LDA

## II- Spark context and session creation

In [2]:
# Create the Spark session for this notebook, or reuse the one that is
# already running, against the standalone cluster master.
spark = (
    SparkSession.builder
    .master("spark://node02:7077")
    .appName("NewsTopicModeling")
    .getOrCreate()
)
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/06 06:07:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## III- Dataframe preparing

### 1. Load the data

In [3]:
# Load the pre-processed news dataset.
# Parquet files embed their own schema, so the CSV-only reader options
# `header` / `inferSchema` are not needed here.
df = spark.read.parquet("input/news.parquet")

                                                                                

In [4]:
#spark.stop()

### 2. Partition and cache the dataframe

In [5]:
df.rdd.getNumPartitions()

9

In [6]:
num_partitions=4*20
df= df.repartition(num_partitions).cache()

In [7]:
df.rdd.getNumPartitions()



80

### 3. Preview the data

In [8]:
df.count()

                                                                                

1716608

In [9]:
df.show()

+--------------+--------------------+
|category_label|description_filtered|
+--------------+--------------------+
|           7.0|orchard apple tre...|
|           8.0|devolution sectio...|
|           5.0|live leonard bloo...|
|           4.0|top u general say...|
|           9.0|daddy chore go by...|
|           7.0|metamorphosis kaf...|
|           8.0|soundbox miss 4ye...|
|           8.0|90 covid19 patien...|
|           6.0|appealingness med...|
|           9.0|nurture tiddler e...|
|           7.0|infinix zero 8 me...|
|           5.0|new figure lake o...|
|           6.0|medico get seriou...|
|           8.0|bbc bias ruling c...|
|           8.0|microsoft buy tik...|
|           9.0|lie told moment baby|
|           7.0|tinder automaton ...|
|           9.0|yr one fair sex j...|
|           4.0|president mnangag...|
|           6.0|actually tap wate...|
+--------------+--------------------+
only showing top 20 rows



In [10]:
df.printSchema()

root
 |-- category_label: double (nullable = true)
 |-- description_filtered: string (nullable = true)



### 4. Convert filtered descriptions to arrays

In [11]:
# Tokenise each description: replace the space-separated string column with
# an array-of-words column of the same name.
df = df.withColumn(
    'description_filtered',
    split(col('description_filtered'), ' '),
)
# Display the rows without truncation so the full token arrays are visible.
df.show(truncate=False)

+--------------+-----------------------------------------------------------------------------------------------------------+
|category_label|description_filtered                                                                                       |
+--------------+-----------------------------------------------------------------------------------------------------------+
|7.0           |[orchard, apple, tree, pulsation, powerbeats, pro, earbuds, crazy, tinny, today, exclusively, refurbished] |
|8.0           |[devolution, section, staff, isolate, aureole, case]                                                       |
|5.0           |[live, leonard, bloomfield, give, prescribed, covid19, update, 1pm, latest, subject, figure]               |
|4.0           |[top, u, general, say, north, korea, military, posture, unchanged, amid, tension]                          |
|9.0           |[daddy, chore, go, byebye]                                                                                 |


## IV- Feature Engineering


### 1. Explode the filtered descriptions to get the words

In [12]:
# One row per token. The alias must be applied to the exploded *column*:
# aliasing the DataFrame (as before) leaves the column with the default
# name `col`.
exploded_df = df.select(explode(df.description_filtered).alias('words'))
exploded_df.show()

+-----------+
|        col|
+-----------+
|    orchard|
|      apple|
|       tree|
|  pulsation|
| powerbeats|
|        pro|
|    earbuds|
|      crazy|
|      tinny|
|      today|
|exclusively|
|refurbished|
| devolution|
|    section|
|      staff|
|    isolate|
|    aureole|
|       case|
|       live|
|    leonard|
+-----------+
only showing top 20 rows



In [13]:
#df=df.unpersist()

### 2. Get unique words in the filtered_description

In [14]:
unique_words=exploded_df.distinct()

### 3. Cache and show the unique words dataframe

In [15]:
unique_words=unique_words.cache()
unique_words.show()



+---------+
|      col|
+---------+
| inverted|
|    mammy|
|    oscar|
|   online|
|    poppy|
|    still|
|   travel|
|traveling|
|  barrier|
|  elevate|
|  jewelry|
|     pant|
|      art|
|recognize|
|   patton|
| carnegie|
|  blossom|
|    inner|
|   90hour|
|  quatern|
+---------+
only showing top 20 rows



                                                                                

### 4. Get the vocabulary size

In [16]:
vocabulary_size=unique_words.count()
vocabulary_size

128622

### 5. Define the CountVectorizer and IDF stages

In [18]:
# Define the CountVectorizer and IDF stages.
# CountVectorizer turns each token array into term counts ("raw_features"),
# capped at the vocabulary size computed above and ignoring terms that fall
# below the minDF=3.0 document-frequency threshold.
vectorizer = CountVectorizer(inputCol="description_filtered", outputCol="raw_features",vocabSize=vocabulary_size, minDF=3.0)
# IDF re-weights the raw counts and writes the result to "features".
idf = IDF(inputCol="raw_features", outputCol="features")

## V- Models set up, training and evaluation

### 1. Set up LDA model

In [19]:
# Earlier experiment with an explicit topic count, kept for reference:
#num_topics = 20
#lda = LDA(k=num_topics, maxIter=10)
# LDA estimator reading the "features" column, with a fixed seed.
# No `k` is passed, so the library default number of topics applies (the
# describeTopics() output later in this notebook lists topics 0-9).
lda = LDA(featuresCol="features",seed=0)
lda

LDA_b18524a02c25

### 2. Set up pipelines

We will set up a pipeline that chains the following transformations ahead of the LDA model:

- CountVectorizer
- IDF

In [20]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import CountVectorizer, IDF
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.clustering import LDA
 
# Create pipeline for LDA
pipeline = Pipeline(stages=[vectorizer, idf, lda]) 


pipeline

Pipeline_ccc22729faf7

### 3. Split the data

First of all let us split the data into train and test set: 80% for train and 20% for test

In [21]:
# Split data
(train_set, test_set) = df.randomSplit([0.8, 0.2], seed=0)

### 4. Create a function for model training

Let us create a function which takes as argument a model that it trains and then returns the trained model.

In [22]:
def train_model(model, data=None):
    """Fit ``model`` and return the fitted result.

    Parameters
    ----------
    model
        Any object exposing ``fit(dataset)``, e.g. a ``Pipeline``.
    data
        Dataset handed to ``model.fit``. When omitted, the notebook-level
        ``train_set`` is used, which keeps existing ``train_model(model)``
        calls working unchanged.

    Returns
    -------
    Whatever ``model.fit`` returns (the fitted model).
    """
    if data is None:
        # Resolved at call time, so the current train split is always used.
        data = train_set
    return model.fit(data)

In [23]:
fitted_model=train_model(pipeline)
fitted_model

24/06/06 06:12:04 WARN DAGScheduler: Broadcasting large task binary with size 1984.0 KiB
24/06/06 06:12:08 WARN DAGScheduler: Broadcasting large task binary with size 1984.0 KiB
24/06/06 06:12:08 WARN DAGScheduler: Broadcasting large task binary with size 2000.9 KiB
24/06/06 06:12:11 WARN DAGScheduler: Broadcasting large task binary with size 2004.0 KiB
24/06/06 06:12:16 WARN DAGScheduler: Broadcasting large task binary with size 2005.1 KiB
24/06/06 06:12:18 WARN DAGScheduler: Broadcasting large task binary with size 2000.9 KiB
24/06/06 06:12:19 WARN DAGScheduler: Broadcasting large task binary with size 2004.0 KiB
24/06/06 06:12:23 WARN DAGScheduler: Broadcasting large task binary with size 2005.1 KiB
24/06/06 06:12:27 WARN DAGScheduler: Broadcasting large task binary with size 2000.9 KiB
24/06/06 06:12:28 WARN DAGScheduler: Broadcasting large task binary with size 2004.0 KiB
24/06/06 06:12:30 WARN DAGScheduler: Broadcasting large task binary with size 2005.1 KiB
24/06/06 06:12:34 WAR

PipelineModel_6308ba4e1161

### 5. Visualize the topics

In [25]:
# The first pipeline stage is the fitted CountVectorizer; its vocabulary maps
# term indices back to words.
fitted_vectorizer = fitted_model.stages[0]
vocabulary = fitted_vectorizer.vocabulary
len(vocabulary)

73411

In [29]:
vocabulary[:10]

['new', 'photo', 'state', 'trump', 'day', 'nt', 'say', 'woman', 'get', 'make']

In [34]:
topics = fitted_model.stages[-1].describeTopics()   
topics.show()

+-----+--------------------+--------------------+
|topic|         termIndices|         termWeights|
+-----+--------------------+--------------------+
|    0|[137, 4, 2, 7, 31...|[0.00416672987347...|
|    1|[3, 107, 36, 195,...|[0.00435582030622...|
|    2|[16, 30, 0, 57, 1...|[0.00500547082609...|
|    3|[4, 1, 14, 11, 26...|[0.00423027191759...|
|    4|[27, 2, 0, 47, 74...|[0.00264409013447...|
|    5|[35, 187, 114, 0,...|[0.00352678941107...|
|    6|[100, 0, 21, 53, ...|[0.00365710313344...|
|    7|[230, 328, 3, 438...|[0.00399745479961...|
|    8|[1, 0, 8, 33, 7, ...|[0.00347085298628...|
|    9|[21, 26, 56, 312,...|[0.00446111456976...|
+-----+--------------------+--------------------+



In [38]:
# describeTopics() yields one row per topic, so the result is tiny: collect it
# and translate term indices to words on the driver. This avoids a Python RDD
# job that would ship the whole vocabulary list to the executors.
topics_words = [
    [vocabulary[idx] for idx in row['termIndices']]
    for row in topics.collect()
]
topics_words[:2]

[['space',
  'day',
  'state',
  'woman',
  'administration',
  'national',
  'love',
  'best',
  'tree',
  'aeronautics'],
 ['trump',
  'change',
  'donald',
  'climate',
  'pope',
  'photo',
  'food',
  'take',
  'right',
  'say']]

In [39]:
# Print every topic as a numbered block of its top words, framed by a
# separator line above and below the word list.
separator = "*" * 25
for topic_id, topic_terms in enumerate(topics_words):
    print("topic: {}".format(topic_id))
    print(separator)
    for term in topic_terms:
        print(term)
    print(separator)

topic: 0
*************************
space
day
state
woman
administration
national
love
best
tree
aeronautics
*************************
topic: 1
*************************
trump
change
donald
climate
pope
photo
food
take
right
say
*************************
topic: 2
*************************
covid19
coronavirus
new
case
r
2020
bank
year
u
ha
*************************
topic: 3
*************************
day
photo
5
video
wedding
make
way
new
person
nt
*************************
topic: 4
*************************
american
state
new
child
white
sexual
problem
law
nt
learn
*************************
topic: 5
*************************
police
officer
hour
new
nt
say
trump
photo
man
like
*************************
topic: 6
*************************
great
new
bank
news
house
state
york
trump
one
dog
*************************
topic: 7
*************************
clinton
hillary
trump
monophosphate
deoxyadenosine
photo
san
greater
state
joe
*************************
topic: 8
*************************
phot

### Get topics distributions

In [47]:
# Transform the training and test data
train_set_transformed = fitted_model.transform(train_set)
test_set_transformed = fitted_model.transform(test_set)

# Get the LDA model from the pipeline model
lda_model = fitted_model.stages[-1]

# Extract the topic distributions
train_topic_distributions = train_set_transformed.select("description_filtered", "topicDistribution")
test_topic_distributions = test_set_transformed.select("description_filtered", "topicDistribution")

In [48]:
# Show the topic distributions for the training set
train_topic_distributions.show(truncate=False)

# Show the topic distributions for the test set
test_topic_distributions.show(truncate=False)

24/06/06 06:28:16 WARN DAGScheduler: Broadcasting large task binary with size 7.5 MiB
                                                                                

+-----------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|description_filtered                                                                                             |topicDistribution                                                                                                                                                                                                      |
+-----------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[10

24/06/06 06:28:17 WARN DAGScheduler: Broadcasting large task binary with size 7.5 MiB


+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|description_filtered                                                                                                                                                        |topicDistribution                                                                                                                                                                                                     |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------

In [None]:
# Sweep the number of topics and record the log-perplexity bound on the train
# and test splits for each value of k.
results = {}
num_topics_range = [20, 25, 30, 35, 40, 45, 50]

for num_topics in num_topics_range:
    print('LDA for k={}'.format(num_topics))
    # The topic count has to be passed explicitly; otherwise every iteration
    # would train the same default-k model.
    lda_k = LDA(k=num_topics, featuresCol="features", seed=0)
    # Loop-local names are used so the notebook-level `lda`, `pipeline` and
    # `fitted_model` keep pointing at the model analysed in the cells above.
    pipeline_k = Pipeline(stages=[vectorizer, idf, lda_k])
    print('Model training')
    fitted_k = train_model(pipeline_k)
    print('Done')

    train_transformed_k = fitted_k.transform(train_set)
    test_transformed_k = fitted_k.transform(test_set)

    # logPerplexity lives on the fitted LDA stage, not on the PipelineModel.
    lda_model_k = fitted_k.stages[-1]
    train_lp = lda_model_k.logPerplexity(train_transformed_k)
    test_lp = lda_model_k.logPerplexity(test_transformed_k)
    print('train log perplexity: {} | test log perplexity: {}'.format(train_lp, test_lp))

    results[num_topics] = {
        'train_log_perplexity': train_lp,
        'test_log_perplexity': test_lp,
    }

results
    
    

### 5. Define a function to evaluate the model

The function takes a fitted LDA model as parameter, evaluates it on the train and test splits, and then returns the train and test performance. The metric used is the log perplexity bound (lower is better).

In [57]:
# Function to evaluate a fitted LDA model on the train and test splits
def evaluate_model(fitted_model, data_transformed=None):
    """Compute the log-perplexity bound of a fitted model on two datasets.

    Parameters
    ----------
    fitted_model
        Object exposing ``logPerplexity(dataset)``, i.e. the fitted LDA stage
        (``pipeline_model.stages[-1]``), not the whole PipelineModel.
    data_transformed
        Two-element sequence ``[train_data, test_data]`` of datasets already
        transformed into the model's feature column. When omitted, the
        notebook-level ``train_set_transformed`` and ``test_set_transformed``
        are looked up at call time, so the default no longer freezes whatever
        those names referred to when the function was defined.

    Returns
    -------
    tuple
        ``(train_lp, test_lp)`` as returned by ``logPerplexity``.
    """
    if data_transformed is None:
        data_transformed = [train_set_transformed, test_set_transformed]
    train_data, test_data = data_transformed[0], data_transformed[1]

    print('Evaluating the model on training set')
    train_lp = fitted_model.logPerplexity(train_data)

    print('Evaluating the model on test set')
    test_lp = fitted_model.logPerplexity(test_data)

    print("The upper bound on perplexity for train set: " + str(train_lp))
    print("The upper bound on perplexity for test set: " + str(test_lp))
    return train_lp, test_lp

In [58]:
train_lp,test_lp=evaluate_model(fitted_model.stages[-1])
train_lp,test_lp

Evaluating the model on training set


24/06/06 07:44:58 WARN DAGScheduler: Broadcasting large task binary with size 1984.8 KiB
24/06/06 07:45:59 WARN DAGScheduler: Broadcasting large task binary with size 1985.5 KiB
24/06/06 07:46:52 WARN DAGScheduler: Broadcasting large task binary with size 1985.5 KiB
                                                                                

Evaluating the model on test set


24/06/06 08:12:36 WARN DAGScheduler: Broadcasting large task binary with size 1984.8 KiB
24/06/06 08:12:56 WARN DAGScheduler: Broadcasting large task binary with size 1985.5 KiB
                                                                                

The upper bound on perplexity for train set: 9.078096971523154
The upper bound on perplexity for test set: 9.135707712569403


(9.078096971523154, 9.135707712569403)

In [None]:
def 

In [None]:
#[20, 25, 30, 35, 40, 45, 50]

### 6. Create a function which takes pipelines and train the models, evaluate them and then return the results

We remark that
- Naive Bayes
- Logistic regression

We can then conclude that:
- The two models achieve good performance on both the training and test sets.
- The Logistic regression model outperforms the Naive Bayes model.

In the next section, we will tune the hyperparameters of the Logistic regression model to find the best ones.

## VI- Logistic regression hyperparameters tuning

### 1. Pipeline creation

In [None]:
# Base estimator for the grid search. It reads the IDF output column and the
# label column that the evaluator below also uses.
# NOTE(review): `lr` and `hashingTF` were not defined anywhere in this
# notebook; the definitions here use default settings - confirm they match
# the intended configuration.
lr = LogisticRegression(featuresCol="features", labelCol="category_label")

# Term-frequency stage feeding the existing `idf` stage (which reads "raw_features").
hashingTF = HashingTF(inputCol="description_filtered", outputCol="raw_features")

# Define parameter grids for Logistic regression grid search
reg_values = np.logspace(-4, 4, num=100)
# Candidate elastic-net mixing ratios; not part of the grid yet.
l1_ratios = np.linspace(0, 1, num=10)

# The grid must start from a fresh ParamGridBuilder: the previous code called
# `.addGrid` on `paramGrid_lr` before that name existed.
paramGrid_lr = (
    ParamGridBuilder()
    .addGrid(lr.regParam, reg_values.tolist())
    .build()
)

# Create Cross-validation for Logistic Regression
cv_lr = CrossValidator(estimator=lr, estimatorParamMaps=paramGrid_lr,
                        evaluator=MulticlassClassificationEvaluator(labelCol="category_label", predictionCol="prediction", metricName="accuracy"),
                        numFolds=3, parallelism=1)


# Create pipeline for Logistic Regression
pipeline_lr = Pipeline(stages=[hashingTF, idf, cv_lr])

pipeline_lr

### 2. Hyperparameters tuning

In [None]:
# Run the grid search through the helper and show what it returns.
# NOTE(review): `train_and_evaluate_models` is not defined in any cell of this
# notebook as shown; this cell raises NameError on a fresh kernel until the
# helper is defined or imported.
results=train_and_evaluate_models(model_pipelines=[pipeline_lr],model_names=["Logistic Regression"])
results

### 3. Get the best parameters

In [None]:
# Fitted pipeline produced by the tuning step above.
fitted_model = results['fitted_model']

# The last pipeline stage is the fitted CrossValidator; `bestModel` is the
# model trained with the winning parameter combination.
best_model = fitted_model.stages[-1].bestModel

# Print the best parameters (plain string: there is nothing to interpolate).
print("Best parameters for Logistic regression:")

for param, value in best_model.extractParamMap().items():
    print(f"  {param.name}: {value}")

### 4. Save the best model

In [None]:
# Persist the best model. Going through the writer with overwrite() lets the
# notebook be re-run: a plain save() fails when the output path already exists.
best_model.write().overwrite().save('output/news_categorization_model')

24/06/04 20:03:43 WARN TaskSetManager: Stage 216 contains a task of very large size (33450 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

## VII- Summary

In this notebook we have studied two models for our news categorization task: Naive Bayes and Logistic regression.

 Our study reveals that the Logistic regression was the one with best performance.

 Then we tuned the Logistic regression hyperparameters using grid search and found the best model, which we saved.

 The next step of our work will be to ...

In [None]:
#df.unpersist()