# News Topic modeling


## I- Modules import

In [2]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.feature import  IDF, HashingTF,CountVectorizer
from pyspark.ml import  Pipeline
from math import ceil,log2
from pyspark.ml.classification import LogisticRegression,NaiveBayes,LogisticRegressionModel
from pyspark.sql.functions import col,explode,split

import numpy as np
from pyspark.ml.clustering import LDA

## II- Spark context and session creation

In [34]:
spark = (SparkSession.builder
    .master("spark://node02:7077")
    .appName("NewsTopicModeling")
    .getOrCreate()
        )
spark

## III- Dataframe preparing

### 1. Load the data

In [35]:
# Load data
df = spark.read.parquet("input/news.parquet", header=True, inferSchema=True)

[Stage 143:>              (0 + 10) / 10][Stage 269:>              (0 + 10) / 10]

### 2. Partition and cache the dataframe

In [36]:
df.rdd.getNumPartitions()

9

In [37]:
num_partitions=4*20
df= df.repartition(num_partitions).cache()

In [38]:
df.rdd.getNumPartitions()

[Stage 143:>              (0 + 10) / 10][Stage 269:>              (0 + 10) / 10]

80

### 3. Preview the data

In [39]:
df.count()

                                                                                

1716608

In [40]:
df.show()

+--------------+--------------------+
|category_label|description_filtered|
+--------------+--------------------+
|           4.0|covid19 wrap worl...|
|           2.0|research reveals ...|
|           2.0|annastacia palasz...|
|           3.0|growing city hall...|
|           4.0|million philippin...|
|           4.0|youngest assailan...|
|           2.0|new threatened sp...|
|           2.0|polluting elite r...|
|           2.0|ed miliband shado...|
|           3.0|obama pledge stop...|
|           4.0|san aotea ii crew...|
|           3.0|russia appears de...|
|           4.0|bharat transport ...|
|           2.0|restore trust bla...|
|           3.0|huffpollster batt...|
|           3.0|send united incid...|
|           2.0|            20230928|
|           4.0|america benefit a...|
|           3.0|oklahoma approves...|
|           2.0|hear podcast wild...|
+--------------+--------------------+
only showing top 20 rows



In [41]:
df.printSchema()

root
 |-- category_label: double (nullable = true)
 |-- description_filtered: string (nullable = true)



[Stage 143:>              (0 + 10) / 10][Stage 269:>              (0 + 10) / 10]

### 4. Convert filtered descriptions to arrays

In [42]:
# Create a new DataFrame with description_filtered as arrays
df= df.withColumn('description_filtered', split(col('description_filtered'), ' '))
# Show the new DataFrame
df.show(truncate=False)

+--------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|category_label|description_filtered                                                                                                                                                                                                                                                                                                                                                                       |
+--------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

[Stage 143:>              (0 + 10) / 10][Stage 269:>              (0 + 10) / 10]

## IV- Feature Engineering


### 1. Explode the filtered descriptions to get the words

In [43]:
exploded_df=df.select(explode(df.description_filtered)).alias('words')
exploded_df.show()

+-----------+
|        col|
+-----------+
|    covid19|
|       wrap|
|      world|
|     reacts|
|     russia|
|    vaccine|
|        new|
|       case|
|      spell|
|uncertainty|
|   election|
|         nz|
|   research|
|    reveals|
|  lightning|
|     strike|
|     arctic|
|     region|
|    tripled|
|     summer|
+-----------+
only showing top 20 rows



                                                                                

In [44]:
#df=df.unpersist()

[Stage 143:>              (0 + 10) / 10][Stage 269:>              (0 + 10) / 10]

### 2. Get unique words in the filtered_description

In [45]:
unique_words=exploded_df.distinct()

### 3. Cache and show the unique words dataframe

In [46]:
unique_words=unique_words.cache()
unique_words.show()

[Stage 143:>              (0 + 10) / 10][Stage 269:>              (0 + 10) / 10]

+--------------+
|           col|
+--------------+
|          earl|
|       barrier|
|         still|
|          hope|
|       hydrate|
|         oscar|
|        filing|
|       carreys|
|        patton|
|    stateowned|
|        online|
|     soundness|
|   transaction|
|       gizelle|
|    indigenous|
|     involving|
|         inner|
|pacificcalling|
|        wields|
|           art|
+--------------+
only showing top 20 rows



                                                                                

### 4. Get the vocabulary size

In [47]:
vocabulary_size=unique_words.count()
vocabulary_size

                                                                                

128622

[Stage 143:>              (0 + 10) / 10][Stage 269:>              (0 + 10) / 10]

### 7. Get the number of features for HashingTF

In [29]:
num_features=2**n
num_features

131072

### 5. Define the CountVectorizer

In [89]:
cv = CountVectorizer(inputCol="description_filtered", outputCol="features", vocabSize=vocabulary_size, minDF=3.0)
# train the model
cv_model = cv.fit(df)
# transform the data. Output column name will be features.
vectorized_tokens = cv_model.transform(df)
vectorized_tokens.show(truncate=False)

24/06/06 00:42:39 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:42:39 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:42:39 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:42:39 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:42:40 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:42:40 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:42:41 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:42:41 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:42:41 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:42:41 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:42:42 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:42:42 WARN DAGScheduler: Broadcasting larg

+--------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|category_label|description_filtered                                                                                                                                                                                                    

24/06/06 00:42:43 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:42:43 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:42:43 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:42:43 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:42:44 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:42:44 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:42:44 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:42:45 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:42:45 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:42:45 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:42:46 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:42:46 WARN DAGScheduler: Broadcasting larg

## V- Models set up, training and evaluation

### 1. Set up LDA model

In [94]:
#num_topics = 20
#lda = LDA(k=num_topics, maxIter=10)
lda = LDA(seed=0)

### 2. Set up pipelines

We will  set up and returns  the pipelines of the following transformations for Native Bayes and Linear reggression

- HashingTF
- IDF
- 3-Fold Cross-validation  without grid search

In [76]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import CountVectorizer, IDF
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.clustering import LDA

# Assuming you have a DataFrame `data` with a column named "text" containing text documents

# Configure an ML pipeline, which consists of several stages: vectorization, LDA, and evaluation
vectorizer = CountVectorizer(inputCol="description_filtered", outputCol="raw_features")
idf = IDF(inputCol="raw_features", outputCol="features")
lda = LDA(featuresCol="features")


# Define parameter grid for cross-validation
param_grid = (ParamGridBuilder() 
    .addGrid(lda.k, [3, 5, 7]) # Define different numbers of topics to try
    .addGrid(lda.maxIter, [10, 20, 30])   # Define different numbers of iterations
    .build())

# Create cross-validator
crossval = CrossValidator(estimator=lda,
                          estimatorParamMaps=param_grid,
                          evaluator=evaluator,
                          numFolds=3)  

# Create pipeline for LDA
pipeline = Pipeline(stages=[vectorizer, idf, crossval])

pipeline

Pipeline_e2fc5f35ee3e

### 3. Split the data

First of all let us split the data into train and test set: 80% for train and 20% for test

In [77]:
# Split data
(train_set, test_set) = df.randomSplit([0.8, 0.2], seed=0)

### 4. Create a function for model training

Let us create a function which takes as argument a model that it trains and then returns the trained model.

In [78]:
def train_model(model):    
    return model.fit(train_set)

Exception in thread "serve RDD 622" java.net.SocketTimeoutException: Accept timed out
	at java.base/java.net.PlainSocketImpl.socketAccept(Native Method)
	at java.base/java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:458)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:551)
	at java.base/java.net.ServerSocket.accept(ServerSocket.java:519)
	at org.apache.spark.security.SocketAuthServer$$anon$1.run(SocketAuthServer.scala:65)


In [95]:
num_topics = 3
lda = LDA(k=num_topics, maxIter=10)
model = lda.fit(vectorized_tokens)

                                                                                

In [92]:
ll = model.logLikelihood(vectorized_tokens)
lp = model.logPerplexity(vectorized_tokens)
print("The lower bound on the log likelihood of the entire corpus: " + str(ll))
print("The upper bound on perplexity: " + str(lp))

ERROR:root:KeyboardInterrupt while sending command.>             (60 + 20) / 80]
Traceback (most recent call last):
  File "/home/gbetoho.adede/.local/lib/python3.8/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/gbetoho.adede/.local/lib/python3.8/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/home/team1337/.local/easybuild_new/software/Python/3.8.2-GCCcore-9.3.0/lib/python3.8/socket.py", line 669, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [96]:
# extract vocabulary from CountVectorizer
vocab = cv_model.vocabulary
topics = model.describeTopics()
topics_rdd = topics.rdd
topics_words = topics_rdd\
       .map(lambda row: row['termIndices'])\
       .map(lambda idx_list: [vocab[idx] for idx in idx_list])\
       .collect()
for idx, topic in enumerate(topics_words):
    print("topic: {}".format(idx))
    print("*"*25)
    for word in topic:
       print(word)
    print("*"*25)

topic: 0
*************************
photo
state
new
trump
10
best
day
united
u
year
*************************
topic: 1
*************************
woman
video
nt
say
new
make
time
photo
get
world
*************************
topic: 2
*************************
new
covid19
nt
trump
day
coronavirus
bank
school
5
woman
*************************


In [101]:
new_data_topics = model.transform(vectorized_tokens)

# Show the resulting DataFrame with topic distributions
new_data_topics.select("description_filtered" ,"topicDistribution").show(truncate=False)

24/06/06 01:17:16 WARN DAGScheduler: Broadcasting large task binary with size 2.7 MiB


+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------+
|description_filtered                                                                                                                                                                                                                                                                                                                                                                       |topicDistribution                                             |
+-------------------------------------------------------------------------------------------------------------

### 5. Define a function to evaluate the model

The function takes as parameter a fitted model, evaluates the model on train and test split and then return the train and test performance. The accuracy is the metric used.

In [86]:

# Custom evaluator for LDA using log-likelihood and perplexity
class LDAMetricsEvaluator:
    def __init__(self, featuresCol="features"):
        self.featuresCol = featuresCol

    def evaluate(self, model, dataset):
        log_likelihood = model.logLikelihood(dataset)
        perplexity = model.logPerplexity(dataset)
        # For simplicity, we can use negative log-likelihood as the evaluation metric
        return -log_likelihood
    

evaluator = LDAMetricsEvaluator(featuresCol="features")

# Function to evaluate model and get best parameters
def evaluate_model(fitted_model):

    print('Making predictions on the training set')

    train_predictions = fitted_model.transform(train_set)

    print('Making predictions on the test set')
    test_predictions = fitted_model.transform(test_set)

    print('Evaluating the model on training set')
    train_accuracy = evaluator.evaluate(train_predictions)

    print('Evaluating the model on test set')
    test_accuracy = evaluator.evaluate(test_predictions)
    return train_accuracy, test_accuracy

24/06/06 00:37:41 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:37:41 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:37:41 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:37:41 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:37:42 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:37:42 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:37:42 WARN OnlineLDAOptimizer: The input data is not directly cached, which may hurt performance if its parent RDDs are also uncached.
24/06/06 00:37:42 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:37:42 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:37:42 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:37:43 WARN DAGScheduler: Broadcasting large task binary with size 3

### 6. Create a function which takes pipelines and train the models, evaluate them and then return the results

In [87]:
def train_and_evaluate_model(model_pipeline,model_name="LDA"):


    print(f"Training {model_name} model")

    # Fit the model pipeline to the training set
    #fitted_model = model_pipeline.fit(train_set)
    fitted_model = train_model(model_pipeline)

    print("Done")
    print(f"Evaluating {model_name} model")

    # Evaluate the fitted model
    train_accuracy, test_accuracy = evaluate_model(fitted_model,model_name)
    print("Done")
    
    # Store the results
    results = {
            'model_name': model_name,
            'fitted_model': fitted_model,
            "train_accuracy": train_accuracy,
            "test_accuracy": test_accuracy
        }

    

    return results

24/06/06 00:37:44 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:37:44 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:37:44 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:37:44 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:37:44 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB


In [88]:
results = train_and_evaluate_model(pipeline)

Training LDA model


24/06/06 00:37:45 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:37:45 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:37:45 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:37:45 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:37:46 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:37:46 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:37:46 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:37:46 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:37:47 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:37:47 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:37:48 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:37:48 WARN DAGScheduler: Broadcasting larg

24/06/06 00:38:05 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:38:05 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:38:05 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:38:06 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:38:06 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:38:06 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:38:06 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:38:06 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:38:07 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:38:07 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:38:07 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:38:07 WARN DAGScheduler: Broadcasting larg

IllegalArgumentException: prediction does not exist. Available: category_label, description_filtered, raw_features, features, CrossValidator_ffc12df31bb4_rand, topicDistribution

24/06/06 00:38:14 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:38:14 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:38:14 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:38:14 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:38:14 WARN OnlineLDAOptimizer: The input data is not directly cached, which may hurt performance if its parent RDDs are also uncached.
24/06/06 00:38:14 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:38:14 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:38:15 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:38:15 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:38:15 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:38:15 WARN DAGScheduler: Broadcasting large task binary with size 3

24/06/06 00:38:34 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:38:34 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:38:34 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:38:35 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:38:35 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:38:35 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:38:35 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:38:36 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:38:36 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:38:36 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:38:36 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:38:37 WARN DAGScheduler: Broadcasting larg

24/06/06 00:38:55 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:38:55 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:38:55 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:38:56 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:38:56 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:38:56 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:38:56 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:38:56 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:38:57 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:38:57 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:38:57 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:38:57 WARN DAGScheduler: Broadcasting larg

24/06/06 00:39:16 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:39:16 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:39:16 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:39:16 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:39:17 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:39:17 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:39:17 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:39:17 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:39:17 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:39:17 WARN OnlineLDAOptimizer: The input data is not directly cached, which may hurt performance if its parent RDDs are also uncached.
24/06/06 00:39:18 WARN DAGScheduler: Broadcasting large task binary with size 3

24/06/06 00:39:37 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:39:37 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:39:37 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:39:38 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:39:38 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:39:38 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:39:38 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:39:38 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:39:38 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:39:39 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:39:39 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:39:39 WARN DAGScheduler: Broadcasting larg

24/06/06 00:39:59 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:39:59 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:39:59 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:40:00 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:40:00 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:40:00 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:40:01 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:40:01 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:40:01 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:40:01 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:40:02 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:40:02 WARN DAGScheduler: Broadcasting larg

24/06/06 00:40:23 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:40:23 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:40:23 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:40:23 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:40:24 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:40:24 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:40:24 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:40:25 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:40:25 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:40:25 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:40:25 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:40:26 WARN DAGScheduler: Broadcasting larg

24/06/06 00:40:49 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:40:49 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:40:49 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:40:49 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:40:50 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:40:50 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:40:50 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:40:51 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:40:51 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:40:51 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:40:51 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:40:52 WARN DAGScheduler: Broadcasting larg

24/06/06 00:41:13 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:41:13 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:41:14 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:41:14 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:41:14 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:41:15 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:41:15 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:41:15 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:41:15 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:41:15 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:41:16 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:41:16 WARN DAGScheduler: Broadcasting larg

24/06/06 00:41:39 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:41:39 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:41:39 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:41:40 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:41:40 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:41:40 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:41:40 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:41:41 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:41:41 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:41:41 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:41:41 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:41:42 WARN DAGScheduler: Broadcasting larg

24/06/06 00:42:06 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:42:07 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:42:07 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:42:07 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:42:08 WARN BlockManager: Asked to remove block broadcast_2700_piece0, which does not exist
24/06/06 00:42:08 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:42:08 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:42:08 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:42:09 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:42:09 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:42:09 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:42:10 WARN DAGScheduler: 

24/06/06 00:42:34 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:42:34 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:42:35 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:42:35 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:42:35 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:42:35 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:42:36 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:42:36 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:42:36 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:42:37 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:42:37 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/06/06 00:42:37 WARN DAGScheduler: Broadcasting larg

In [None]:
results[0]

In [None]:
results[1]

We remark that
- Naive Bayes
- Logistic regression

We can then conclude that t
- he two models set a good performance on both training and test set.
- The Logistic regression models outperforms the Naive Bayes model

In the next section, we will tune the parameters of the Naive bayes to get the best parameters.

## VI- Logistic regression hyperparameters tuning

### 1. Pipeline creation

In [None]:
# Define parameter grids for Logistic regresion grid search
reg_values = np.logspace(-4, 4, num=100)
l1_ratios = np.linspace(0, 1, num=10)

paramGrid_lr=paramGrid_lr.addGrid(lr.regParam, reg_values).build()

# Create Cross-validation for Logistic Regression
cv_lr = CrossValidator(estimator=lr, estimatorParamMaps=paramGrid_lr,
                        evaluator=MulticlassClassificationEvaluator(labelCol="category_label", predictionCol="prediction", metricName="accuracy"),
                        numFolds=3, parallelism=1)


# Create pipeline for Logistic Regression
pipeline_lr = Pipeline(stages=[hashingTF, idf, cv_lr])

pipeline_lr

### 2. Hyperparameters tuning

In [None]:
results=train_and_evaluate_models(model_pipelines=[pipeline_lr],model_names=["Logistic Regression"])
results

### 3. Get the best parameters

In [None]:
fitted_model=results['fitted_model']

# Get the best model
best_model = fitted_model.stages[-1].bestModel

# Print the best parameters
print(f"Best parameters for Logistic regression:")

for param, value in best_model.extractParamMap().items():
     print(f"  {param.name}: {value}")

### 4. Save the best model

In [None]:
best_model.save('output/news_categorization_model')

24/06/04 20:03:43 WARN TaskSetManager: Stage 216 contains a task of very large size (33450 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

## VII- Summary

In this notebook we have studied two models for our news categorization task. There are Naive Bayes and Logistic regression.

 Our study reveals that the Logistic regression was the one with best performance.

 Then we tunned the Logistic regression hyperparameters using grid search and then we find the best model that we save.

 The next step of our work will be to ...

In [None]:
#df.unpersist()