# Ropic modeling


In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=982bf4fa037f9d8515620684ab443ef327d1d7cdb7b285f940cbe5469e831b84
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


## I- Modules import

In [30]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.feature import  IDF, HashingTF,CountVectorizer
from pyspark.ml import  Pipeline
from math import ceil,log2
from pyspark.ml.classification import LogisticRegression,NaiveBayes,LogisticRegressionModel
from pyspark.sql.functions import col,explode,split

import numpy as np
from pyspark.ml.clustering import LDA

# Import Spark NLP
#from sparknlp.base import *
#from sparknlp.annotator import *
#from sparknlp.pretrained import PretrainedPipeline
#import sparknlp
#from pyspark.ml.feature import CountVectorizer


## II- Spark context and session creation

In [7]:
spark = (SparkSession.builder
    #.master("spark://node02:7077")
    .appName("TopicModeling")
    #.config('spark.driver.cores','4')
    #.config("spark.executorEnv.LD_LIBRARY_PATH", "/home/team1337/.local/easybuild_new/software/Python/3.8.6-GCCcore-10.2.0/lib:/home/team1337/.local/easybuild_new/software/libffi/3.3-GCCcore-10.2.0/lib64:/home/team1337/.local/easybuild_new/software/GMP/6.2.0-GCCcore-10.2.0/lib:/home/team1337/.local/easybuild_new/software/XZ/5.2.5-GCCcore-10.2.0/lib:/home/team1337/.local/easybuild_new/software/SQLite/3.33.0-GCCcore-10.2.0/lib:/home/team1337/.local/easybuild_new/software/Tcl/8.6.10-GCCcore-10.2.0/lib:/home/team1337/.local/easybuild_new/software/libreadline/8.0-GCCcore-10.2.0/lib:/home/team1337/.local/easybuild_new/software/ncurses/6.2-GCCcore-10.2.0/lib:/home/team1337/.local/easybuild_new/software/bzip2/1.0.8-GCCcore-10.2.0/lib:/home/team1337/.local/easybuild_new/software/binutils/2.35-GCCcore-10.2.0/lib:/home/team1337/.local/easybuild_new/software/zlib/1.2.11-GCCcore-10.2.0/lib:/home/team1337/.local/easybuild_new/software/GCCcore/10.2.0/lib64") \
    #.config("spark.pyspark.python", "/home/team1337/.local/easybuild_new/software/Python/3.8.6-GCCcore-10.2.0/bin/python3") \
    .getOrCreate()
        )
spark

## III- Dataframe preparing

### 1. Load the data

In [12]:
# Load data
df = spark.read.parquet("input/news.parquet", header=True, inferSchema=True)

### 2. Partition and cache the dataframe

In [13]:
df.rdd.getNumPartitions()

2

In [14]:
num_partitions=5*2
df= df.repartition(num_partitions).cache()

In [15]:
df.rdd.getNumPartitions()

10

### 3. Preview the data

In [16]:
df.count()

1716608

In [17]:
df.show()

+--------------+--------------------+
|category_label|description_filtered|
+--------------+--------------------+
|          11.0|thirst game inspi...|
|          11.0|giant time herman...|
|           7.0|iphone 12 series ...|
|           8.0|8 new covid19 sli...|
|           2.0|country largest t...|
|          11.0|cambodia heartbre...|
|          15.0|foxiness day crea...|
|          11.0|world best summer...|
|           4.0|elon musk claim p...|
|          11.0|vine bridge nihon...|
|          10.0|raf herbert simon...|
|           9.0|5 steer setting b...|
|           6.0|weird food allerg...|
|          13.0|indigo girl exist...|
|           1.0|instagram launch ...|
|           8.0|provisional mansl...|
|          11.0|amsterdam evolvin...|
|           3.0|cost democracy lo...|
|           3.0|white house take ...|
|           3.0|donjon cuba surfa...|
+--------------+--------------------+
only showing top 20 rows



In [18]:
df.printSchema()

root
 |-- category_label: double (nullable = true)
 |-- description_filtered: string (nullable = true)



### 4. Convert filtered descriptions to arrays

In [19]:
# Create a new DataFrame with description_filtered as arrays
df= df.withColumn('description_filtered', split(col('description_filtered'), ' '))
# Show the new DataFrame
df.show(truncate=False)

+--------------+------------------------------------------------------------------------------------------------------+
|category_label|description_filtered                                                                                  |
+--------------+------------------------------------------------------------------------------------------------------+
|11.0          |[thirst, game, inspired, hotel, lionise, new, film]                                                   |
|11.0          |[giant, time, hermanus, due, south, africa]                                                           |
|7.0           |[iphone, 12, series, reportedly, support, beidou, navigation, news]                                   |
|8.0           |[8, new, covid19, slip, elgin, oxford, 3, middlesexlondon, mon]                                       |
|2.0           |[country, largest, tree, kauri, threatened, dieback, climate, change, hope, revered, specie]          |
|11.0          |[cambodia, heartbreaking

## IV- Feature Engineering


### 1. Explode the filtered descriptions to get the words

In [20]:
exploded_df=df.select(explode(df.description_filtered)).alias('words')
exploded_df.show()

+----------+
|       col|
+----------+
|    thirst|
|      game|
|  inspired|
|     hotel|
|   lionise|
|       new|
|      film|
|     giant|
|      time|
|  hermanus|
|       due|
|     south|
|    africa|
|    iphone|
|        12|
|    series|
|reportedly|
|   support|
|    beidou|
|navigation|
+----------+
only showing top 20 rows



In [21]:
#df=df.unpersist()

### 2. Get unique words in the filtered_description

In [22]:
unique_words=exploded_df.distinct()

### 3. Cache and show the unique words dataframe

In [23]:
unique_words=unique_words.cache()
unique_words.show()

+-----------+
|        col|
+-----------+
|       hope|
|     travel|
|  traveling|
|      still|
|     outfit|
|     laxity|
|        art|
|requirement|
|      oscar|
|      mammy|
|       pant|
| indigenous|
|    melodic|
| rejuvenate|
|     online|
|     lamott|
|  connected|
|   cautious|
|      crest|
|     monish|
+-----------+
only showing top 20 rows



### 4. Get the vocabulary size

In [24]:
vocabulary_size=unique_words.count()
vocabulary_size

128622

### 5. Unpersit the unique words dataframe(not needed anymore)

In [25]:
unique_words=unique_words.unpersist()

### 6. Get the smallest `n` such that $2^n$ is greater than `vocabulary_size`

In [28]:
#num_features=log2(vocabulary_size)


In [27]:
n=ceil(log2(vocabulary_size))
n

17

### 7. Get the number of features for HashingTF

In [29]:
num_features=2**n
num_features

131072

### 8. Define the HashingTF and IDF stages

In [None]:
# Define the HashingTF and IDF stages
#hashingTF = HashingTF(inputCol="description_filtered", outputCol="rawFeatures", numFeatures=num_features)
#idf = IDF(inputCol="rawFeatures", outputCol="features")

In [37]:
cv = CountVectorizer(inputCol="description_filtered", outputCol="features", vocabSize=80, minDF=3.0)
# train the model
cv_model = cv.fit(df)
# transform the data. Output column name will be features.
vectorized_tokens = cv_model.transform(df.limit(100))
vectorized_tokens.show(truncate=False)

+--------------+------------------------------------------------------------------------------------------------------+-----------------------------+
|category_label|description_filtered                                                                                  |features                     |
+--------------+------------------------------------------------------------------------------------------------------+-----------------------------+
|11.0          |[thirst, game, inspired, hotel, lionise, new, film]                                                   |(80,[0],[1.0])               |
|11.0          |[giant, time, hermanus, due, south, africa]                                                           |(80,[17],[1.0])              |
|7.0           |[iphone, 12, series, reportedly, support, beidou, navigation, news]                                   |(80,[53],[1.0])              |
|8.0           |[8, new, covid19, slip, elgin, oxford, 3, middlesexlondon, mon]                     

## V- Models set up, training and evaluation

### 1. Set up Naive and Logistic regression classifiers

In [None]:
# Define the classifiers

# Logistic regression classifier
lr = LogisticRegression(labelCol="category_label", featuresCol="features")

# Naive Bayes classifier
nb = NaiveBayes(labelCol="category_label", featuresCol="features")

### 2. Set up pipelines

We will  set up and returns  the pipelines of the following transformations for Native Bayes and Linear reggression

- HashingTF
- IDF
- 3-Fold Cross-validation  without grid search

In [None]:
# Define parameter grids
paramGrid_nb = (ParamGridBuilder()
        .addGrid(nb.smoothing, [0.5, 1.0, 2.0])
        .build())

paramGrid_nb=paramGrid_lr=ParamGridBuilder().build()

# Create cross validators

# Cross-validation for Naive Bayes
cv_nb = CrossValidator(estimator=nb, estimatorParamMaps=paramGrid_nb,
                        evaluator=MulticlassClassificationEvaluator(labelCol="category_label", predictionCol="prediction", metricName="accuracy"),
                        numFolds=3, parallelism=1)
# Cross-validation for Logistic Regression
cv_lr = CrossValidator(estimator=lr, estimatorParamMaps=paramGrid_lr,
                        evaluator=MulticlassClassificationEvaluator(labelCol="category_label", predictionCol="prediction", metricName="accuracy"),
                        numFolds=3, parallelism=1)


# Create pipelines
# Pipeline for Naive Bayes
pipeline_nb = Pipeline(stages=[hashingTF, idf, cv_nb])
# Pipeline for Logistic Regression
pipeline_lr = Pipeline(stages=[hashingTF, idf, cv_lr])
model_pipelines=pipeline_nb, pipeline_lr


### 3. Split the data

First of all let us split the data into train and test set: 80% for train and 20% for test

In [None]:
# Split data
(train_set, test_set) = df.randomSplit([0.8, 0.2], seed=0)

### 4. Create a function for model training

Let us create a function which takes as argument a model that it trains and then returns the trained model.

In [42]:
num_topics = 20
lda = LDA(k=num_topics, maxIter=10)
model = lda.fit(vectorized_tokens)
ll = model.logLikelihood(vectorized_tokens)
lp = model.logPerplexity(vectorized_tokens)
print("The lower bound on the log likelihood of the entire corpus: " + str(ll))
print("The upper bound on perplexity: " + str(lp))

The lower bound on the log likelihood of the entire corpus: -3057.1994367408975
The upper bound on perplexity: 26.129909715734165


In [43]:
# extract vocabulary from CountVectorizer
vocab = cv_model.vocabulary
topics = model.describeTopics()
topics_rdd = topics.rdd
topics_words = topics_rdd\
       .map(lambda row: row['termIndices'])\
       .map(lambda idx_list: [vocab[idx] for idx in idx_list])\
       .collect()
for idx, topic in enumerate(topics_words):
    print("topic: {}".format(idx))
    print("*"*25)
    for word in topic:
       print(word)
    print("*"*25)

topic: 0
*************************
kid
state
love
picture
news
wa
marriage
week
video
police
*************************
topic: 1
*************************
republic
video
black
bank
marriage
help
family
week
wa
coronavirus
*************************
topic: 2
*************************
ha
college
5
covid19
united
need
thing
state
death
news
*************************
topic: 3
*************************
take
white
house
nt
find
donald
call
right
party
help
*************************
topic: 4
*************************
american
get
best
person
help
may
first
video
human
number
*************************
topic: 5
*************************
look
like
get
american
human
thing
video
right
want
year
*************************
topic: 6
*************************
7
ha
party
top
people
america
family
nt
kid
donald
*************************
topic: 7
*************************
say
wa
look
school
health
top
report
child
world
woman
*************************
topic: 8
*************************
people
may
marriage


### 5. Define a function to evaluate the model

The function takes as parameter a fitted model, evaluates the model on train and test split and then return the train and test performance. The accuracy is the metric used.

In [None]:
# Initialize the evaluator
evaluator = MulticlassClassificationEvaluator(labelCol="category_label", predictionCol="prediction", metricName="accuracy")

# Function to evaluate model and get best parameters
def evaluate_model(fitted_model):

    print('Making predictions on the training set')

    train_predictions = fitted_model.transform(train_set)

    print('Making predictions on the test set')
    test_predictions = fitted_model.transform(test_set)

    print('Evaluating the model on training set')
    train_accuracy = evaluator.evaluate(train_predictions)

    print('Evaluating the model on test set')
    test_accuracy = evaluator.evaluate(test_predictions)
    return train_accuracy, test_accuracy

In [None]:
# Function to evaluate model and get best parameters
def evaluate_model(model, model_name):
    print('Training the model')

    # Train the model using cross-validation
    fitted_model = model.fit(train_set)

    # Get the best model from cross-validation
    best_model = fitted_model.stages[-1].bestModel

    print('Making predictions on the training set')
    # Make predictions on the training set
    train_predictions = fitted_model.transform(train_set)

    print('Making predictions on the test set')
    # Make predictions on the test set
    test_predictions = fitted_model.transform(test_set)

    # Initialize the evaluator
    evaluator = MulticlassClassificationEvaluator(labelCol="category_label", predictionCol="prediction", metricName="accuracy")

    print('Evaluating the model on training set')
    # Evaluate the model on the training set
    train_accuracy = evaluator.evaluate(train_predictions)

    print('Evaluating the model on test set')
    # Evaluate the model on the test set
    test_accuracy = evaluator.evaluate(test_predictions)

    print(f"{model_name} Train Accuracy: {train_accuracy}")
    print(f"{model_name} Test Accuracy: {test_accuracy}")

    # Print the best parameters
    print(f"Best parameters for {model_name}:")

    for param, value in best_model.extractParamMap().items():
        print(f"  {param.name}: {value}")

    return train_accuracy, test_accuracy,best_model

### 6. Create a function which takes pipelines and train the models, evaluate them and then return the results

In [None]:
def train_and_evaluate_models(model_pipelines,model_names=["Naive Bayes", "Logistic Regression"]):

    # Initialize the results dictionary
    results = {}

    # Loop over the indices and model names simultaneously
    for idx, (model_pipeline, model_name) in enumerate(zip(model_pipelines, model_names)):
        print(f"Training {model_name} model")

        # Fit the model pipeline to the training set
        fitted_model = model_pipeline.fit(train_set)

        print("Done")
        print(f"Evaluating {model_name} model")

        # Evaluate the fitted model
        train_accuracy, test_accuracy, best_model = evaluate_model(fitted_model)

        # Store the results
        results[idx] = {
            'model_name': model_name,
            #'pipeline': model_pipeline,
            'fitted_model': fitted_model,
            "train_accuracy": train_accuracy}

        if len(model_name)==0:
            results=results[0]

    return results

### 5. Call the function and interpret the results

In [None]:
results = train_and_evaluate_models()

In [None]:
results[0]

In [None]:
results[1]

We remark that
- Naive Bayes
- Logistic regression

We can then conclude that t
- he two models set a good performance on both training and test set.
- The Logistic regression models outperforms the Naive Bayes model

In the next section, we will tune the parameters of the Naive bayes to get the best parameters.

## VI- Logistic regression hyperparameters tuning

### 1. Pipeline creation

In [None]:
# Define parameter grids for Logistic regresion grid search
reg_values = np.logspace(-4, 4, num=100)
l1_ratios = np.linspace(0, 1, num=10)

paramGrid_lr=paramGrid_lr.addGrid(lr.regParam, reg_values).build()

# Create Cross-validation for Logistic Regression
cv_lr = CrossValidator(estimator=lr, estimatorParamMaps=paramGrid_lr,
                        evaluator=MulticlassClassificationEvaluator(labelCol="category_label", predictionCol="prediction", metricName="accuracy"),
                        numFolds=3, parallelism=1)


# Create pipeline for Logistic Regression
pipeline_lr = Pipeline(stages=[hashingTF, idf, cv_lr])

pipeline_lr

### 2. Hyperparameters tuning

In [None]:
results=train_and_evaluate_models(model_pipelines=[pipeline_lr],model_names=["Logistic Regression"])
results

### 3. Get the best parameters

In [None]:
fitted_model=results['fitted_model']

# Get the best model
best_model = fitted_model.stages[-1].bestModel

# Print the best parameters
print(f"Best parameters for Logistic regression:")

for param, value in best_model.extractParamMap().items():
     print(f"  {param.name}: {value}")

### 4. Save the best model

In [None]:
best_model.save('output/news_categorization_model')

24/06/04 20:03:43 WARN TaskSetManager: Stage 216 contains a task of very large size (33450 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

## VII- Summary

In this notebook we have studied two models for our news categorization task. There are Naive Bayes and Logistic regression.

 Our study reveals that the Logistic regression was the one with best performance.

 Then we tunned the Logistic regression hyperparameters using grid search and then we find the best model that we save.

 The next step of our work will be to ...

In [None]:
#df.unpersist()