# News Topic modeling


## I- Modules import

In [1]:
import os
home_dir = os.path.expanduser("~")
home_dir

# Construct the full paths to the JAR files
os.environ['SPARK_HOME']=os.path.join(home_dir, 'spark-3.2.0')
rapids_jar_path = os.path.join(home_dir, 'rapids-4-spark_2.12-21.12.0.jar')
cudf_jar_path = os.path.join(home_dir, 'cudf-21.12.2-cuda11.jar')
os.environ['JAVA_HOME'] = "/home/saturne.ayidegnon/.asdf/installs/java/openjdk-11"
os.environ['PYSPARK_SUBMIT_ARGS']=f"--jars {rapids_jar_path},{cudf_jar_path} --master local[*] pyspark-shell"
os.environ['PYSPARK_SUBMIT_ARGS']

import findspark
findspark.init()

In [2]:
#!pip install pyspark==3.2.0

In [3]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.feature import  IDF, HashingTF,CountVectorizer
from pyspark.ml import  Pipeline
from math import ceil,log2
from pyspark.ml.classification import LogisticRegression,NaiveBayes,LogisticRegressionModel
from pyspark.sql.functions import col,explode,split

import numpy as np
from pyspark.ml.clustering import LDA

In [4]:
#spark.stop()

## II- Spark context and session creation

In [11]:
spark = (SparkSession.builder
.appName('SparkRAPIDS')
.config('spark.plugins','com.nvidia.spark.SQLPlugin')
.config ("spark.driver.memory", "64g")
.getOrCreate()
        )
spark.sparkContext.addPyFile(rapids_jar_path)
spark.sparkContext.addPyFile(cudf_jar_path)
spark.conf.set('spark.rapids.sql.enabled','true')
spark.conf.set('spark.rapids.sql.incompatibleOps.enabled', 'true')
spark.conf.set('spark.rapids.sql.format.csv.read.enabled', 'true')
spark.conf.set('spark.rapids.sql.format.csv.enabled', 'true')


spark.conf.set('spark.rapids.sql.format.parquet.read.enabled', 'true')
spark.conf.set('spark.rapids.sql.format.parquet.enabled', 'true')
spark 

24/06/08 01:45:32 WARN SparkContext: The path /home/saturne.ayidegnon/rapids-4-spark_2.12-21.12.0.jar has been added already. Overwriting of added paths is not supported in the current version.
24/06/08 01:45:32 WARN SparkContext: The path /home/saturne.ayidegnon/cudf-21.12.2-cuda11.jar has been added already. Overwriting of added paths is not supported in the current version.


In [10]:
#spark.stop()

In [6]:
print('Hello')

Hello


In [7]:
#!nvidia-smi

## III- Dataframe preparing

### 1. Load the data

In [8]:
# Load data
#df = spark.read.csv("input/news.csv", header=True, inferSchema=True)

In [16]:
path = "input"
df = spark.read.load(os.path.join(path, 'news.parquet'), format='parquet', sep=',', inferSchema=True, header=True)

In [17]:
#spark.stop()

### 2. Partition and cache the dataframe

In [18]:
df.rdd.getNumPartitions()

9

In [19]:
num_partitions=4*40
df= df.repartition(num_partitions).cache()

In [20]:
df.rdd.getNumPartitions()

160

### 3. Preview the data

In [21]:
df.count()

                                                                                

1716608

In [22]:
df.show()

+--------------+--------------------+
|category_label|description_filtered|
+--------------+--------------------+
|           0.0|10 leadership les...|
|           0.0|12000page charge ...|
|           0.0|201819 premium co...|
|           0.0|3 bank failed tak...|
|           0.0|4 investing lesso...|
|           0.0|5 tip successful ...|
|           0.0|7 reason starting...|
|           0.0|abbott india q1 n...|
|           0.0|according rbi dat...|
|           0.0|acute kidney inju...|
|           0.0|affect home auto ...|
|           0.0|ahmedabadbased sh...|
|           0.0|airtel partnering...|
|           0.0|allahabad bankled...|
|           0.0|although bank cre...|
|           0.0|america fastest g...|
|           0.0|amid speculation ...|
|           0.0|andhra bank poste...|
|           0.0|another reason st...|
|           0.0|application ha re...|
+--------------+--------------------+
only showing top 20 rows



In [23]:
df.printSchema()

root
 |-- category_label: double (nullable = true)
 |-- description_filtered: string (nullable = true)



### 4. Convert filtered descriptions to arrays

In [24]:
# Create a new DataFrame with description_filtered as arrays
df= df.withColumn('description_filtered', split(col('description_filtered'), ' '))
# Show the new DataFrame
df.show(truncate=False)

+--------------+-------------------------------------------------------------------------------------------------------------------------------------------+
|category_label|description_filtered                                                                                                                       |
+--------------+-------------------------------------------------------------------------------------------------------------------------------------------+
|0.0           |[10, leadership, lesson, u, commanding, general, john, e, michel]                                                                          |
|0.0           |[12000page, charge, sheet, claimed, similar, fraud, wa, detected, 2016, following, rbi, took, issue]                                       |
|0.0           |[201819, premium, collected, member, bank, wa, r, 12040, crore, commercial, bank, contributed, 93, per, cent]                              |
|0.0           |[3, bank, failed, take, timely, remedial, 

## IV- Feature Engineering


### 1. Explode the filtered descriptions to get the words

In [25]:
exploded_df=df.select(explode(df.description_filtered)).alias('words')
exploded_df.show()

+----------+
|       col|
+----------+
|        10|
|leadership|
|    lesson|
|         u|
|commanding|
|   general|
|      john|
|         e|
|    michel|
| 12000page|
|    charge|
|     sheet|
|   claimed|
|   similar|
|     fraud|
|        wa|
|  detected|
|      2016|
| following|
|       rbi|
+----------+
only showing top 20 rows



In [26]:
#df=df.unpersist()

### 2. Get unique words in the filtered_description

In [27]:
unique_words=exploded_df.distinct()

### 3. Cache and show the unique words dataframe

In [28]:
unique_words=unique_words.cache()
unique_words.show()



+-----------+
|        col|
+-----------+
|     36goal|
|        isp|
|    quatern|
|      2015a|
| onetimeuse|
|        467|
|    chairez|
|      hobbs|
|   linguini|
|   antilent|
|   monkfish|
|       lwor|
|       cbis|
|       koxa|
|   tortured|
|        kkr|
|     online|
| trezeguets|
|    nimbler|
|paracycling|
+-----------+
only showing top 20 rows



                                                                                

### 4. Get the vocabulary size

In [29]:
vocabulary_size=unique_words.count()
vocabulary_size

128622

### 5. Define the CountVectorizer and IDF stages

In [30]:
# Define the HashingTF and IDF stages
vectorizer = CountVectorizer(inputCol="description_filtered", outputCol="raw_features",vocabSize=vocabulary_size, minDF=3.0)
idf = IDF(inputCol="raw_features", outputCol="features")

## V- Models set up, training and evaluation

### 1. Set up LDA model

In [31]:
#num_topics = 20
#lda = LDA(k=num_topics, maxIter=10)
lda = LDA(featuresCol="features",seed=0)
lda

LDA_4a2474afb468

### 2. Set up pipelines

We will  set up the pipelines of the following transformations for Naive Bayes and Linear reggression

- CountVectorizer
- IDF

In [32]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import CountVectorizer, IDF
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.clustering import LDA
 
# Create pipeline for LDA
pipeline = Pipeline(stages=[vectorizer, idf, lda]) 


pipeline

Pipeline_9a379cbe3a44

### 3. Split the data

First of all let us split the data into train and test set: 80% for train and 20% for test

In [33]:
# Split data
(train_set, test_set) = df.randomSplit([0.8, 0.2], seed=0)

### 4. Create a function for model training

Let us create a function which takes as argument a model that it trains and then returns the trained model.

In [34]:
def train_model(model):    
    return model.fit(train_set)

In [35]:
fitted_model=train_model(pipeline)
fitted_model

24/06/08 01:46:28 WARN DAGScheduler: Broadcasting large task binary with size 1974.4 KiB
24/06/08 01:46:30 WARN DAGScheduler: Broadcasting large task binary with size 1974.5 KiB
24/06/08 01:46:31 WARN DAGScheduler: Broadcasting large task binary with size 1991.3 KiB
24/06/08 01:46:31 WARN DAGScheduler: Broadcasting large task binary with size 13.2 MiB
24/06/08 01:46:32 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
24/06/08 01:46:32 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
24/06/08 01:46:35 WARN DAGScheduler: Broadcasting large task binary with size 18.8 MiB
24/06/08 01:46:36 WARN DAGScheduler: Broadcasting large task binary with size 1991.3 KiB
24/06/08 01:46:36 WARN DAGScheduler: Broadcasting large task binary with size 13.2 MiB
24/06/08 01:46:38 WARN DAGScheduler: Broadcasting large task binary with size 18.8 MiB
24/06/08 01:46:38 WARN DAGScheduler: Broadcasting large task binary with size 1991.3 

PipelineModel_71ce7aaf14f4

### 5. Visualize the topics

In [36]:
fitted_vectirizer=fitted_model.stages[0]
vocabulary= fitted_vectirizer.vocabulary
len(vocabulary)

73468

In [37]:
vocabulary[:10]

['new', 'photo', 'state', 'trump', 'day', 'nt', 'say', 'woman', 'make', 'get']

In [38]:
topics = fitted_model.stages[-1].describeTopics()   
topics.show()

+-----+--------------------+--------------------+
|topic|         termIndices|         termWeights|
+-----+--------------------+--------------------+
|    0|[191, 2, 7, 306, ...|[0.00386515694159...|
|    1|[50, 1, 74, 4, 32...|[0.00322483897564...|
|    2|[35, 11, 14, 1, 5...|[0.00463661178938...|
|    3|[5, 19, 41, 208, ...|[0.00345376318725...|
|    4|[2, 202, 0, 340, ...|[0.00387164342803...|
|    5|[21, 192, 92, 13,...|[0.00760989750144...|
|    6|[3, 36, 231, 339,...|[0.00656038181206...|
|    7|[9, 39, 266, 51, ...|[0.00321040978364...|
|    8|[26, 4, 0, 111, 7...|[0.00527198922910...|
|    9|[16, 30, 0, 55, 5...|[0.00680606310236...|
+-----+--------------------+--------------------+





In [39]:
topics_rdd = topics.rdd
topics_words = topics_rdd\
       .map(lambda row: row['termIndices'])\
       .map(lambda idx_list: [vocabulary[idx] for idx in idx_list])\
       .collect()
topics_words[:2]

                                                                                

[['medium',
  'state',
  'woman',
  'social',
  'market',
  'republic',
  'global',
  'young',
  'new',
  'baby'],
 ['family',
  'photo',
  'food',
  'day',
  'best',
  'angstrom',
  'new',
  'relationship',
  'unit',
  'clarence']]

In [40]:
for idx, topic in enumerate(topics_words):
    print("topic: {}".format(idx))
    print("*"*25)
    for word in topic:
       print(word)
    print("*"*25)

topic: 0
*************************
medium
state
woman
social
market
republic
global
young
new
baby
*************************
topic: 1
*************************
family
photo
food
day
best
angstrom
new
relationship
unit
clarence
*************************
topic: 2
*************************
police
video
5
photo
marriage
man
new
force
show
picture
*************************
topic: 3
*************************
nt
world
health
organization
new
trump
state
school
u
make
*************************
topic: 4
*************************
state
tree
new
race
right
bill
concern
say
5
life
*************************
topic: 5
*************************
bank
r
india
ha
space
said
state
crore
nt
20
*************************
topic: 6
*************************
trump
donald
clinton
hillary
best
new
nt
card
say
m
*************************
topic: 7
*************************
get
love
room
black
woman
photo
nt
way
sex
video
*************************
topic: 8
*************************
wedding
day
new
hour
college
sexua

### Get topics distributions

In [41]:
# Transform the training and test data
train_set_transformed = fitted_model.transform(train_set)
test_set_transformed = fitted_model.transform(test_set)

# Get the LDA model from the pipeline model
lda_model = fitted_model.stages[-1]

# Extract the topic distributions
train_topic_distributions = train_set_transformed.select("description_filtered", "topicDistribution")
test_topic_distributions = test_set_transformed.select("description_filtered", "topicDistribution")

In [42]:
# Show the topic distributions for the training set
train_topic_distributions.show(truncate=False)

# Show the topic distributions for the test set
test_topic_distributions.show(truncate=False)

24/06/08 01:47:18 WARN DAGScheduler: Broadcasting large task binary with size 7.5 MiB


+-------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|description_filtered                                                                                                                       |topicDistribution                                                                                                                                                                                                      |
+-------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|description_filtered                                                                                                                                                  |topicDistribution                                                                                                                                                                                                      |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------

24/06/08 01:47:19 WARN DAGScheduler: Broadcasting large task binary with size 7.5 MiB


### 5. Define a function to evaluate the model

The function takes as parameter a fitted model, evaluates the model on train and test split and then return the train and test performance. The accuracy is the metric used.

In [43]:
# Function to evaluate model and get best parameters
def evaluate_model(fitted_model,data_transformed=[train_set_transformed,test_set_transformed]):
    
    print('Evaluating the model on training set')
    train_lp = fitted_model.logPerplexity(data_transformed[0])

    print('Evaluating the model on test set')
    test_lp = fitted_model.logPerplexity(data_transformed[1])
    
    print("The upper bound on perplexity for train set: " + str(train_lp))
    print("The upper bound on perplexity for test set: " + str(test_lp))
    return train_lp, test_lp

In [44]:
train_lp,test_lp=evaluate_model(fitted_model.stages[-1])
train_lp,test_lp

Evaluating the model on training set


24/06/08 01:47:19 WARN DAGScheduler: Broadcasting large task binary with size 1975.3 KiB
24/06/08 01:50:45 WARN DAGScheduler: Broadcasting large task binary with size 1976.0 KiB
                                                                                

Evaluating the model on test set


24/06/08 02:14:27 WARN DAGScheduler: Broadcasting large task binary with size 1975.3 KiB
24/06/08 02:15:20 WARN DAGScheduler: Broadcasting large task binary with size 1976.0 KiB

The upper bound on perplexity for train set: 9.075611646425005
The upper bound on perplexity for test set: 9.13477040948899


                                                                                

(9.075611646425005, 9.13477040948899)

In [None]:
results={}
num_topics_range=[20, 25, 30, 35, 40, 45, 50]

for num_topics in num_topics_range:
    print('LDA for k={}'.format(num_topics))
    # Create LDA
    lda = LDA(featuresCol="features",seed=0)
    # Create pipeline for LDA
    pipeline = Pipeline(stages=[vectorizer, idf, lda])
    print('Model training')
    # Train the model
    fitted_model=train_model(pipeline)
    print('Done')
    
    train_set_transformed = fitted_model.transform(train_set)
    test_set_transformed = fitted_model.transform(test_set)
    train_lp,test_lp=evaluate_model(fitted_model,data_transformed=[train_set_transformed,test_set_transformed])
    results[num_topics]=
    
    

### 6. Create a function which takes pipelines and train the models, evaluate them and then return the results

We remark that
- Naive Bayes
- Logistic regression

We can then conclude that t
- he two models set a good performance on both training and test set.
- The Logistic regression models outperforms the Naive Bayes model

In the next section, we will tune the parameters of the Naive bayes to get the best parameters.

## VI- Logistic regression hyperparameters tuning

### 1. Pipeline creation

In [None]:
# Define parameter grids for Logistic regresion grid search
reg_values = np.logspace(-4, 4, num=100)
l1_ratios = np.linspace(0, 1, num=10)

paramGrid_lr=paramGrid_lr.addGrid(lr.regParam, reg_values).build()

# Create Cross-validation for Logistic Regression
cv_lr = CrossValidator(estimator=lr, estimatorParamMaps=paramGrid_lr,
                        evaluator=MulticlassClassificationEvaluator(labelCol="category_label", predictionCol="prediction", metricName="accuracy"),
                        numFolds=3, parallelism=1)


# Create pipeline for Logistic Regression
pipeline_lr = Pipeline(stages=[hashingTF, idf, cv_lr])

pipeline_lr

### 2. Hyperparameters tuning

In [None]:
results=train_and_evaluate_models(model_pipelines=[pipeline_lr],model_names=["Logistic Regression"])
results

### 3. Get the best parameters

In [None]:
fitted_model=results['fitted_model']

# Get the best model
best_model = fitted_model.stages[-1].bestModel

# Print the best parameters
print(f"Best parameters for Logistic regression:")

for param, value in best_model.extractParamMap().items():
     print(f"  {param.name}: {value}")

### 4. Save the best model

In [None]:
best_model.save('output/news_categorization_model')

24/06/04 20:03:43 WARN TaskSetManager: Stage 216 contains a task of very large size (33450 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

## VII- Summary

In this notebook we have studied two models for our news categorization task. There are Naive Bayes and Logistic regression.

 Our study reveals that the Logistic regression was the one with best performance.

 Then we tunned the Logistic regression hyperparameters using grid search and then we find the best model that we save.

 The next step of our work will be to ...

In [None]:
#df.unpersist()