In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=ce524e6b261d3c4f86eec1ca1cc19106341a686bccf8b498ee4e05bdc4dc2cad
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


# News categorization


## I- Modules import

In [6]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.feature import  IDF, HashingTF
from pyspark.ml import  Pipeline
from math import ceil,log2
from pyspark.ml.classification import LogisticRegression,NaiveBayes,LogisticRegressionModel
from pyspark.sql.functions import col,explode,split
import numpy as np

## II- Spark context and session creation

In [None]:
spark = (SparkSession.builder
    .master("spark://node02:7077")
    .appName("NewsCategorisation")
    #.config('spark.driver.cores','4')
    #.config("spark.executorEnv.LD_LIBRARY_PATH", "/home/team1337/.local/easybuild_new/software/Python/3.8.6-GCCcore-10.2.0/lib:/home/team1337/.local/easybuild_new/software/libffi/3.3-GCCcore-10.2.0/lib64:/home/team1337/.local/easybuild_new/software/GMP/6.2.0-GCCcore-10.2.0/lib:/home/team1337/.local/easybuild_new/software/XZ/5.2.5-GCCcore-10.2.0/lib:/home/team1337/.local/easybuild_new/software/SQLite/3.33.0-GCCcore-10.2.0/lib:/home/team1337/.local/easybuild_new/software/Tcl/8.6.10-GCCcore-10.2.0/lib:/home/team1337/.local/easybuild_new/software/libreadline/8.0-GCCcore-10.2.0/lib:/home/team1337/.local/easybuild_new/software/ncurses/6.2-GCCcore-10.2.0/lib:/home/team1337/.local/easybuild_new/software/bzip2/1.0.8-GCCcore-10.2.0/lib:/home/team1337/.local/easybuild_new/software/binutils/2.35-GCCcore-10.2.0/lib:/home/team1337/.local/easybuild_new/software/zlib/1.2.11-GCCcore-10.2.0/lib:/home/team1337/.local/easybuild_new/software/GCCcore/10.2.0/lib64") \
    #.config("spark.pyspark.python", "/home/team1337/.local/easybuild_new/software/Python/3.8.6-GCCcore-10.2.0/bin/python3") \
    .getOrCreate()
        )
spark

## III- Dataframe preparation

### 1. Load the data

In [None]:
# Load data
df = spark.read.parquet("input/news.parquet", header=True, inferSchema=True)

[Stage 0:>                                                          (0 + 1) / 1]                                                                                

### 2. Partition and cache the dataframe

In [None]:
df.rdd.getNumPartitions()

9

In [None]:
num_partitions=5*40
df= df.repartition(num_partitions).cache()

In [None]:
df.rdd.getNumPartitions()



200

### 3. Preview the data

In [None]:
df.count()

                                                                                

1716608

In [None]:
df.show()

+--------------+--------------------+
|category_label|description_filtered|
+--------------+--------------------+
|          10.0|every bozo need k...|
|          11.0|best redness past...|
|          10.0|fashion show used...|
|           9.0|challenge present...|
|          10.0|man vintage show ...|
|          11.0|hidden mickey spo...|
|          10.0|next fashion uppe...|
|          11.0|work home rabbi d...|
|           9.0|mommy manage nt g...|
|           9.0|ontogenesis hormo...|
|          10.0|7 gross grooming ...|
|           9.0|period display bo...|
|          10.0|transformation ph...|
|          10.0|morena baccarin g...|
|          11.0|expat recovery ro...|
|          11.0|amsterdam diverse...|
|          11.0|view afar make ma...|
|          10.0|new house york fa...|
|           9.0|5 dumbest affair ...|
|           9.0|   valentine day kid|
+--------------+--------------------+
only showing top 20 rows



In [None]:
df.printSchema()

### 4. Convert filtered descriptions to arrays

In [None]:
# Create a new DataFrame with description_filtered as arrays
df= df.withColumn('description_filtered', split(col('description_filtered'), ' '))
# Show the new DataFrame
df.show(truncate=False)

+--------------+----------------------------------------------------------------------------------------------------------------+
|category_label|description_filtered                                                                                            |
+--------------+----------------------------------------------------------------------------------------------------------------+
|10.0          |[every, bozo, need, know, fashion]                                                                              |
|11.0          |[best, redness, pasta, sauce, italian, love, life, ve, never, heard]                                            |
|10.0          |[fashion, show, used, represent, raucous, liaison, video]                                                       |
|9.0           |[challenge, present, girl, constitute, dying]                                                                   |
|10.0          |[man, vintage, show, delivers, carefully, curated, habiliment, culled, aro

## IV- Feature Engineering


### 1. Explode the filtered descriptions to get the words

In [None]:
exploded_df=df.select(explode(df.description_filtered)).alias('words')
exploded_df.show()

In [None]:
#df=df.unpersist()

### 2. Get unique words in the filtered_description

In [None]:
unique_words=exploded_df.distinct()

### 3. Cache and show the unique words dataframe

In [None]:
unique_words=unique_words.cache()
unique_words.show()

### 4. Get the vocabulary size

In [None]:
vocabulary_size=unique_words.count()
vocabulary_size

128622

### 5. Unpersit the unique words dataframe(not needed anymore)

In [None]:
unique_words=unique_words.unpersist()

### 6. Get the smallest `n` such that $2^n$ is greater than `vocabulary_size`

In [None]:
num_features=log2(vocabulary_size)


In [None]:
n=ceil(log2(vocabulary_size))
n

17

### 7. Get the number of features for HashingTF

In [None]:
num_features=2**n
num_features

### 8. Define the HashingTF and IDF stages

In [None]:
# Define the HashingTF and IDF stages
hashingTF = HashingTF(inputCol="description_filtered", outputCol="rawFeatures", numFeatures=num_features)
idf = IDF(inputCol="rawFeatures", outputCol="features")

## V- Models set up, training and evaluation

### 1. Create a function to set up pipelines

We will create a function which set up and returns  the pipelines of the following transformations for Native Bayes and Linear reggression

- HashingTF
- IDF
- 3-Fold Cross-validation  with eventually grid search

Our function takes as optional argument a boolean with False as default value, which specifies wether to perform grid search or not.

In [7]:
def set_up_pipelines(grid_search=False):

    # Define the classifiers

    # Logistic regression classifier
    lr = LogisticRegression(labelCol="category_label", featuresCol="features")

    # Naive Bayes classifier
    nb = NaiveBayes(labelCol="category_label", featuresCol="features")

    # Define parameter grids
    paramGrid_nb = (ParamGridBuilder()
        .addGrid(nb.smoothing, [0.5, 1.0, 2.0])
        .build())

    paramGrid_nb=paramGrid_lr=ParamGridBuilder()

    if grid_search:
        # Define parameter grids for Native Bayes grid search
        reg_values = np.logspace(-4, 4, num=100)
        l1_ratios = np.linspace(0, 1, num=10)

        # Add  parameters to the grid
        paramGrid_nb=paramGrid_nb.addGrid(nb.smoothing, [0.5, 1.0, 2.0])
        paramGrid_lr=paramGrid_lr.addGrid(lr.regParam, reg_values)

    # Build the parmaeters grids
    paramGrid_nb = paramGrid_nb.build()
    paramGrid_lr = paramGrid_lr.build()





    # Create cross validators

    # Cross-validation for Naive Bayes
    cv_nb = CrossValidator(estimator=nb, estimatorParamMaps=paramGrid_nb,
                        evaluator=MulticlassClassificationEvaluator(labelCol="category_label", predictionCol="prediction", metricName="accuracy"),
                        numFolds=3, parallelism=1)
    # Cross-validation for Logistic Regression
    cv_lr = CrossValidator(estimator=lr, estimatorParamMaps=paramGrid_lr,
                        evaluator=MulticlassClassificationEvaluator(labelCol="category_label", predictionCol="prediction", metricName="accuracy"),
                        numFolds=3, parallelism=1)


    # Create pipelines
    # Pipeline for Naive Bayes
    pipeline_nb = Pipeline(stages=[hashingTF, idf, cv_nb])
    # Pipeline for Logistic Regression
    pipeline_lr = Pipeline(stages=[hashingTF, idf, cv_lr])

    # Return the pipelines
    return pipeline_nb, pipeline_lr


### 2. Split the data

First of all let us split the data into train and test set: 80% for train and 20% for test

In [None]:
# Split data
(train_set, test_set) = df.randomSplit([0.8, 0.2], seed=0)

### 3. Create a function for model training

Let us create a function which takes as argument a model that it trains and then returns the trained model.

In [None]:
def train_model(model):
    print('Training the model')
    fitted_model = model.fit(train_set)
    print('Done')
    return fitted_model

### 4. Define a function to evaluate the model

The function takes as parameter a fitted model, evaluates the model on train and test split and then return the train and test performance. The accuracy is the metric used.

In [None]:
# Initialize the evaluator
evaluator = MulticlassClassificationEvaluator(labelCol="category_label", predictionCol="prediction", metricName="accuracy")

# Function to evaluate model and get best parameters
def evaluate_model(fitted_model):

    print('Making predictions on the training set')

    train_predictions = fitted_model.transform(train_set)

    print('Making predictions on the test set')
    test_predictions = fitted_model.transform(test_set)

    print('Evaluating the model on training set')
    train_accuracy = evaluator.evaluate(train_predictions)

    print('Evaluating the model on test set')
    test_accuracy = evaluator.evaluate(test_predictions)
    return train_accuracy, test_accuracy

In [None]:
# Function to evaluate model and get best parameters
def evaluate_model(model, model_name):
    print('Training the model')

    # Train the model using cross-validation
    fitted_model = model.fit(train_set)

    # Get the best model from cross-validation
    best_model = fitted_model.stages[-1].bestModel

    print('Making predictions on the training set')
    # Make predictions on the training set
    train_predictions = fitted_model.transform(train_set)

    print('Making predictions on the test set')
    # Make predictions on the test set
    test_predictions = fitted_model.transform(test_set)

    # Initialize the evaluator
    evaluator = MulticlassClassificationEvaluator(labelCol="category_label", predictionCol="prediction", metricName="accuracy")

    print('Evaluating the model on training set')
    # Evaluate the model on the training set
    train_accuracy = evaluator.evaluate(train_predictions)

    print('Evaluating the model on test set')
    # Evaluate the model on the test set
    test_accuracy = evaluator.evaluate(test_predictions)

    print(f"{model_name} Train Accuracy: {train_accuracy}")
    print(f"{model_name} Test Accuracy: {test_accuracy}")

    # Print the best parameters
    print(f"Best parameters for {model_name}:")

    for param, value in best_model.extractParamMap().items():
        print(f"  {param.name}: {value}")

    return train_accuracy, test_accuracy,best_model

### 5. Call the functions and interpret the results

#### a Set up the pipelines

In [None]:
# Create the pipelins
model_pipelines  = set_up_pipelines()

In [None]:
# Naive Bayes pipeline
model_pipelines[0]

In [None]:
# Logistic regression pipeline
model_pipelines

#### b. Train the two models

In [None]:
results = {}

model_names = ["Naive Bayes", "Logistic Regression"]

# Initialize the results dictionary
results = []

# Loop over the indices and model names simultaneously
for idx, (model_pipeline, model_name) in enumerate(zip(model_pipelines, model_names)):
    results[idx] = {'model_name': model_name, 'pipeline': model_pipeline}

# Print results to verify
for idx in results:
    print(f"Index: {idx}, Model Name: {results[idx]['model_name']}, Pipeline: {value['pipeline']}")





    nb_fitted_model= train_model(pipeline_nb)
    lr_fitted_model = train_model(pipeline_nb)


    print(f'Evaluating {model_name}')
    train_accuracy, test_accuracy,best_model = evaluate_model()
    results[model_name] = {"train_accuracy": train_accuracy, "test_accuracy": test_accuracy}


In [None]:
best_model.save('pp')

24/06/04 20:03:43 WARN TaskSetManager: Stage 216 contains a task of very large size (33450 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [None]:
test_set=te
loaded_model = NaiveBayesModel.load('pp')

24/06/04 20:31:50 ERROR Instrumentation: java.lang.IllegalArgumentException: requirement failed: Error loading metadata: Expected class name org.apache.spark.ml.PipelineModel but found class name org.apache.spark.ml.classification.NaiveBayesModel
	at scala.Predef$.require(Predef.scala:281)
	at org.apache.spark.ml.util.DefaultParamsReader$.parseMetadata(ReadWrite.scala:610)
	at org.apache.spark.ml.util.DefaultParamsReader$.loadMetadata(ReadWrite.scala:588)
	at org.apache.spark.ml.Pipeline$SharedReadWrite$.$anonfun$load$3(Pipeline.scala:269)
	at org.apache.spark.ml.util.Instrumentation$.$anonfun$instrumented$1(Instrumentation.scala:191)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:191)
	at org.apache.spark.ml.Pipeline$SharedReadWrite$.load(Pipeline.scala:268)
	at org.apache.spark.ml.PipelineModel$PipelineModelReader.$anonfun$load$7(Pipeline.scala:356)
	at org.apache.spark.ml.MLEvents.withLoadInstanceEvent(events

IllegalArgumentException: requirement failed: Error loading metadata: Expected class name org.apache.spark.ml.PipelineModel but found class name org.apache.spark.ml.classification.NaiveBayesModel

In [None]:
n = 17  # Assuming n is defined somewhere
hashingTF = HashingTF(inputCol="description_filtered", outputCol="rawFeatures", numFeatures=2**n)
idf = IDF(inputCol="rawFeatures", outputCol="features")

# Apply HashingTF to the test set
test_set_with_raw_features = hashingTF.transform(test_set)

# Fit IDF on the training set to get the IDF model
idf_model = idf.fit(test_set_with_raw_features)

# Apply IDF to the test set
test_set_with_features = idf_model.transform(test_set_with_raw_features)

# Show the test set with the new features column
test_set_with_features.select("description_filtered", "rawFeatures", "features").show(5, truncate=False)

In [None]:
loaded_model=NaiveBayesModel.load('pp')
# Verify the model is loaded by checking its parameters or making predictions
print("Loaded model parameters:")
for param, value in loaded_model.extractParamMap().items():
    print(f"  {param.name}: {value}")

# Making predictions on a new dataset
# Assuming test_set is your test DataFrame
predictions = loaded_model.transform(test_set_with_features)

[Stage 238:>                                                        (0 + 1) / 1]

Loaded model parameters:
  featuresCol: features
  labelCol: category_label
  modelType: multinomial
  predictionCol: prediction
  probabilityCol: probability
  rawPredictionCol: rawPrediction
  smoothing: 1.0


                                                                                

In [None]:
predictions.select('description_filtered','prediction').show()

24/06/04 20:45:14 WARN DAGScheduler: Broadcasting large task binary with size 34.1 MiB


+--------------------+----------+
|description_filtered|prediction|
+--------------------+----------+
|[apple, watch, ev...|      26.0|
|[arthveda, fund, ...|       0.0|
|[bank, india, hea...|       0.0|
|[bank, looking, a...|       0.0|
|[bank, may, accel...|       0.0|
|[better, liquidit...|       0.0|
|[bharti, axa, lif...|       0.0|
|[bill, seek, repl...|       0.0|
|[central, bank, a...|       0.0|
|[cleanup, exercis...|       0.0|
|[court, toss, 21,...|      27.0|
|[credit, growth, ...|       0.0|
|[credit, rating, ...|       0.0|
|[deal, ergos, sta...|       0.0|
|[describes, brexi...|       2.0|
|[draftkings, repo...|       0.0|
|[farmer, longer, ...|       0.0|
|[financial, intel...|       0.0|
|[firstquarter, ad...|       0.0|
|[given, poor, fis...|       0.0|
+--------------------+----------+
only showing top 20 rows



In [None]:
#df.unpersist()