In [None]:
!pip install pyspark

In [14]:
from pyspark.rdd import RDD
from pyspark.sql import Row
from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from pyspark import SparkContext as sc
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import udf,col
from pyspark.sql.types import ArrayType
from pyspark.sql.types import StringType
from pyspark.sql.types import IntegerType
from pyspark.sql.types import FloatType
import pyspark

# tools
import random
import os
import statistics
import pandas as pd

In [15]:
# Print the version of the `java` executable found on PATH; the recorded run below used Java 11.
!java -version # should be jdk 11

java version "11.0.16" 2022-07-19 LTS
Java(TM) SE Runtime Environment 18.9 (build 11.0.16+11-LTS-199)
Java HotSpot(TM) 64-Bit Server VM 18.9 (build 11.0.16+11-LTS-199, mixed mode)


In [16]:
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import CountVectorizer

# Initialise


In [17]:
"""
Initialize Spark session object
"""
# getOrCreate() hands back the already-active session if there is one,
# otherwise it builds a new session carrying the application name below.
spark = (
    SparkSession.builder
    .appName("Python Spark Naive Bayes CountVectorizer")
    .getOrCreate()
)

In [18]:
# Load the preprocessed CSV from the "data" folder that sits one level above the
# current working directory; the first line of the file supplies the column names.
data = spark.read.csv(
    os.path.join(os.getcwd(), "..", "data/preprocessed_data.csv"),
    header=True,
)
# data = spark.read.csv("preprocessed_data.csv", header=True)
data.show(20)

+--------------------+--------------------+--------------------+--------------------+------+
|            Datetime|               Title|         Description|             Content|Change|
+--------------------+--------------------+--------------------+--------------------+------+
|2019-01-01 20:56:...|Car sales languis...|Tight liquidity, ...|nonetight,liquidi...|  -1.2|
|2019-01-01 18:07:...|Tata Motors domes...|The company said ...|nonethe,company,p...|  -1.2|
|2019-01-02 20:30:...|An evening walk d...|At the close of m...|weak,global,cue,d...| -1.75|
|2019-01-02 14:12:...|Top auto trends o...|The biggest news ...|automotive,indust...|  -1.2|
|2019-01-04 16:18:...|No decision regar...|Economic Affairs ...|decision,recently...| -0.51|
|2019-01-03 14:37:...|HDFC MF surpasses...|As of December-en...|hdfc,mutual,fund,...| -2.85|
|2019-01-03 15:14:...|Axle load norms b...|Sales of medium a...|new,axle,load,nor...|  1.15|
|2019-01-04 09:46:...|Allocate 30-40% o...|For aggressive in...|tell,s

In [19]:
'''
Content was pre-processed and stored as a comma-separated string
'''
def to_label(change):
    """Map a price change to a binary class label.

    Args:
        change: Numeric change value, or None for a missing value.

    Returns:
        1 when ``change`` is strictly positive, 0 when it is zero or negative,
        and None when ``change`` is None or cannot be compared with a number.
    """
    if change is None:
        return None
    try:
        return 1 if change > 0 else 0
    except (TypeError, ValueError):
        # Non-numeric input: keep the original best-effort contract (no label)
        # without hiding unrelated errors behind a bare ``except``.
        return None

# Split the comma-joined "Content" string into a list of tokens.
# The UDF runs on every row *before* na.drop, so a null Content must be handled
# here: returning None (instead of calling None.split) lets na.drop remove the row
# rather than failing the whole job with an AttributeError.
strList_toList = udf(
    lambda r: r.split(',') if r is not None else None,
    ArrayType(StringType()),
)
# Wrap to_label so it can be applied to a column; the result is an integer label.
func_tolabel = udf(lambda x : to_label(x) ,IntegerType())

# 1) tokenise Content, 2) copy Change into "label" cast to float,
# 3) drop every row that has a null in any column.
list_content = (
    data
    .withColumn('Content', strList_toList('Content'))
    .withColumn('label', col("Change").cast("Float"))
    .na.drop("any")
)
# Replace the float change with its class label as computed by to_label.
list_content = list_content.withColumn('label',func_tolabel('label'))

list_content.show(20)

+--------------------+--------------------+--------------------+--------------------+------+-----+
|            Datetime|               Title|         Description|             Content|Change|label|
+--------------------+--------------------+--------------------+--------------------+------+-----+
|2019-01-01 20:56:...|Car sales languis...|Tight liquidity, ...|[nonetight, liqui...|  -1.2|    0|
|2019-01-01 18:07:...|Tata Motors domes...|The company said ...|[nonethe, company...|  -1.2|    0|
|2019-01-02 20:30:...|An evening walk d...|At the close of m...|[weak, global, cu...| -1.75|    0|
|2019-01-02 14:12:...|Top auto trends o...|The biggest news ...|[automotive, indu...|  -1.2|    0|
|2019-01-04 16:18:...|No decision regar...|Economic Affairs ...|[decision, recent...| -0.51|    0|
|2019-01-03 14:37:...|HDFC MF surpasses...|As of December-en...|[hdfc, mutual, fu...| -2.85|    0|
|2019-01-03 15:14:...|Axle load norms b...|Sales of medium a...|[new, axle, load,...|  1.15|    1|
|2019-01-0

## Hashing-TF Naive Bayes

In [20]:
'''
HashingTF: Create Document-Term Matrix
'''
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

# Hash every token list in "Content" into a 50-dimensional term-frequency vector.
hashingTF = HashingTF(
    numFeatures=50,
    inputCol="Content",
    outputCol="rawFeatures",
)
featurizedData = hashingTF.transform(list_content)

featurizedData.show()

'''
TF-IDF, followed from HashingTF.
'''
# Fit the IDF stage on the raw term frequencies, then apply it to the same
# frame to produce the "features" column.
idf = IDF(
    inputCol="rawFeatures",
    outputCol="features",
)
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

+--------------------+--------------------+--------------------+--------------------+------+-----+--------------------+
|            Datetime|               Title|         Description|             Content|Change|label|         rawFeatures|
+--------------------+--------------------+--------------------+--------------------+------+-----+--------------------+
|2019-01-01 20:56:...|Car sales languis...|Tight liquidity, ...|[nonetight, liqui...|  -1.2|    0|(50,[0,1,2,3,4,5,...|
|2019-01-01 18:07:...|Tata Motors domes...|The company said ...|[nonethe, company...|  -1.2|    0|(50,[6,7,8,10,11,...|
|2019-01-02 20:30:...|An evening walk d...|At the close of m...|[weak, global, cu...| -1.75|    0|(50,[0,1,2,3,4,5,...|
|2019-01-02 14:12:...|Top auto trends o...|The biggest news ...|[automotive, indu...|  -1.2|    0|(50,[0,1,2,3,4,5,...|
|2019-01-04 16:18:...|No decision regar...|Economic Affairs ...|[decision, recent...| -0.51|    0|(50,[0,1,2,3,4,5,...|
|2019-01-03 14:37:...|HDFC MF surpasses.

In [21]:
# Keep only the columns used from here on: timestamp, feature vector and label.
hashing_selected_data = rescaledData.select(['Datetime', 'features', 'label'])
hashing_selected_data.show()

+--------------------+--------------------+-----+
|            Datetime|            features|label|
+--------------------+--------------------+-----+
|2019-01-01 20:56:...|(50,[0,1,2,3,4,5,...|    0|
|2019-01-01 18:07:...|(50,[6,7,8,10,11,...|    0|
|2019-01-02 20:30:...|(50,[0,1,2,3,4,5,...|    0|
|2019-01-02 14:12:...|(50,[0,1,2,3,4,5,...|    0|
|2019-01-04 16:18:...|(50,[0,1,2,3,4,5,...|    0|
|2019-01-03 14:37:...|(50,[0,1,2,3,4,5,...|    0|
|2019-01-03 15:14:...|(50,[0,1,2,3,4,6,...|    1|
|2019-01-04 09:46:...|(50,[0,1,2,3,4,5,...|    0|
|2019-01-07 10:28:...|(50,[0,1,2,3,4,5,...|    0|
|2019-01-07 14:55:...|(50,[0,1,2,3,4,5,...|    0|
|2019-01-07 17:26:...|(50,[0,1,2,3,4,5,...|    0|
|2019-01-08 12:47:...|(50,[0,1,2,3,4,5,...|    0|
|2019-01-09 11:56:...|(50,[0,1,2,3,4,5,...|    0|
|2019-01-11 09:34:...|(50,[0,1,2,3,4,5,...|    0|
|2019-01-10 18:37:...|(50,[0,2,3,5,6,7,...|    0|
|2019-01-11 18:12:...|(50,[1,2,3,4,5,6,...|    0|
|2019-01-11 15:39:...|(50,[0,1,2,6,7,8,...|    0|


In [22]:
'''
Naive-Bayes following from TF-IDF
'''
def NAIVEBAYES_HASH(smooth=0, model_type="multinomial", seed=None):
  """Train a Naive Bayes model on ``hashing_selected_data`` and score it.

  Rows with label 0 and rows with label 1 are each split 70/30 at random and
  then recombined, so both the training and the test set contain both labels.

  Args:
    smooth: Value passed to ``NaiveBayes`` as ``smoothing``.
    model_type: Value passed to ``NaiveBayes`` as ``modelType``; this notebook
      uses "complement" and "multinomial". The default used to be ``0``, which
      is not a string and therefore could not be used as a model type.
    seed: Optional seed forwarded to both ``randomSplit`` calls so a split can
      be reproduced. ``None`` (default) keeps the previous unseeded behaviour.

  Returns:
    Tuple ``(accuracy, model)``: accuracy on the held-out test rows and the
    fitted Naive Bayes model.
  """
  split_weights = [0.7, 0.3]

  # separating training/test sets, one split per label
  label_zero_rows = hashing_selected_data.where(hashing_selected_data.label == 0)
  label_one_rows = hashing_selected_data.where(hashing_selected_data.label == 1)
  training_zero, test_zero = label_zero_rows.randomSplit(split_weights, seed=seed)
  training_one, test_one = label_one_rows.randomSplit(split_weights, seed=seed)

  training = training_zero.union(training_one)
  test = test_zero.union(test_one)

  nb = NaiveBayes(smoothing=smooth, modelType=model_type)
  model_NB = nb.fit(training)

  # display on test set: appends a prediction column
  predictions = model_NB.transform(test)

  # compute accuracy on the test set: compares labelCol and predictionCol
  evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
  accuracy = evaluator.evaluate(predictions)

  return (accuracy, model_NB)

In [23]:
import statistics

# Sweep settings: for every model type draw `iter_total` random smoothing values,
# and for each value average the test accuracy over `iter_each` calls to
# NAIVEBAYES_HASH (every call draws its own random train/test split).
extract_method = "HashingTF"
iter_each = 10
iter_total = 5
m_types = ["complement", "multinomial"]
means = []

for model_type in m_types:
  for k in range(iter_total):
    smoothing = random.uniform(0.01, 0.8)
    # Only the accuracy (element 0 of the returned tuple) is kept per run.
    accuracies = [NAIVEBAYES_HASH(smoothing, model_type)[0] for i in range(iter_each)]
    mean = statistics.mean(accuracies)
    print("=> Mean:", mean, "- Smoothing:", smoothing, "- Model:", model_type)
    means.append((mean, smoothing, model_type, extract_method))

=> Mean: 0.5460549501671508 - Smoothing: 0.664889788784094 - Model: complement
=> Mean: 0.5503600726443918 - Smoothing: 0.5943484655611615 - Model: complement
=> Mean: 0.5523826117511923 - Smoothing: 0.6279686359705199 - Model: complement
=> Mean: 0.5464857533034284 - Smoothing: 0.5199353280214871 - Model: complement
=> Mean: 0.5396050484937208 - Smoothing: 0.6100087451276399 - Model: complement
=> Mean: 0.8824020760925061 - Smoothing: 0.7599031751038198 - Model: multinomial
=> Mean: 0.8535205803914235 - Smoothing: 0.25683969456403133 - Model: multinomial
=> Mean: 0.8764646763204045 - Smoothing: 0.5635865136957934 - Model: multinomial
=> Mean: 0.856717305577901 - Smoothing: 0.38104190566783475 - Model: multinomial
=> Mean: 0.8662366199835855 - Smoothing: 0.45142852671420913 - Model: multinomial


In [24]:
# One row per sweep entry collected in `means`; written to the working directory as CSV.
acc_df = pd.DataFrame(
    data=means,
    columns=['mean', 'smoothing', 'model_type', 'extract_method'],
)
acc_df.to_csv("means_hashing.csv")

In [25]:
# from google.colab import files
# files.download('means_hashing.csv')

## saving the model

In [26]:
# Target location: "saved_models/htfnb_model" under the parent of the working directory.
model_path = os.path.join(
    os.path.dirname(os.getcwd()),
    'saved_models/htfnb_model',
)

## Look up the sweep row with the highest mean accuracy and reuse its settings.
max_acc = acc_df.loc[acc_df['mean'].idxmax()]
smoothing = max_acc['smoothing']
model_type = max_acc['model_type']

# NOTE: this trains a new model with the best settings on a fresh random split;
# it is not one of the model instances that were scored during the sweep.
_, final_model = NAIVEBAYES_HASH(smooth=smoothing, model_type=model_type)

In [28]:
# Persist the retrained model at model_path, replacing anything already saved there.
(
    final_model
    .write()
    .overwrite()
    .save(model_path)
)