In [None]:
!pip install pyspark

In [1]:
from pyspark.rdd import RDD
from pyspark.sql import Row
from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from pyspark import SparkContext as sc
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import udf,col
from pyspark.sql.types import ArrayType
from pyspark.sql.types import StringType
from pyspark.sql.types import IntegerType
from pyspark.sql.types import FloatType
import pyspark

# tools
import random
import os
import statistics
import pandas as pd

In [12]:
!java -version # java version 11.0.16 was used 

java version "11.0.16" 2022-07-19 LTS
Java(TM) SE Runtime Environment 18.9 (build 11.0.16+11-LTS-199)
Java HotSpot(TM) 64-Bit Server VM 18.9 (build 11.0.16+11-LTS-199, mixed mode)


In [3]:
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import CountVectorizer

# Initialise

In [4]:
"""
Initialize Spark session object
"""
spark = SparkSession \
    .builder \
    .appName("Python Spark Naive Bayes CountVectorizer") \
    .getOrCreate()

In [5]:
data = spark.read.csv(os.path.join(os.getcwd(),"..","data/preprocessed_data.csv"), header=True)
# data = spark.read.csv("preprocessed_data.csv", header=True)
data.show(20)

+--------------------+--------------------+--------------------+--------------------+------+
|            Datetime|               Title|         Description|             Content|Change|
+--------------------+--------------------+--------------------+--------------------+------+
|2019-01-01 20:56:...|Car sales languis...|Tight liquidity, ...|nonetight,liquidi...|  -1.2|
|2019-01-01 18:07:...|Tata Motors domes...|The company said ...|nonethe,company,p...|  -1.2|
|2019-01-02 20:30:...|An evening walk d...|At the close of m...|weak,global,cue,d...| -1.75|
|2019-01-02 14:12:...|Top auto trends o...|The biggest news ...|automotive,indust...|  -1.2|
|2019-01-04 16:18:...|No decision regar...|Economic Affairs ...|decision,recently...| -0.51|
|2019-01-03 14:37:...|HDFC MF surpasses...|As of December-en...|hdfc,mutual,fund,...| -2.85|
|2019-01-03 15:14:...|Axle load norms b...|Sales of medium a...|new,axle,load,nor...|  1.15|
|2019-01-04 09:46:...|Allocate 30-40% o...|For aggressive in...|tell,s

In [6]:
'''
Content was pre-processed and stored as comma seperated String
'''
def to_label(change):
  try:
    if change>0:
      return 1
    else:
      return 0
  except:
    pass

strList_toList = udf(lambda r: r.split(','), ArrayType(StringType()))
func_tolabel = udf(lambda x : to_label(x) ,IntegerType())

list_content = data.withColumn('Content',strList_toList('Content')).withColumn('label',col("Change").cast("Float")).na.drop("any")
list_content = list_content.withColumn('label',func_tolabel('label'))

list_content.show(20)

+--------------------+--------------------+--------------------+--------------------+------+-----+
|            Datetime|               Title|         Description|             Content|Change|label|
+--------------------+--------------------+--------------------+--------------------+------+-----+
|2019-01-01 20:56:...|Car sales languis...|Tight liquidity, ...|[nonetight, liqui...|  -1.2|    0|
|2019-01-01 18:07:...|Tata Motors domes...|The company said ...|[nonethe, company...|  -1.2|    0|
|2019-01-02 20:30:...|An evening walk d...|At the close of m...|[weak, global, cu...| -1.75|    0|
|2019-01-02 14:12:...|Top auto trends o...|The biggest news ...|[automotive, indu...|  -1.2|    0|
|2019-01-04 16:18:...|No decision regar...|Economic Affairs ...|[decision, recent...| -0.51|    0|
|2019-01-03 14:37:...|HDFC MF surpasses...|As of December-en...|[hdfc, mutual, fu...| -2.85|    0|
|2019-01-03 15:14:...|Axle load norms b...|Sales of medium a...|[new, axle, load,...|  1.15|    1|
|2019-01-0

# CountVectoriser Naive Bayes 

### Vectorise the content

In [7]:
to_vectorize = list_content.select('Datetime','Content', 'label')
cv = CountVectorizer(inputCol="Content", outputCol="features")

model_vec = cv.fit(to_vectorize)
result_vec = model_vec.transform(to_vectorize)
selectedData = result_vec.select('features', 'label')

In [8]:
selectedData

DataFrame[features: vector, label: int]

### Building Naive Bayes Classification

In [9]:
"""
Define TruePositive, FalsePositive and FalseNegative
x = prediction, y = label
"""
TP = udf(lambda x,y: int(x==1 and y==1))
FP = udf(lambda x,y: int(x==1 and y==0))
FN = udf(lambda x,y: int(x==0 and y==1))

In [10]:
'''
Naive-Bayes following from CountVectorizer
'''
def NAIVEBAYES_CV(smooth=1, model_type="multinomial"): 
    
    # separating train/test data
    # taking equal number of positive and negative changes while splitting
    training_negative, test_negative = selectedData.where(selectedData.label == 0).randomSplit([0.7, 0.3])
    training_positive, test_positive = selectedData.where(selectedData.label == 1).randomSplit([0.7, 0.3])

    training = training_positive.union(training_negative)
    test = test_negative.union(test_positive)

    # create trainer with parameters then train
    # smoothing: smooth probabilities of 0 to the input
    nb = NaiveBayes(smoothing=smooth, modelType=model_type)
    model_NB = nb.fit(training)

    # display on test set: appends a prediction column
    predictions = model_NB.transform(test)
    
    # diagnostic testing
    prela_df = predictions.select("prediction","label")
    prela_df=prela_df.withColumn("TP", TP(prela_df.prediction,prela_df.label))
    prela_df=prela_df.withColumn("FP", FP(prela_df.prediction,prela_df.label))
    prela_df=prela_df.withColumn("FN", FN(prela_df.prediction,prela_df.label))
    
#     return prela_df

    TP_ = prela_df.where(prela_df.TP==1).count()
    FP_ = prela_df.where(prela_df.FP==1).count()
    FN_ = prela_df.where(prela_df.FN==1).count()

    precision = TP_/(TP_+FP_)
    recall = TP_/(TP_+FN_)
    F1 = 2*(precision*recall)/(precision+recall)

    # compute accuracy of on test set: compares labelCol and predictionCol
    evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)

    # return test results and model object
    return (accuracy,precision,recall,F1,model_NB,prela_df)


In [11]:
## Manually trying few examples

acc, precision, recall, F1, modelNB1, prela_df1 = NAIVEBAYES_CV(0.2684835187532758,"multinomial")
acc

0.7142857142857143

In [13]:
'''
Iteration tests on Naive-Bayes

iter_total: iterations for different smoothing nb
iter_each: iterations for the same smoothing nb
'''

extract_method = "CountVectorizer"
iter_each = 10
iter_total = 5
m_types = ["complement", "multinomial"]
accs = []
f1s = []
means = []
for model_type in m_types:
    for k in range(iter_total):
        accuracies = []
        smoothing = random.uniform(0.01, 0.8)
        for i in range(iter_each):
            acc,precision,recall,F1,modelNB,prela_df = NAIVEBAYES_CV(smoothing, model_type)
            accs.append(acc)
            f1s.append(F1)
        mean_acc = statistics.mean(accs)
        mean_f1 = statistics.mean(f1s)
        print("=> Mean_acc: ", mean_acc," => Mean_f1: ",mean_f1, "- Smoothing:", smoothing, "- Model:", model_type)
        means.append((mean_acc,mean_f1, smoothing, model_type, extract_method))

=> Mean_acc:  0.7766455295860981  => Mean_f1:  0.25787998569839504 - Smoothing: 0.5988610787445499 - Model: complement
=> Mean_acc:  0.749400271208196  => Mean_f1:  0.243141301784549 - Smoothing: 0.11283032778252543 - Model: complement
=> Mean_acc:  0.7558577164258549  => Mean_f1:  0.2592191839228742 - Smoothing: 0.04807813441729131 - Model: complement
=> Mean_acc:  0.7484376146777002  => Mean_f1:  0.23867267803335293 - Smoothing: 0.5097049072357274 - Model: complement


ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "c:\users\chhal\documents\discord_bot\newsbased-market-predictor\newspredictorenv\lib\site-packages\py4j\java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "c:\users\chhal\documents\discord_bot\newsbased-market-predictor\newspredictorenv\lib\site-packages\py4j\clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "C:\Users\chhal\AppData\Local\Programs\Python\Python39\lib\socket.py", line 704, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [17]:
acc_df = pd.DataFrame(means, columns=['mean','F1', 'smoothing', 'model_type', 'extract_method'])
acc_df.to_csv(os.path.join(os.path.dirname(os.getcwd()),"data/means_count.csv"))

In [None]:
# from google.colab import files
# files.download('means_count.csv')

## Saving Model

In [39]:
model_path = os.path.join(os.path.dirname(os.getcwd()),'saved_models/cvnb_model')

## saving the highest accuracy model 
max_acc = acc_df.loc[acc_df['mean'].idxmax()]
smoothing = max_acc.smoothing 
model_type = max_acc.model_type

(_, _, _, _, final_model, _) = NAIVEBAYES_CV(smooth=smoothing, model_type=model_type) 

In [48]:
final_model.write().overwrite().save(model_path)

Py4JJavaError: An error occurred while calling o7559.save.
: org.apache.spark.SparkException: Job aborted.
	at org.apache.spark.internal.io.SparkHadoopWriter$.write(SparkHadoopWriter.scala:106)
	at org.apache.spark.rdd.PairRDDFunctions.$anonfun$saveAsHadoopDataset$1(PairRDDFunctions.scala:1091)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:406)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopDataset(PairRDDFunctions.scala:1089)
	at org.apache.spark.rdd.PairRDDFunctions.$anonfun$saveAsHadoopFile$4(PairRDDFunctions.scala:1062)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:406)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:1027)
	at org.apache.spark.rdd.PairRDDFunctions.$anonfun$saveAsHadoopFile$3(PairRDDFunctions.scala:1009)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:406)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:1008)
	at org.apache.spark.rdd.PairRDDFunctions.$anonfun$saveAsHadoopFile$2(PairRDDFunctions.scala:965)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:406)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:963)
	at org.apache.spark.rdd.RDD.$anonfun$saveAsTextFile$2(RDD.scala:1599)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:406)
	at org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1599)
	at org.apache.spark.rdd.RDD.$anonfun$saveAsTextFile$1(RDD.scala:1585)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:406)
	at org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1585)
	at org.apache.spark.ml.util.DefaultParamsWriter$.saveMetadata(ReadWrite.scala:413)
	at org.apache.spark.ml.classification.NaiveBayesModel$NaiveBayesModelWriter.saveImpl(NaiveBayes.scala:571)
	at org.apache.spark.ml.util.MLWriter.save(ReadWrite.scala:168)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:834)
Caused by: java.lang.UnsatisfiedLinkError: org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(Ljava/lang/String;I)Z
	at org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(Native Method)
	at org.apache.hadoop.io.nativeio.NativeIO$Windows.access(NativeIO.java:793)
	at org.apache.hadoop.fs.FileUtil.canRead(FileUtil.java:1218)
	at org.apache.hadoop.fs.FileUtil.list(FileUtil.java:1423)
	at org.apache.hadoop.fs.RawLocalFileSystem.listStatus(RawLocalFileSystem.java:601)
	at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:1972)
	at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:2014)
	at org.apache.hadoop.fs.ChecksumFileSystem.listStatus(ChecksumFileSystem.java:761)
	at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:1972)
	at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:2014)
	at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.getAllCommittedTaskPaths(FileOutputCommitter.java:334)
	at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.commitJobInternal(FileOutputCommitter.java:404)
	at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.commitJob(FileOutputCommitter.java:377)
	at org.apache.hadoop.mapred.FileOutputCommitter.commitJob(FileOutputCommitter.java:136)
	at org.apache.hadoop.mapred.OutputCommitter.commitJob(OutputCommitter.java:291)
	at org.apache.spark.internal.io.HadoopMapReduceCommitProtocol.commitJob(HadoopMapReduceCommitProtocol.scala:192)
	at org.apache.spark.internal.io.SparkHadoopWriter$.$anonfun$write$3(SparkHadoopWriter.scala:100)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.util.Utils$.timeTakenMs(Utils.scala:642)
	at org.apache.spark.internal.io.SparkHadoopWriter$.write(SparkHadoopWriter.scala:100)
	... 51 more
