In [1]:
# Best-effort cleanup: call stop() on any `spark` object left over in the
# kernel. On a fresh kernel the name is undefined and the resulting NameError
# is ignored on purpose.
try:
    spark.stop()
except Exception:
    # `Exception` instead of a bare `except:` so that KeyboardInterrupt and
    # SystemExit are not swallowed.
    pass

# Use findspark to locate the Spark installation automatically
import findspark
findspark.init()

from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

In [None]:
import string

def remove_punctuation(sentence):
    """Lower-case a sentence, strip punctuation characters and split it into words.

    Filtering is done character by character, so only single-character
    symbols are removed: everything in ``string.punctuation`` plus the
    non-ASCII dash. Multi-character tokens such as ``"n't"`` or ``"..."`` are
    not removed as units; their punctuation characters are stripped
    individually (``"don't"`` becomes ``"dont"``).

    :param sentence: text to clean
    :return: list of whitespace-separated, lower-cased words without punctuation
    """
    # A frozenset gives O(1) membership tests for the per-character filter.
    removable = frozenset(string.punctuation) | {'–'}  # dash missing from string.punctuation
    cleaned = "".join(ch for ch in sentence.lower() if ch not in removable)
    return cleaned.split()


In [None]:
from pyspark.ml.feature import HashingTF, IDF

def tf_idf(data_rdd):
    """Compute TF-IDF (term frequency-inverse document frequency) features.

    The RDD is converted with ``toDF()``, the ``words`` column is hashed into
    ``tf_features`` and an IDF model fitted on that output produces the final
    ``features`` column.

    :param data_rdd: input RDD; its DataFrame form must provide the ``label``
        and ``words`` columns that are selected below
    :return: DataFrame with the columns ``label``, ``words`` and ``features``
    """
    term_hasher = HashingTF(inputCol='words', outputCol='tf_features')
    idf_estimator = IDF(inputCol='tf_features', outputCol='features')

    hashed_df = term_hasher.transform(data_rdd.toDF())
    # fit() returns the fitted IDF model, which is then applied to the same data
    idf_model = idf_estimator.fit(hashed_df)
    weighted_df = idf_model.transform(hashed_df)

    return weighted_df.select(['label', 'words', 'features'])


In [None]:
import numpy as np
import pandas as pd

from pyspark.ml.classification import NaiveBayes

def naive_bayes_classifier(training_df, testing_df):
    """Fit a Naive Bayes classifier and predict labels for the test data.

    :param training_df: labelled training data handed to ``NaiveBayes().fit``
    :param testing_df: data scored with the fitted model
    :return: DataFrame with the columns ``label``, ``words`` and ``prediction``
    """
    fitted_model = NaiveBayes().fit(training_df)
    predictions = fitted_model.transform(testing_df)
    return predictions.select(['label', 'words', 'prediction'])


def calculate_accuracy(result_df):
    """Return the fraction of rows whose ``prediction`` equals their ``label``.

    :param result_df: DataFrame with ``label`` and ``prediction`` columns
    :return: accuracy as a float in [0.0, 1.0]; 0.0 when ``result_df`` has no
        rows (instead of raising ZeroDivisionError)
    """
    total = result_df.count()
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    correct = result_df.filter(result_df.label == result_df.prediction).count()
    return float(correct) / total


def confusion_matrix(result_df):
    """Print the confusion-matrix counts and return them as a pandas DataFrame.

    Label/prediction value 1.0 is treated as positive and 0.0 as negative;
    rows with any other value are not counted. Each cell is obtained with its
    own ``filter(...).count()`` on ``result_df``.

    :param result_df: DataFrame returned from the model, with ``label`` and
        ``prediction`` columns
    :return: 2x2 pandas DataFrame; rows are the actual class, columns the
        predicted class, both ordered ``Positive``, ``Negative``
    """
    def count_cell(actual, predicted):
        # Number of rows with the given (label, prediction) combination.
        return result_df.filter(
            (result_df.label == actual) & (result_df.prediction == predicted)
        ).count()

    true_positives = count_cell(1.0, 1.0)
    true_negatives = count_cell(0.0, 0.0)
    false_positives = count_cell(0.0, 1.0)
    false_negatives = count_cell(1.0, 0.0)

    print('true_positives', true_positives)
    print('true_negatives', true_negatives)
    print('false_positives', false_positives)
    print('false_negatives', false_negatives)

    # Each dict key is a predicted class (column); list entries follow the
    # actual-class order given by `index`.
    matrix = pd.DataFrame(
        {
            "Positive": [true_positives, false_positives],
            "Negative": [false_negatives, true_negatives],
        },
        index=["Positive", "Negative"],
        columns=["Positive", "Negative"],
    )
    matrix.columns.name = "Actual / Predicted"
    return matrix
