#  Q1.Part-2 : TRAINING DATASET VOCABULARY EXPLORATION

# In [None]:
# CREATING THE SPARK SESSION
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType,IntegerType, FloatType
from pyspark.sql.functions import concat,lit
import csv
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import pyspark.sql.functions as F
from pyspark.sql.functions import avg, round


# INITIATE SPARK SESSION
# getOrCreate() returns the already-active session when one exists, so
# re-running this cell does not start a second session.
spark = (
    SparkSession.builder
    .appName("Python Spark Q1 Part2")
    .getOrCreate()
)

# CREATING DATA PATH
training_path = "s3://comp5349-snaz3253/Assignment_Data/train.tsv"

# English stop words from NLTK.
# Kept as a frozenset so that the per-token membership test performed in
# tokens_without_stopwords is a hash lookup instead of a linear list scan.
stop_words = frozenset(stopwords.words('english'))


# CREATING DATAFRAMES
# The file is read as tab-separated text with a header row; column types are
# inferred by Spark.
train_df = spark.read.csv(training_path, sep='\t', header=True, inferSchema="true")

# EXTRACTING USEFUL COLUMNS AND CONCATENATING THE SENTENCES
# NOTE: Spark's concat() yields NULL when any of its inputs is NULL, so the
# 'joined' column can be None for rows with a missing sentence.
train_df = train_df.select([
    'genre',
    concat(train_df.sentence1, lit(" "), train_df.sentence2).alias('joined'),
])

# CONVERT TO RDDS FOR TOKENIZATION
# Each Row becomes a plain [genre, joined] list for the mapper functions.
train_rdd = train_df.rdd.map(list)


#creating mapper functions

# Translation table that deletes every character in string.punctuation.
# Built once here instead of once per record.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def removePunctuationsFunct(record):
    """Remove ASCII punctuation from the text part of a (genre, text) record.

    Args:
        record: An indexable pair whose element 0 is the genre label and whose
            element 1 is the text. The text is converted with ``str()`` before
            cleaning; ``None`` (for example a NULL column value) is treated as
            an empty string so that it does not turn into the literal word
            "None" further down the pipeline.

    Returns:
        A ``(genre, cleaned_text)`` tuple where ``cleaned_text`` has every
        character of ``string.punctuation`` removed.
    """
    text = record[1]
    if text is None:
        return (record[0], '')
    # One pass over the string instead of one .replace() per punctuation mark.
    return (record[0], str(text).translate(_PUNCTUATION_TABLE))

def tokens_with_stopwords(record):
    """Tokenize the text of a (genre, text) record, keeping stop words.

    The text (element 1) is split with NLTK's ``word_tokenize``; only tokens
    made up entirely of alphabetic characters are kept, and each kept token is
    lower-cased.

    Returns:
        A ``(genre, tokens)`` tuple where ``tokens`` is a list of strings.
    """
    alphabetic_tokens = []
    for token in word_tokenize(record[1]):
        if token.isalpha():
            alphabetic_tokens.append(token.lower())
    return (record[0], alphabetic_tokens)


def tokens_without_stopwords(record):
    """Tokenize the text of a (genre, text) record and drop stop words.

    The text (element 1) is split with NLTK's ``word_tokenize``; tokens that
    are not purely alphabetic are discarded, the rest are lower-cased, and any
    token found in the module-level ``stop_words`` collection is removed.

    Returns:
        A ``(genre, tokens)`` tuple where ``tokens`` is a list of strings.
    """
    lowered = (token.lower() for token in word_tokenize(record[1]) if token.isalpha())
    kept = [token for token in lowered if token not in stop_words]
    return (record[0], kept)


from pyspark.sql.functions import col, countDistinct, explode, split

# Strip punctuation from the text of every (genre, text) record.
new_train_rdd = train_rdd.map(removePunctuationsFunct)

# Tokenize twice: once keeping stop words, once with them filtered out.
train_tokenized_with_stopwords = new_train_rdd.map(tokens_with_stopwords)
train_tokenized_without_stopwords = new_train_rdd.map(tokens_without_stopwords)

# Back to DataFrames. The tuple fields arrive as '_1' and '_2', so they are
# renamed to 'genre' and 'BOW' (the list of tokens).
df_with_stopwords = (
    spark.createDataFrame(train_tokenized_with_stopwords)
    .withColumnRenamed('_1', 'genre')
    .withColumnRenamed('_2', 'BOW')
)
df_without_stopwords = (
    spark.createDataFrame(train_tokenized_without_stopwords)
    .withColumnRenamed('_1', 'genre')
    .withColumnRenamed('_2', 'BOW')
)

# One row per (genre, word): explode the token list held in 'BOW'.
df_exploded_with_stopwords = df_with_stopwords.withColumn('BOW', explode('BOW'))
df_exploded_without_stopwords = df_without_stopwords.withColumn('BOW', explode('BOW'))

# For every distinct word, count how many different genres it appears in.
# Both results are cached because they are reused by several later actions.
genres_per_word = countDistinct("genre").alias("Common_Genre_Count")
result = df_exploded_with_stopwords.groupBy("BOW").agg(genres_per_word).cache()
result_without_stopwords = (
    df_exploded_without_stopwords.groupBy("BOW").agg(genres_per_word).cache()
)


# GENERATE AND DISPLAY THE STATISTICS

# --- Stop words kept ---------------------------------------------------------
# 'result' holds one row per distinct word, so its row count is the number of
# unique words in the training corpus.
total_words_with_stopwords = result.count()

# The words themselves are not needed here: count how many words share each
# Common_Genre_Count value.
x = (
    result.drop('BOW')
    .groupBy('Common_Genre_Count')
    .count()
    .withColumnRenamed('count', 'Number_of_Words')
)

# Share of the vocabulary in each group, as a percentage rounded to 2 decimals.
final_df_with_stopwords = x.withColumn(
    "Percentages",
    F.round(F.col("Number_of_Words") / total_words_with_stopwords * 100, 2),
)

# --- Stop words removed ------------------------------------------------------
total_words_without_stopwords = result_without_stopwords.count()

y = (
    result_without_stopwords.drop('BOW')
    .groupBy('Common_Genre_Count')
    .count()
    .withColumnRenamed('count', 'Number_of_Words')
)

final_df_without_stopwords = y.withColumn(
    "Percentages",
    F.round(F.col("Number_of_Words") / total_words_without_stopwords * 100, 2),
)


# Report for the first requirement (stop words kept).
print('REQUIREMENT 1: WITHOUT REMOVING STOP WORDS\n\n')
print('The number of unique words in the training data corpus are: ')
print(total_words_with_stopwords)
print('\nThe number of words existing in various genre combinations with their percentages are displayed as under:\n')
final_df_with_stopwords.sort('Common_Genre_Count').show()
print('***********************************************************************************************************')


# Report for the second requirement (stop words removed).
print('REQUIREMENT 2: AFTER REMOVING STOP WORDS\n\n')
print('The number of unique words in the training data corpus are: ')
print(total_words_without_stopwords)
print('\nThe number of words existing in various genre combinations with their percentages are displayed as under:\n')
final_df_without_stopwords.sort('Common_Genre_Count').show()