In [None]:
# findspark.init() makes the local Spark installation importable; it has to
# run before the first `pyspark` import further down.
import findspark
findspark.init()

In [None]:
from pyspark.sql import SparkSession

In [None]:
# Start (or reuse) a Spark session in local mode on all available cores, and
# keep a handle on its SparkContext for the RDD API used in the cells below.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('F1WordCount')
    .getOrCreate()
)
sc = spark.sparkContext

In [None]:
import os

# Directory holding the article text files. The original location stays the
# default; set the ARTICLES_DIR environment variable to run the notebook on
# another machine without editing this cell.
ARTICLES_DIR = os.environ.get('ARTICLES_DIR', r'D:\Downloads\HW2_ResearchSeminar\Articles')

# Each RDD element is one line of text read from the files under ARTICLES_DIR.
articles_rdd = sc.textFile(ARTICLES_DIR)

In [None]:
# Number of text lines loaded; count() is an action, so this also forces Spark
# to actually read the input.
articles_rdd.count()

### Clean the data by removing punctuation and converting to lowercase

In [None]:
def clean_lower_str(x):
    """Return ``x`` lowercased with punctuation stripped.

    Every '�' character is turned into a single space, while each punctuation
    character is deleted outright, so the text on either side of it is joined
    (``"well-known"`` becomes ``"wellknown"``).
    """
    removable = '!"#$%&\'()*+,./:;<=>?@[\\]^_`{|}~-'
    # One translation table does both jobs: '�' -> ' ', punctuation -> deleted.
    table = str.maketrans('�', ' ', removable)
    return x.lower().translate(table)

In [None]:
# Normalise every line with clean_lower_str. Note that this rebinds
# `articles_rdd` to the cleaned RDD, so the raw lines are no longer reachable
# under this name once the cell has run.
articles_rdd = articles_rdd.map(clean_lower_str)

In [None]:
# Preview the first 10 cleaned lines.
articles_rdd.take(10)

### Splitting sentences into a list of words, excluding whitespace

In [None]:
# Tokenise each cleaned line and keep only purely alphabetic words.
# - split() with no argument splits on any run of whitespace (spaces, tabs,
#   non-breaking spaces, ...) and never yields empty strings. Words separated
#   by something other than a plain space are therefore no longer glued
#   together and then silently discarded by the isalpha() check.
# - ''.isalpha() is already False, so no separate empty-string filter is
#   needed, and the result of isalpha() is used directly as the predicate.
articles_rdd_words = (
    articles_rdd
    .flatMap(lambda sent: sent.split())
    .filter(lambda w: w.isalpha())
)

In [None]:
# Preview the first 10 tokens.
articles_rdd_words.take(10)

### Excluding stopwords

In [None]:
import nltk

In [None]:
# Fetch the NLTK 'stopwords' corpus that nltk.corpus.stopwords reads from.
nltk.download('stopwords')

In [None]:
# The corpus reader is imported under an alias so that the name `stopwords`
# refers to one thing only: the collection of English stopwords used by the
# filter in the next cell.
from nltk.corpus import stopwords as nltk_stopwords

# A frozenset gives constant-time membership tests; with the plain list every
# `w not in stopwords` check was a linear scan, repeated for every word.
stopwords = frozenset(nltk_stopwords.words('english'))

In [None]:
# Drop every token that appears in the English stopword collection. This
# rebinds `articles_rdd_words` to the filtered RDD.
articles_rdd_words = articles_rdd_words.filter(lambda w: w not in stopwords)

### Counting how many times each word occurs in all articles

In [None]:
# Pair each word with a count of 1 so the counts can be summed per word.
articles_word_count = articles_rdd_words.map(lambda w: (w, 1))

In [None]:
# Sum the 1s per word, flip each pair to (count, word) so the count becomes
# the key, then sort by that key from highest to lowest.
articles_word_count_rbk = (
    articles_word_count
    .reduceByKey(lambda left, right: left + right)
    .map(lambda pair: (pair[1], pair[0]))
    .sortByKey(ascending=False)
)

In [None]:
# The first 10 (count, word) tuples of the descending sort, i.e. the most
# frequent words.
articles_word_count_rbk.take(10)

In [None]:
# Turn the (count, word) tuples into a Spark DataFrame and name its two
# columns accordingly, then print the first 10 rows.
count_df = spark.createDataFrame(articles_word_count_rbk).toDF('count', 'word')
count_df.show(10)

## Visualizations of data

In [None]:
# Bar chart of the 10 most frequent words.
# The top rows are selected in Spark (orderBy + limit) before converting to
# pandas, so only 10 rows reach the driver instead of the whole DataFrame, and
# the result no longer depends on the DataFrame's incidental row order.
# The final ascending sort makes barh() draw the most frequent word at the top.
(
    count_df
    .orderBy('count', ascending=False)
    .limit(10)
    .toPandas()
    .sort_values(by='count')
    .plot.barh(
        x='word', y='count', color='blue', figsize=(8, 5), legend=False,
        title='Most frequent words in articles',
    )
);

In [None]:
# Bar chart of the 20 most frequent words.
# The top rows are selected in Spark (orderBy + limit) before converting to
# pandas, so only 20 rows reach the driver instead of the whole DataFrame, and
# the result no longer depends on the DataFrame's incidental row order.
# The final ascending sort makes barh() draw the most frequent word at the top.
(
    count_df
    .orderBy('count', ascending=False)
    .limit(20)
    .toPandas()
    .sort_values(by='count')
    .plot.barh(
        x='word', y='count', color='red', figsize=(8, 5), legend=False,
        title='Top-20 frequent words',
    )
);

### Word Cloud

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud, ImageColorGenerator
from PIL import Image
import pandas as pd
import numpy as np

In [None]:
# Build the word-cloud input: indexing by 'word' and transposing leaves a
# single 'count' row whose columns are the words, so to_dict('records')
# returns a one-element list containing a {word: count} dict.
frequencies = count_df.toPandas().set_index('word').T.to_dict('records')

In [None]:
# random_state seeds the word cloud's random number generator, so re-running
# the notebook reproduces the same image instead of a new random arrangement.
wordcloud = WordCloud(background_color = 'white', max_words = 200, random_state = 42)

# `frequencies` is a one-element list holding the {word: count} dict, so
# dict(*frequencies) yields a plain copy of that mapping.
wordcloud.generate_from_frequencies(dict(*frequencies))

plt.figure(figsize = (10, 8))
plt.imshow(wordcloud)
# Pixel-coordinate ticks carry no meaning for a word cloud, so hide the axes.
plt.axis('off');