# import libraries

In [None]:
# data handling
import numpy as np
import pandas as pd
import pyspark
from pyspark import SparkContext

# nlp
import spacy
from nltk.corpus import stopwords
from nltk.corpus import stopwords as nltk_stopwords
# from nltk.tokenize import RegexpTokenizer
# from nltk.stem.wordnet import WordNetLemmatizer

# topic modeling

import gensim
import pyLDAvis.gensim_models
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.utils import simple_preprocess
# from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# data visualization
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# misc
from collections import Counter
from pprint import pprint

# import warnings
# warnings.filterwarnings('ignore', category=DeprecationWarning)
# warnings.filterwarnings('ignore', category=UserWarning)
# warnings.filterwarnings('ignore', category=FutureWarning)

In [None]:
# pip install nltk
# pip install vaderSentiment
# pip install gensim
# pip install wordcloud
# pip install pyldavis

# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('stopwords')

# !python -m spacy download en_core_web_sm

In [None]:
# Reuse the active SparkSession if one exists, otherwise start a new one,
# and keep a handle on its SparkContext.
spark = pyspark.sql.SparkSession.builder.getOrCreate()
sc = spark.sparkContext

In [None]:
# Read every JSON file in the data-collection folder into one Spark DataFrame.
# NOTE(review): the path is relative to the notebook's working directory.
df = spark.read.json('../01_data_collection/*.json')

In [None]:
# Show the schema Spark derived for the loaded JSON records.
df.printSchema()

In [None]:
# Register the DataFrame under the view name 'df' so the spark.sql query
# in the next cell can select from it.
df.createOrReplaceTempView('df')

In [None]:
# Tweet metadata: select id, language and full text from the 'df' view,
# keeping only rows whose retweeted_status.id_str is NULL
# (presumably original tweets rather than retweets -- confirm against the schema).
tweets = spark.sql('''
                        SELECT
                            id_str as tweet_unique_id,
                            lang as tweet_lang,
                            full_text as tweet_text
                        FROM
                            df
                        WHERE 
                            retweeted_status.id_str IS NULL
                                ''')

In [None]:
# Convert the Spark query result into a pandas DataFrame for the NLP steps.
# NOTE(review): toPandas() collects the whole result on the driver -- confirm
# the selection is small enough to fit in local memory.
tweets_df = tweets.toPandas()

In [None]:
# Summarise the pandas frame (columns, dtypes, non-null counts).
tweets_df.info()

In [None]:
# Keep only the tweets whose language tag is 'en'.
# The original row labels are kept (the index is not reset).
is_english = tweets_df['tweet_lang'] == 'en'
tweets_df = tweets_df[is_english]

In [None]:
# Work on an independent copy so later steps cannot modify tweets_df,
# and keep the tweet texts as a plain Python list as well.
sample_tweets = tweets_df.copy()
data = sample_tweets['tweet_text'].tolist()

In [None]:
# Build the stop-word list from the aliased NLTK corpus reader
# (`nltk_stopwords`, imported in the imports cell).
# The original read `stopwords.words(...)` and then rebound `stopwords` to the
# resulting list, so running the cell a second time raised AttributeError
# ('list' object has no attribute 'words'). Building the list in a single
# expression also keeps the cell idempotent (no repeated appends).
# The name `stopwords` is kept because the word-cloud cell passes it on.
stopwords = nltk_stopwords.words('english') + ['http', 'https', 'co']

In [None]:
# Raw tweet texts that feed the preprocessing functions below.
corpus = sample_tweets['tweet_text']
# Preview the first remaining tweet by position. The English-only filter keeps
# the original row labels, so label 0 may no longer exist and `corpus[0]`
# would raise KeyError; `.iloc[0]` always addresses the first row.
corpus.iloc[0]

In [None]:
# text preprocessing function using spaCy.lemma_ and spaCy.pos_
def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV'), nlp=None):
    """Lemmatise texts, keeping only tokens with an allowed part-of-speech tag.

    Parameters
    ----------
    texts : iterable of str
        Documents to process.
    allowed_postags : container of str
        Coarse POS tags (compared against ``token.pos_``) to keep. The default
        is a tuple rather than a list so the shared default cannot be mutated.
    nlp : optional
        An already-loaded pipeline exposing ``pipe(texts)``. When omitted,
        ``en_core_web_sm`` is loaded with the parser and NER disabled, exactly
        as before. Passing one in avoids reloading the model on every call.

    Returns
    -------
    list of str
        One string per input text, in input order: the lemmas of the kept
        tokens joined by single spaces (empty string if nothing was kept).
    """
    if nlp is None:
        nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

    # nlp.pipe streams the texts through the pipeline in batches instead of
    # invoking the full pipeline once per text.
    return [
        ' '.join(token.lemma_ for token in doc if token.pos_ in allowed_postags)
        for doc in nlp.pipe(texts)
    ]

# Lemmatise the whole corpus, then display the first processed tweet as a spot check.
lemmatized_texts = lemmatization(corpus)
lemmatized_texts[0]

In [None]:
# additional text preprocessing function using gensim.utils.simple_preprocess
def gen_words(texts):
    """Tokenise every text with gensim's ``simple_preprocess`` (``deacc=True``).

    Returns a list holding one token list per input text, in input order.
    """
    return [simple_preprocess(document, deacc=True) for document in texts]

# Tokenise the lemmatised tweets, then display the tokens of the first one.
data_words = gen_words(lemmatized_texts)
data_words[0]

In [None]:
# Build the gensim vocabulary (token <-> integer id) from the tokenised tweets.
dictionary = Dictionary(data_words)

In [None]:
vocab_size_before = len(dictionary)
print('Number of unique words in initital documents:', vocab_size_before)

# Prune the vocabulary in place: drop tokens that occur in fewer than 200
# documents or in more than 20% of the documents.
dictionary.filter_extremes(no_above=0.20, no_below=200)
vocab_size_after = len(dictionary)
print('Number of unique words after removing rare and common words:', vocab_size_after)

In [None]:
# Report the vocabulary size and the number of documents.
n_tokens, n_documents = len(dictionary), len(corpus)
print('Number of unique tokens: %d' % n_tokens)
print('Number of documents: %d' % n_documents)

In [None]:
num_topics = 3

# LdaModel needs (a) a bag-of-words corpus -- one list of (token_id, count)
# pairs per document -- and (b) an id -> word mapping. Previously `id2word`
# was never defined (NameError) and `corpus` still held the raw tweet strings.
# NOTE: from here on `corpus` is the bag-of-words corpus; the pyLDAvis cell
# further down passes this same `corpus` / `id2word` pair to prepare().
id2word = dictionary
corpus = [dictionary.doc2bow(tokens) for tokens in data_words]

lda_model = LdaModel(corpus=corpus,
                     id2word=id2word,
                     num_topics=num_topics,
                     random_state=42,   # fixed seed for reproducible topics
                     update_every=1,
                     chunksize=100,
                     passes=20,
                     alpha='auto')


In [None]:
# Flatten the per-tweet token lists into one flat token list, then join it
# into a single space-separated string for the word cloud.
strings = [token for tokens in data_words for token in tokens]
words = ' '.join(strings)

In [None]:
# Configure the word cloud: custom font, stop words excluded, at most 100 words.
wordcloud = WordCloud(
    font_path='../05_misc/Trebuchet MS Bold.TTF',
    stopwords=stopwords,
    width=1920,
    height=1080,
    background_color="black",
    max_words=100,
    contour_width=3,
    colormap='Blues',
    random_state=42,
)
wordcloud.generate(words)

# Draw the cloud on a single axes with the frame and ticks hidden.
fig, ax = plt.subplots(figsize=(15, 10))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title("100 Most Common Twitter Words", pad=14, weight='bold')
# plt.savefig('../04_data_visualizations/tweet_word_cloud.png')
plt.show();

In [None]:
# Pretty-print the keyword/weight summary of each learned topic.
pprint(lda_model.print_topics())

In [None]:
# Counter and mcolors are imported in the notebook's imports cell.

# For every topic, collect its top keywords with their LDA weight and their
# raw frequency across the preprocessed corpus.
topics = lda_model.show_topics(formatted=False)
data_flat = [w for w_list in data_words for w in w_list]
counter = Counter(data_flat)

out = [
    [word, topic_id, weight, counter[word]]
    for topic_id, topic in topics
    for word, weight in topic
]

# Named `topic_words_df` (not `df`) so the Spark DataFrame loaded at the top of
# the notebook is not overwritten and earlier cells stay re-runnable.
topic_words_df = pd.DataFrame(out, columns=['word', 'topic_id', 'importance', 'word_count'])

# Plot Word Count and Weights of Topic Keywords -- one row per returned topic
# (squeeze=False keeps `axes` an array even when there is a single topic).
fig, axes = plt.subplots(len(topics), figsize=(16, 10), sharey=True, dpi=160, squeeze=False)
cols = list(mcolors.TABLEAU_COLORS.values())
for (topic_id, _), ax in zip(topics, axes.flatten()):
    color = cols[topic_id % len(cols)]  # wrap around if there are more topics than colours
    topic_rows = topic_words_df.loc[topic_words_df.topic_id == topic_id, :]

    ax.bar(x='word', height="word_count", data=topic_rows, color=color, width=0.5, alpha=0.3, label='Word Count')
    # Second y-axis: keyword weights live on a much smaller scale than counts.
    ax_twin = ax.twinx()
    ax_twin.bar(x='word', height="importance", data=topic_rows, color=color, width=0.2, label='Weights')

    ax.set_ylabel('Word Count', color=color)
    # Fixed axis limits so the panels are directly comparable.
    ax_twin.set_ylim(0, 0.1)
    ax.set_ylim(0, 15000)
    ax.set_title('Topic: ' + str(topic_id), color=color, fontsize=16)
    ax.tick_params(axis='y', left=False)
    # Rotate the existing category labels instead of calling set_xticklabels()
    # without fixed tick positions, which matplotlib warns about.
    ax.tick_params(axis='x', labelrotation=30)
    plt.setp(ax.get_xticklabels(), horizontalalignment='right')
    ax.legend(loc='upper right')
    ax_twin.legend(loc='center right')

fig.tight_layout(w_pad=5)
fig.suptitle('Word Count and Importance of Topic Keywords', fontsize=22, y=1.05)
# bbox_inches='tight' keeps the suptitle (placed above the figure at y=1.05)
# inside the saved image instead of clipping it.
plt.savefig('../04_data_visualizations/tweet_keywords_by_topic.png', transparent=True, bbox_inches='tight')
plt.show();

In [None]:
# Render the interactive topic browser inline in the notebook.
# NOTE(review): prepare() is given `corpus` and `id2word` as they stand at this
# point; presumably they must be the bag-of-words corpus and the dictionary
# the model was trained with -- confirm.
pyLDAvis.enable_notebook()
lda_vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
lda_vis

In [None]:
# # text preprocessing function using WordNetLemmatizer()

# from nltk.tokenize import RegexpTokenizer
# from nltk.stem.wordnet import WordNetLemmatizer

# def docs_preprocessor(docs):
#     tokenizer = RegexpTokenizer(r'\w+')
#     for idx in range(len(docs)):
#         docs[idx] = docs[idx].lower()  # Convert to lowercase.
#         docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.

#     # Remove numbers, but not words that contain numbers.
#     docs = [[token for token in doc if not token.isdigit()] for doc in docs]
    
#     # Remove words that are three characters or shorter.
#     docs = [[token for token in doc if len(token) > 3] for doc in docs]
    
#     # Lemmatize all words in documents.
#     lemmatizer = WordNetLemmatizer()
#     docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]
  
#     return docs

# docs = docs_preprocessor(corpus)
# docs

In [None]:
# from gensim.models import Phrases

# # Add bigrams and trigrams to docs (only ones that appear 10 times or more).
# bigram = Phrases(docs, min_count=10)
# trigram = Phrases(bigram[docs])

# for idx in range(len(docs)):
#     for token in bigram[docs[idx]]:
#         if '_' in token:
#             # Token is a bigram, add to document.
#             docs[idx].append(token)
#     for token in trigram[docs[idx]]:
#         if '_' in token:
#             # Token is a trigram, add to document.
#             docs[idx].append(token)

In [None]:
# from gensim.corpora import Dictionary

# dictionary = Dictionary(docs)

# corpus = [dictionary.doc2bow(doc) for doc in docs]

In [None]:
# from gensim.models import LdaModel

# # Set training parameters.
# num_topics = 4
# chunksize = 500 # number of documents processed in each training chunk
# passes = 20 # number of passes through documents
# iterations = 400
# eval_every = 1  # NOTE: 1 evaluates model perplexity after every update, which is slow; use None to skip evaluation.

# # Make a index to word dictionary.
# temp = dictionary[0]  # This is only to "load" the dictionary.
# id2word = dictionary.id2token

# %time 

# model = LdaModel(corpus=corpus, id2word=id2word, 
#                  chunksize=chunksize, alpha='auto', 
#                  eta='auto', iterations=iterations, 
#                  num_topics=num_topics, passes=passes, 
#                  eval_every=eval_every)

In [None]:
# import pyLDAvis.gensim_models 
# pyLDAvis.enable_notebook()
# pyLDAvis.gensim_models.prepare(model, corpus, dictionary)