In [59]:
# pip install nltk
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')

# pip install spacy
# !python -m spacy download en_core_web_sm
# pip install gensim
# pip install wordcloud

# import libraries

In [60]:
# data handling
import pandas as pd
import numpy as np
import pyspark 

# nlp preprocessing
import nltk
import spacy
import gensim

from nltk.corpus import stopwords
# from nltk.tokenize import RegexpTokenizer
# from nltk.stem.wordnet import WordNetLemmatizer

from gensim.corpora import Dictionary
from gensim.utils import simple_preprocess

# topic modeling
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# data visualization
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud

In [None]:


import pyLDAvis.gensim_models 
from gensim.models import LdaModel

# misc
from pprint import pprint

# import warnings
# warnings.filterwarnings('ignore', category=DeprecationWarning)
# warnings.filterwarnings('ignore', category=UserWarning)
# warnings.filterwarnings('ignore', category=FutureWarning)

In [61]:
spark = pyspark.sql.SparkSession.builder.getOrCreate()

In [62]:
df = spark.read.json('../01_data_collection/tweets0.json')

In [63]:
df.printSchema()

root
 |-- contributors: string (nullable = true)
 |-- coordinates: struct (nullable = true)
 |    |-- coordinates: array (nullable = true)
 |    |    |-- element: double (containsNull = true)
 |    |-- type: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- display_text_range: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- entities: struct (nullable = true)
 |    |-- hashtags: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- indices: array (nullable = true)
 |    |    |    |    |-- element: long (containsNull = true)
 |    |    |    |-- text: string (nullable = true)
 |    |-- media: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- display_url: string (nullable = true)
 |    |    |    |-- expanded_url: string (nullable = true)
 |    |    |    |-- id: long (nullable = true)
 |    |    |    |-- id_str: string (nullable = true)
 |    |    |    |

In [64]:
df.createOrReplaceTempView('df')

In [9]:
# tweet metadata
tweets = spark.sql('''
                        SELECT
                            id_str as tweet_unique_id,
                            lang as tweet_lang,
                            full_text as tweet_text
                        FROM
                            df
                        WHERE 
                            retweeted_status.id_str IS NULL
                                ''')

In [10]:
tweets.show(5)

+-------------------+----------+--------------------+
|    tweet_unique_id|tweet_lang|          tweet_text|
+-------------------+----------+--------------------+
|1577132402400444417|        en|With risks from c...|
|1577132386592075776|        en|But climate chang...|
|1577132344271437824|        en|@WoodFootMagoo87 ...|
|1577132337519001603|        en|seen a few people...|
|1577132336113934336|        en|@priscian @robhon...|
+-------------------+----------+--------------------+
only showing top 5 rows



In [11]:
tweets.count()

2379

In [12]:
tweets_df = tweets.toPandas()

In [13]:
tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2379 entries, 0 to 2378
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   tweet_unique_id  2379 non-null   object
 1   tweet_lang       2379 non-null   object
 2   tweet_text       2379 non-null   object
dtypes: object(3)
memory usage: 55.9+ KB


In [14]:
tweets_df = tweets_df[tweets_df['tweet_lang'] == 'en']

In [15]:
tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2289 entries, 0 to 2378
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   tweet_unique_id  2289 non-null   object
 1   tweet_lang       2289 non-null   object
 2   tweet_text       2289 non-null   object
dtypes: object(3)
memory usage: 71.5+ KB


In [16]:
sample_tweets = tweets_df.copy()
data = sample_tweets['tweet_text'].values.tolist()

In [17]:
stop_words = set(stopwords.words('english'))
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [18]:
stop_words.add('http')
stop_words.add('https')
stop_words.add('co')
stop_words.add('com')

In [19]:
corpus = sample_tweets['tweet_text']
corpus[0]

"With risks from climate change accelerating, communities need to improve their resiliency. @LamarUniversity's Center for Resiliency made them a great partner on our Texas Gulf Coast urban integrated field lab with @UTAustin, @TAMU, @PVAMU &amp; @ORNL: https://t.co/c1lgwqt2y6 https://t.co/LtYLEzL2DT"

In [20]:
# text preprocessing function using spaCy.lemma_ and spaCy.pos_
def lemmatization(texts, allowed_postags=['NOUN','ADJ','VERB','ADV']):
    nlp = spacy.load('en_core_web_sm', disable=['parser','ner'])
    processed_texts = []
    for text in texts:
        doc = nlp(text)
        lemmatized_text = []
        for token in doc:
            if token.pos_ in allowed_postags:
                lemmatized_text.append(token.lemma_)
        final = ' '.join(lemmatized_text)
        processed_texts.append(final)
    return (processed_texts)

lemmatized_texts = lemmatization(corpus)
lemmatized_texts[0]

'risk climate change accelerate community need improve resiliency @lamaruniversity make great partner urban integrate field lab @ORNL https://t.co/c1lgwqt2y6 https://t.co/ltylezl2dt'

In [21]:
# additional text preprocessing function using gensim.utils.simple_preprocess
def gen_words(texts):
    final = []
    for text in texts:
        new = gensim.utils.simple_preprocess(text, deacc=True)
        final.append(new)
    return(final)

data_words = gen_words(lemmatized_texts)
data_words[0]

['risk',
 'climate',
 'change',
 'accelerate',
 'community',
 'need',
 'improve',
 'resiliency',
 'lamaruniversity',
 'make',
 'great',
 'partner',
 'urban',
 'integrate',
 'field',
 'lab',
 'ornl',
 'https',
 'co',
 'lgwqt',
 'https',
 'co',
 'ltylezl',
 'dt']

In [22]:
dictionary = Dictionary(data_words)

In [23]:
print('Number of documents:', len(corpus))
print('Number of unique words in initital documents:', len(dictionary))
# Filter out words that occur less than 10 documents, or more than 20% of the documents.
dictionary.filter_extremes(no_below=200, no_above=0.20)
print('Number of unique words after removing rare and common words:', len(dictionary))

Number of documents: 2289
Number of unique words in initital documents: 6449
Number of unique words after removing rare and common words: 4


In [None]:
# num_topics = 3

# lda_model = LdaModel(corpus=corpus,
#                      id2word=id2word,
#                      num_topics=num_topics,
#                      random_state=42,
#                      update_every=1,
#                      chunksize=100,
#                      passes=20,
#                      alpha='auto')


In [24]:
# transform processed words into str for wordcloud
strings = []

for doc in data_words:
    for token in doc:
        strings.append(token)

words = ' '.join(strings)

In [56]:
# create a wordcloud object
wordcloud = WordCloud(font_path='../05_misc/Trebuchet MS Bold.TTF', 
                      stopwords=stop_words, 
                      width=1920, height=1080, 
                      max_words=100, colormap='Blues')

wordcloud.generate(words)

plt.figure(figsize= (15,10))
plt.imshow(wordcloud)
plt.axis('off')
# plt.savefig('../04_data_visualizations/tweet_word_cloud.png')
plt.show();

In [None]:
# pprint(lda_model.print_topics())

In [None]:
# from collections import Counter
# import matplotlib.colors as mcolors

# topics = lda_model.show_topics(formatted=False)
# data_flat = [w for w_list in data_words for w in w_list]
# counter = Counter(data_flat)

# out = []
# for i, topic in topics:
#     for word, weight in topic:
#         out.append([word, i , weight, counter[word]])

# df = pd.DataFrame(out, columns=['word', 'topic_id', 'importance', 'word_count'])        

# # Plot Word Count and Weights of Topic Keywords
# fig, axes = plt.subplots(3, figsize=(16,10), sharey=True, dpi=160)
# cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]
# for i, ax in enumerate(axes.flatten()):
#     ax.bar(x='word', height="word_count", data=df.loc[df.topic_id==i, :], color=cols[i], width=0.5, alpha=0.3, label='Word Count')
#     ax_twin = ax.twinx()
#     ax_twin.bar(x='word', height="importance", data=df.loc[df.topic_id==i, :], color=cols[i], width=0.2, label='Weights')
#     ax.set_ylabel('Word Count', color=cols[i])
#     ax_twin.set_ylim(0, 0.1); ax.set_ylim(0, 15000)
#     ax.set_title('Topic: ' + str(i), color=cols[i], fontsize=16)
#     ax.tick_params(axis='y', left=False)
#     ax.set_xticklabels(df.loc[df.topic_id==i, 'word'], rotation=30, horizontalalignment= 'right')
#     ax.legend(loc='upper right'); ax_twin.legend(loc='center right')

# fig.tight_layout(w_pad=5)    
# fig.suptitle('Word Count and Importance of Topic Keywords', fontsize=22, y=1.05)  
# plt.savefig('../04_data_visualizations/tweet_keywords_by_topic.png', transparent=True)
# plt.show();

In [None]:
# pyLDAvis.enable_notebook()
# pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)

In [None]:
# # text preprocessing function using WordNetLemmatizer()

# from nltk.tokenize import RegexpTokenizer
# from nltk.stem.wordnet import WordNetLemmatizer

# def docs_preprocessor(docs):
#     tokenizer = RegexpTokenizer(r'\w+')
#     for idx in range(len(docs)):
#         docs[idx] = docs[idx].lower()  # Convert to lowercase.
#         docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.

#     # Remove numbers, but not words that contain numbers.
#     docs = [[token for token in doc if not token.isdigit()] for doc in docs]
    
#     # Remove words that are only one character.
#     docs = [[token for token in doc if len(token) > 3] for doc in docs]
    
#     # Lemmatize all words in documents.
#     lemmatizer = WordNetLemmatizer()
#     docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]
  
#     return docs

# docs = docs_preprocessor(corpus)
# docs

In [None]:
# from gensim.models import Phrases

# # Add bigrams and trigrams to docs (only ones that appear 10 times or more).
# bigram = Phrases(docs, min_count=10)
# trigram = Phrases(bigram[docs])

# for idx in range(len(docs)):
#     for token in bigram[docs[idx]]:
#         if '_' in token:
#             # Token is a bigram, add to document.
#             docs[idx].append(token)
#     for token in trigram[docs[idx]]:
#         if '_' in token:
#             # Token is a trigram, add to document.
#             docs[idx].append(token)

In [None]:
# from gensim.corpora import Dictionary

# dictionary = Dictionary(docs)

# corpus = [dictionary.doc2bow(doc) for doc in docs]

In [None]:
# from gensim.models import LdaModel

# # Set training parameters.
# num_topics = 4
# chunksize = 500 # size of the doc looked at every pass
# passes = 20 # number of passes through documents
# iterations = 400
# eval_every = 1  # Don't evaluate model perplexity, takes too much time.

# # Make a index to word dictionary.
# temp = dictionary[0]  # This is only to "load" the dictionary.
# id2word = dictionary.id2token

# %time 

# model = LdaModel(corpus=corpus, id2word=id2word, 
#                  chunksize=chunksize, alpha='auto', 
#                  eta='auto', iterations=iterations, 
#                  num_topics=num_topics, passes=passes, 
#                  eval_every=eval_every)

In [None]:
# import pyLDAvis.gensim_models 
# pyLDAvis.enable_notebook()
# pyLDAvis.gensim_models.prepare(model, corpus, dictionary)