In [1]:
import numpy as np
import re
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import gensim
import pandas as pd 
import time

In [2]:
def remove_punctuation(all_docs):
    """Strip punctuation from every document.

    Hyphens are turned into spaces first, so the two halves of a hyphenated
    word stay separate. Every remaining character that is neither a word
    character nor whitespace is then deleted.

    Args:
        all_docs: Iterable of document strings.

    Returns:
        list[str]: The cleaned documents, in input order.
    """
    not_word_or_space = re.compile(r'[^\w\s]')
    return [not_word_or_space.sub('', text.replace("-", " ")) for text in all_docs]


#def remove_names(all_docs):
#    no_name_docs = []
#    for doc in all_docs:
#        no_name_docs.append(' '.join([w for w, t in pos_tag(doc.split()) if t != 'NNP' and t != 'NNPS']))
#    return no_name_docs


def tokenizer(all_docs):
    """Split every document into tokens with NLTK's ``word_tokenize``.

    Args:
        all_docs: Iterable of document strings.

    Returns:
        list: One token sequence per document, in input order.
    """
    return [word_tokenize(text) for text in all_docs]


def lemmatizer(all_docs, pos="v"):
    """Lemmatize the alphabetic tokens of every document with WordNet.

    Tokens for which ``str.isalpha()`` is false (numbers, mixed
    letter/digit tokens, ...) are dropped rather than lemmatized.

    Args:
        all_docs: Iterable of token sequences, one per document.
        pos: WordNet part-of-speech tag passed to every ``lemmatize`` call.
            Defaults to ``"v"``, i.e. every token is lemmatized as a verb,
            which is what this function always did before the parameter
            existed.

    Returns:
        list[list[str]]: Lemmatized tokens per document, in input order.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    return [
        [wordnet_lemmatizer.lemmatize(token, pos) for token in doc if token.isalpha()]
        for doc in all_docs
    ]

def untokenizer(all_docs):
    """Join each document's tokens back into one space-separated string.

    Args:
        all_docs: Iterable of token sequences, one per document.

    Returns:
        list[str]: One string per document, in input order.
    """
    return [" ".join(tokens) for tokens in all_docs]


def fetch_stop_words():
    """Build the stop-word list used when vectorizing.

    Returns:
        list[str]: NLTK's English stop words followed by the extra entries
        'rt', 'get', 'let' and 'dont'.
    """
    combined = list(stopwords.words('english'))
    # Project-specific additions on top of the NLTK list.
    combined.extend(['rt', 'get', 'let', 'dont'])
    return combined


def create_dtm(all_docs, stopwords, ngram):
    """Fit a ``CountVectorizer`` on the documents and build the document-term matrix.

    Args:
        all_docs: Iterable of document strings.
        stopwords: Stop words handed to the vectorizer's ``stop_words`` option.
        ngram: Upper bound of the n-gram range; the lower bound is always 1.

    Returns:
        tuple: ``(vectorizer, dtm)`` - the fitted vectorizer and the matrix
        returned by its ``fit_transform``.
    """
    # min_df=0.0 and max_df=1.0 place no document-frequency limit on terms.
    vectorizer_options = {
        "lowercase": True,
        "min_df": 0.0,
        "max_df": 1.0,
        "ngram_range": (1, ngram),
        "stop_words": stopwords,
    }
    vectorizer = CountVectorizer(**vectorizer_options)
    dtm = vectorizer.fit_transform(all_docs)
    return vectorizer, dtm


#def tfidf_transformer(dtm):
#    tf_transformer = TfidfTransformer()
#    tfidf = tf_transformer.fit_transform(dtm)
#    return tf_transformer, tfidf


#def generate_sentence_vector(tokens, model, vectorizer, tfidf_dense):
#    vector = np.zeros(model.vector_size)
#    for token in tokens:
#        if token in model.wv.vocab and token in vectorizer.vocabulary_:
#            vector = vector + model.wv[token] * tfidf_dense[0, vectorizer.vocabulary_[token]]
#    return vector


def list_sample(list, n=5):
    """Print the first ``n`` items of a collection, one per line.

    Each line has the form ``<position> <item>``, with positions starting
    at 0. Fewer lines are printed when the collection holds fewer than
    ``n`` items; nothing is printed when ``n`` is zero or negative.

    Args:
        list: Any iterable (list, NumPy array, pandas Series, ...). The name
            shadows the builtin but is kept because callers pass it by
            keyword.
        n: Maximum number of items to print.

    Returns:
        None
    """
    # Iterate positionally instead of indexing with list[idx]: integer
    # indexing on a pandas Series is label-based and raises KeyError when
    # the index is not 0..n-1 (e.g. after filtering rows).
    for idx, item in zip(range(n), list):
        print(idx, item)

In [3]:
# When True, each pipeline stage prints a short preview of its result.
verbose = True

# Read the tweet export and keep only its 'text_clean' column.
data = pd.read_csv('TwitterData_latest.csv')
all_docs = data['text_clean']

if verbose:
    list_sample(all_docs)



0    Mostly agree. Prices are highly elastic. At $25,000 M3/MY would have ginormous, negative margin sales. Uptake on the 0% offer was likely huge, the offer had to be withdrawn. If TSLA knew how to efficiently build cars, they would be successful.
1 RT : Charting my list of potential leaders, a 🧵: Charts:  $AEHR $AFRM $AMBA $AMD $ASAN $BE* $COIN $LC $MQ* $NET…
2 RT : I almost forgot! Elon Musk’s three-year time out from being chairman of Tesla ends on Nov 7. Time flies when you’re c…
3 RT : In case you were wondering: Yes, the muxsan ccs modification does work at Tesla superchargers. The connector and cable are a…
4 Then the vulture arrived. Bawumia is eating every bit of it with loud noises all over, yet with little investments compared to the NDC administration. EDISON n TESLA story.


In [4]:
# Strip punctuation from every document (hyphens become spaces)
no_punctuation_docs = remove_punctuation(all_docs)

if verbose:
    list_sample(no_punctuation_docs)

0    Mostly agree Prices are highly elastic At 25000 M3MY would have ginormous negative margin sales Uptake on the 0 offer was likely huge the offer had to be withdrawn If TSLA knew how to efficiently build cars they would be successful
1 RT  Charting my list of potential leaders a  Charts  AEHR AFRM AMBA AMD ASAN BE COIN LC MQ NET
2 RT  I almost forgot Elon Musks three year time out from being chairman of Tesla ends on Nov 7 Time flies when youre c
3 RT  In case you were wondering Yes the muxsan ccs modification does work at Tesla superchargers The connector and cable are a
4 Then the vulture arrived Bawumia is eating every bit of it with loud noises all over yet with little investments compared to the NDC administration EDISON n TESLA story


In [5]:
# ------------------------------------- TOKENIZE -------------------------------------------------------------------

# Split each punctuation-free document into a list of tokens
tokenized_docs = tokenizer(no_punctuation_docs)

if verbose:
    list_sample(tokenized_docs)

0 ['Mostly', 'agree', 'Prices', 'are', 'highly', 'elastic', 'At', '25000', 'M3MY', 'would', 'have', 'ginormous', 'negative', 'margin', 'sales', 'Uptake', 'on', 'the', '0', 'offer', 'was', 'likely', 'huge', 'the', 'offer', 'had', 'to', 'be', 'withdrawn', 'If', 'TSLA', 'knew', 'how', 'to', 'efficiently', 'build', 'cars', 'they', 'would', 'be', 'successful']
1 ['RT', 'Charting', 'my', 'list', 'of', 'potential', 'leaders', 'a', 'Charts', 'AEHR', 'AFRM', 'AMBA', 'AMD', 'ASAN', 'BE', 'COIN', 'LC', 'MQ', 'NET']
2 ['RT', 'I', 'almost', 'forgot', 'Elon', 'Musks', 'three', 'year', 'time', 'out', 'from', 'being', 'chairman', 'of', 'Tesla', 'ends', 'on', 'Nov', '7', 'Time', 'flies', 'when', 'youre', 'c']
3 ['RT', 'In', 'case', 'you', 'were', 'wondering', 'Yes', 'the', 'muxsan', 'ccs', 'modification', 'does', 'work', 'at', 'Tesla', 'superchargers', 'The', 'connector', 'and', 'cable', 'are', 'a']
4 ['Then', 'the', 'vulture', 'arrived', 'Bawumia', 'is', 'eating', 'every', 'bit', 'of', 'it', 'with', '

In [6]:
# ------------------------------------- LEMMATIZE ------------------------------------------------------------------

# Lemmatize the alphabetic tokens; non-alphabetic tokens are dropped here
lemmatized_docs = lemmatizer(tokenized_docs)

if verbose:
    list_sample(lemmatized_docs)

0 ['Mostly', 'agree', 'Prices', 'be', 'highly', 'elastic', 'At', 'would', 'have', 'ginormous', 'negative', 'margin', 'sales', 'Uptake', 'on', 'the', 'offer', 'be', 'likely', 'huge', 'the', 'offer', 'have', 'to', 'be', 'withdraw', 'If', 'TSLA', 'know', 'how', 'to', 'efficiently', 'build', 'cars', 'they', 'would', 'be', 'successful']
1 ['RT', 'Charting', 'my', 'list', 'of', 'potential', 'leaders', 'a', 'Charts', 'AEHR', 'AFRM', 'AMBA', 'AMD', 'ASAN', 'BE', 'COIN', 'LC', 'MQ', 'NET']
2 ['RT', 'I', 'almost', 'forget', 'Elon', 'Musks', 'three', 'year', 'time', 'out', 'from', 'be', 'chairman', 'of', 'Tesla', 'end', 'on', 'Nov', 'Time', 'fly', 'when', 'youre', 'c']
3 ['RT', 'In', 'case', 'you', 'be', 'wonder', 'Yes', 'the', 'muxsan', 'ccs', 'modification', 'do', 'work', 'at', 'Tesla', 'superchargers', 'The', 'connector', 'and', 'cable', 'be', 'a']
4 ['Then', 'the', 'vulture', 'arrive', 'Bawumia', 'be', 'eat', 'every', 'bite', 'of', 'it', 'with', 'loud', 'noise', 'all', 'over', 'yet', 'with', 

In [7]:
# Join the lemmatized tokens back into one string per document,
# which is the input format the vectorizer step expects
untokenized_docs = untokenizer(lemmatized_docs)

if verbose:
    list_sample(untokenized_docs)

0 Mostly agree Prices be highly elastic At would have ginormous negative margin sales Uptake on the offer be likely huge the offer have to be withdraw If TSLA know how to efficiently build cars they would be successful
1 RT Charting my list of potential leaders a Charts AEHR AFRM AMBA AMD ASAN BE COIN LC MQ NET
2 RT I almost forget Elon Musks three year time out from be chairman of Tesla end on Nov Time fly when youre c
3 RT In case you be wonder Yes the muxsan ccs modification do work at Tesla superchargers The connector and cable be a
4 Then the vulture arrive Bawumia be eat every bite of it with loud noise all over yet with little investments compare to the NDC administration EDISON n TESLA story


In [8]:
# ------------------------------------- STOPWORDS ------------------------------------------------------------------

# Build the stop-word list: NLTK's English stop words plus the custom
# additions defined in fetch_stop_words(), then print it for inspection
stop_words = fetch_stop_words()
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [9]:
# ------------------------------------- VECTORIZE ------------------------------------------------------------------
#untokenized_docs = no_punctuation_docs 
# Vectorize words: unigram counts with the stop words removed
vectorizer, dtm = create_dtm(untokenized_docs, stop_words, 1)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

if verbose:
    print(vectorizer)
    list_sample(list=feature_names, n=50)
    print(dtm.shape)

CountVectorizer(min_df=0.0,
                stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
                            'ourselves', 'you', "you're", "you've", "you'll",
                            "you'd", 'your', 'yours', 'yourself', 'yourselves',
                            'he', 'him', 'his', 'himself', 'she', "she's",
                            'her', 'hers', 'herself', 'it', "it's", 'its',
                            'itself', ...])
0 aa
1 aaa
2 aaaaaaaaa
3 aaaaaaaaaaaaaaaaaa
4 aaaaaaah
5 aaaaand
6 aaaah
7 aaaand
8 aaahhh
9 aabb
10 aabout
11 aabv
12 aadvice
13 aahh
14 aaj
15 aal
16 aam
17 aamazon
18 aampe
19 aampm
20 aamzon
21 aand
22 aang
23 aap
24 aapex
25 aapl
26 aaplampamzn
27 aaplamzn
28 aapldisba
29 aaple
30 aaplo
31 aapls
32 aapple
33 aapt
34 aar
35 aare
36 aaron
37 aaronrogers
38 aary
39 aas
40 aatma
41 aattracctive
42 aattractiv
43 aav
44 aava
45 aave
46 aaww
47 aaya
48 aayengi
49 ab
(172143, 43151)


In [10]:
# ------------------------------------- TFIDF ----------------------------------------------------------------------

# tfidf transformation
#tf_transformer, tfidf = tfidf_transformer(dtm)

#if verbose:
#    print(tf_transformer)
#    print(tfidf.shape)


In [11]:
# ------------------------------------- LDA SETUP ------------------------------------------------------------------

# Map every row of the document-term matrix back to the vocabulary terms it
# contains, so the LDA input reflects the vectorizer's lower-casing and
# stop-word removal.
# NOTE(review): inverse_transform reports which terms are present, not how
# often - confirm that presence-only documents are intended for LDA.
lda_docs = vectorizer.inverse_transform(dtm)

if verbose:
    list_sample(lda_docs)

0 ['mostly' 'agree' 'prices' 'highly' 'elastic' 'would' 'ginormous'
 'negative' 'margin' 'sales' 'uptake' 'offer' 'likely' 'huge' 'withdraw'
 'tsla' 'know' 'efficiently' 'build' 'cars' 'successful']
1 ['charting' 'list' 'potential' 'leaders' 'charts' 'aehr' 'afrm' 'amba'
 'amd' 'asan' 'coin' 'lc' 'mq' 'net']
2 ['almost' 'forget' 'elon' 'musks' 'three' 'year' 'time' 'chairman' 'tesla'
 'end' 'nov' 'fly' 'youre']
3 ['tesla' 'case' 'wonder' 'yes' 'muxsan' 'ccs' 'modification' 'work'
 'superchargers' 'connector' 'cable']
4 ['tesla' 'vulture' 'arrive' 'bawumia' 'eat' 'every' 'bite' 'loud' 'noise'
 'yet' 'little' 'investments' 'compare' 'ndc' 'administration' 'edison'
 'story']


In [12]:
# Build the gensim id <-> term mapping from the per-document term arrays
dictionary = gensim.corpora.Dictionary(lda_docs)

# Convert every document to bag-of-words form: a list of (term_id, count) pairs
doc_term_matrix = list(map(dictionary.doc2bow, lda_docs))

if verbose:
    list_sample(doc_term_matrix)

0 [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1)]
1 [(21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1)]
2 [(35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1)]
3 [(43, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1)]
4 [(43, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1), (67, 1), (68, 1), (69, 1), (70, 1), (71, 1), (72, 1), (73, 1)]


In [13]:
# ------------------------------------- MODEL BUILDING -------------------------------------------------------------

start = time.time()

# LDA model building
# LDA training is stochastic; a fixed random_state makes the learned topics
# reproducible across kernel restarts and re-runs of this cell.
lda = gensim.models.ldamodel.LdaModel(
                                corpus=doc_term_matrix,
                                num_topics=10,
                                id2word=dictionary,
                                random_state=42
                                )
end = time.time()
# Wall-clock training time in seconds
print(end-start)
# Compute Coherence Score using c_v
#lda_cv = CoherenceModel(model=lda, corpus=doc_term_matrix, texts=lda_docs, dictionary=dictionary,
#                        coherence='c_v', processes=1)
#coherence_list = lda_cv.get_coherence_per_topic()
#topic_coherence = np.asarray(coherence_list)

27.58522081375122


In [14]:
# Show the 10 highest-weighted words of every topic. num_topics=-1 means
# "all topics", so this stays in sync with the num_topics used for training
# instead of hard-coding a larger count.
lda.print_topics(num_topics=-1, num_words=10)

[(0,
  '0.043*"tsla" + 0.024*"earn" + 0.021*"aapl" + 0.020*"eth" + 0.017*"official" + 0.017*"lcid" + 0.014*"amzn" + 0.014*"nft" + 0.014*"fb" + 0.013*"market"'),
 (1,
  '0.086*"tesla" + 0.023*"elon" + 0.023*"musk" + 0.017*"like" + 0.011*"see" + 0.011*"make" + 0.011*"happen" + 0.011*"want" + 0.010*"retweet" + 0.010*"work"'),
 (2,
  '0.081*"tesla" + 0.062*"giveaway" + 0.055*"amp" + 0.054*"doge" + 0.045*"announce" + 0.042*"winner" + 0.042*"follow" + 0.041*"check" + 0.041*"hours" + 0.031*"person"'),
 (3,
  '0.069*"tesla" + 0.031*"one" + 0.026*"cars" + 0.024*"buy" + 0.017*"call" + 0.016*"price" + 0.016*"model" + 0.016*"electric" + 0.014*"charge" + 0.013*"hertz"'),
 (4,
  '0.080*"stock" + 0.074*"tesla" + 0.058*"billion" + 0.052*"sell" + 0.042*"world" + 0.041*"twitter" + 0.036*"right" + 0.031*"musk" + 0.024*"solve" + 0.023*"elon"'),
 (5,
  '0.114*"tsla" + 0.032*"stock" + 0.021*"new" + 0.018*"tesla" + 0.018*"hit" + 0.017*"long" + 0.013*"today" + 0.013*"go" + 0.012*"value" + 0.012*"short"'),
 (6