In [20]:
import numpy as np
import re
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import gensim
import pandas as pd 
import time

In [5]:
def remove_punctuation(all_docs):
    """Strip punctuation from every document.

    Hyphens are turned into spaces first, so hyphenated words split into
    separate words. Every remaining character that is neither a word
    character nor whitespace is then deleted.

    Parameters
    ----------
    all_docs : iterable of str
        Documents to clean.

    Returns
    -------
    list of str
        Cleaned documents, in input order.
    """
    not_word_or_space = r'[^\w\s]'
    return [re.sub(not_word_or_space, '', text.replace("-", " ")) for text in all_docs]


def remove_names(all_docs):
    """Drop proper nouns from every document.

    Each document is split on whitespace and part-of-speech tagged with
    ``pos_tag``; words tagged ``NNP`` or ``NNPS`` are removed and the
    remaining words are re-joined with single spaces.

    Parameters
    ----------
    all_docs : iterable of str
        Documents to filter.

    Returns
    -------
    list of str
        Documents without the words tagged as proper nouns, in input order.
    """
    proper_noun_tags = ('NNP', 'NNPS')
    filtered_docs = []
    for text in all_docs:
        kept_words = [word for word, tag in pos_tag(text.split()) if tag not in proper_noun_tags]
        filtered_docs.append(' '.join(kept_words))
    return filtered_docs


def tokenizer(all_docs):
    """Tokenize every document with ``word_tokenize``.

    Parameters
    ----------
    all_docs : iterable of str
        Documents to tokenize.

    Returns
    -------
    list of list of str
        One token list per document, in input order.
    """
    return [word_tokenize(text) for text in all_docs]


def lemmatizer(all_docs):
    """Lemmatize the alphabetic tokens of every document as verbs.

    Tokens for which ``str.isalpha()`` is false (numbers, mixed
    alphanumerics, ...) are dropped. The remaining tokens are passed to
    ``WordNetLemmatizer.lemmatize`` with part of speech ``"v"``.

    Parameters
    ----------
    all_docs : iterable of iterable of str
        Tokenized documents.

    Returns
    -------
    list of list of str
        Lemmatized token lists, in input order.
    """
    # A single lemmatizer instance is shared by all documents.
    lemmatize = WordNetLemmatizer().lemmatize
    return [
        [lemmatize(tok, "v") for tok in tokens if tok.isalpha()]
        for tokens in all_docs
    ]

def untokenizer(all_docs):
    """Join each token list back into a single space-separated string.

    Parameters
    ----------
    all_docs : iterable of iterable of str
        Tokenized documents.

    Returns
    -------
    list of str
        One string per document, in input order.
    """
    return [" ".join(tokens) for tokens in all_docs]


def fetch_stop_words():
    """Return the English stop-word list from the NLTK ``stopwords`` corpus."""
    return stopwords.words('english')


def create_dtm(all_docs, stopwords, ngram):
    """Fit a ``CountVectorizer`` and build the document-term matrix.

    Parameters
    ----------
    all_docs : iterable of str
        Documents to vectorize (text is lower-cased by the vectorizer).
    stopwords : list of str
        Words to exclude from the vocabulary. Inside this function the name
        shadows the module-level ``nltk.corpus.stopwords`` import.
    ngram : int
        Upper bound of the n-gram range; n-grams of size 1..``ngram`` are used.

    Returns
    -------
    tuple
        ``(vectorizer, dtm)``: the fitted vectorizer and the matrix returned
        by its ``fit_transform``.
    """
    # min_df=0.0 / max_df=1.0 apply no document-frequency filtering.
    count_vectorizer = CountVectorizer(
        lowercase=True,
        min_df=0.0,
        max_df=1.0,
        ngram_range=(1, ngram),
        stop_words=stopwords,
    )
    term_counts = count_vectorizer.fit_transform(all_docs)
    return count_vectorizer, term_counts


def tfidf_transformer(dtm):
    """Fit a default ``TfidfTransformer`` on a document-term matrix.

    Parameters
    ----------
    dtm
        Document-term count matrix, e.g. the one returned by ``create_dtm``.

    Returns
    -------
    tuple
        ``(transformer, tfidf)``: the fitted transformer and the matrix
        returned by its ``fit_transform``.
    """
    transformer = TfidfTransformer()
    weighted = transformer.fit_transform(dtm)
    return transformer, weighted


def generate_sentence_vector(tokens, model, vectorizer, tfidf_dense):
    """Build a TF-IDF-weighted sum of word vectors for one document.

    A token contributes only when it is known to both the embedding model
    and the vectorizer; its vector is scaled by the TF-IDF weight found in
    row 0 of ``tfidf_dense`` at the token's vocabulary column.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of the document.
    model
        Embedding model exposing ``vector_size`` and ``wv`` (the keyed
        vectors). Both the gensim >= 4 (``wv.key_to_index``) and the gensim
        3.x (``wv.vocab``) vocabulary attributes are supported.
    vectorizer
        Fitted vectorizer exposing ``vocabulary_`` (term -> column index).
    tfidf_dense
        2-D indexable TF-IDF weights. Only row 0 is read, so it is expected
        to hold the weights of this document.

    Returns
    -------
    numpy.ndarray
        Vector of length ``model.vector_size``; all zeros when no token is
        known to both the model and the vectorizer.
    """
    keyed_vectors = model.wv
    # gensim 4 removed ``KeyedVectors.vocab`` (accessing it raises
    # AttributeError) in favour of ``key_to_index``; fall back to ``vocab``
    # only when running against gensim 3.x.
    known_words = getattr(keyed_vectors, "key_to_index", None)
    if known_words is None:
        known_words = keyed_vectors.vocab
    term_index = vectorizer.vocabulary_

    vector = np.zeros(model.vector_size)
    for token in tokens:
        if token in known_words and token in term_index:
            vector = vector + keyed_vectors[token] * tfidf_dense[0, term_index[token]]
    return vector


def list_sample(list, n=5):
    """Print the first ``n`` items of an iterable, one per line.

    Each line is ``<position> <item>``. Items are taken by iteration order
    rather than by ``list[idx]``, so a pandas Series whose index is not
    0..len-1 (e.g. after filtering) is sampled by position instead of
    raising ``KeyError`` or showing the wrong rows. Iterators and generators
    work as well.

    Parameters
    ----------
    list : iterable
        Items to sample. The name shadows the builtin but is kept because
        callers pass it by keyword (``list_sample(list=...)``).
    n : int, default 5
        Maximum number of items to print; values <= 0 print nothing.

    Returns
    -------
    None
    """
    for idx, item in enumerate(list):
        if idx >= n:
            break
        print(idx, item)

In [23]:
# When True, each stage below prints a small sample of its result.
verbose = True
# Load the tweet dataset and take the `text_clean` column as the corpus.
data = pd.read_csv('TwitterData_latest.csv')
all_docs = data['text_clean']

if verbose:
    list_sample(list=all_docs)



0  Ah alright Good point! But neither does any other company. He gives them stock tho and 401k into his multibillion dollar companies. If you look at the stock market. 1 Tesla stock went up $700 this year.
1    Breaking news! It is a cool annual giveaway ! 1oooooooo DOGE 5ooo BTC will be distributed among everyone who takes part in this event. #doge #dogecoin #Ethereum #Bitcoin #ETH #BTC Join here➜.tesla-giveawayx10.c om
2 TITS EVERYONE- ELON IS MAKIMG A TITS JOKE OMG SO FUNNY! $TSLA 
3 RT : I’ve talked to a few instit PMs about $TSLA the past few days. Apparently none of the analysts are assuming global EV adop…
4 RT : Bet Hertz runs a Superbowl ad for $TSLA fleet.


In [24]:
# Remove punctuation from every tweet (hyphens become spaces first).
# NOTE(review): in the sample output, URL and hashtag fragments get fused
# into neighbouring words (e.g. "heretesla") - confirm that is acceptable.
no_punctuation_docs = remove_punctuation(all_docs)

if verbose:
    list_sample(list=no_punctuation_docs)

0  Ah alright Good point But neither does any other company He gives them stock tho and 401k into his multibillion dollar companies If you look at the stock market 1 Tesla stock went up 700 this year
1    Breaking news It is a cool annual giveaway  1oooooooo DOGE 5ooo BTC will be distributed among everyone who takes part in this event doge dogecoin Ethereum Bitcoin ETH BTC Join heretesla giveawayx10c om
2 TITS EVERYONE  ELON IS MAKIMG A TITS JOKE OMG SO FUNNY TSLA 
3 RT  Ive talked to a few instit PMs about TSLA the past few days Apparently none of the analysts are assuming global EV adop
4 RT  Bet Hertz runs a Superbowl ad for TSLA fleet


In [25]:
# ------------------------------------- TOKENIZE -------------------------------------------------------------------

# Split each punctuation-free tweet into a list of word tokens.
tokenized_docs = tokenizer(no_punctuation_docs)

if verbose:
    list_sample(list=tokenized_docs)

0 ['Ah', 'alright', 'Good', 'point', 'But', 'neither', 'does', 'any', 'other', 'company', 'He', 'gives', 'them', 'stock', 'tho', 'and', '401k', 'into', 'his', 'multibillion', 'dollar', 'companies', 'If', 'you', 'look', 'at', 'the', 'stock', 'market', '1', 'Tesla', 'stock', 'went', 'up', '700', 'this', 'year']
1 ['Breaking', 'news', 'It', 'is', 'a', 'cool', 'annual', 'giveaway', '1oooooooo', 'DOGE', '5ooo', 'BTC', 'will', 'be', 'distributed', 'among', 'everyone', 'who', 'takes', 'part', 'in', 'this', 'event', 'doge', 'dogecoin', 'Ethereum', 'Bitcoin', 'ETH', 'BTC', 'Join', 'heretesla', 'giveawayx10c', 'om']
2 ['TITS', 'EVERYONE', 'ELON', 'IS', 'MAKIMG', 'A', 'TITS', 'JOKE', 'OMG', 'SO', 'FUNNY', 'TSLA']
3 ['RT', 'Ive', 'talked', 'to', 'a', 'few', 'instit', 'PMs', 'about', 'TSLA', 'the', 'past', 'few', 'days', 'Apparently', 'none', 'of', 'the', 'analysts', 'are', 'assuming', 'global', 'EV', 'adop']
4 ['RT', 'Bet', 'Hertz', 'runs', 'a', 'Superbowl', 'ad', 'for', 'TSLA', 'fleet']


In [26]:
# ------------------------------------- LEMMATIZE ------------------------------------------------------------------

# Lemmatize the tokens as verbs; non-alphabetic tokens (e.g. '401k', '700') are dropped.
# NOTE(review): capitalised tokens such as 'Breaking' and 'IS' come through
# unchanged in the sample output - confirm whether lower-casing first is wanted.
lemmatized_docs = lemmatizer(tokenized_docs)

if verbose:
    list_sample(list=lemmatized_docs)

0 ['Ah', 'alright', 'Good', 'point', 'But', 'neither', 'do', 'any', 'other', 'company', 'He', 'give', 'them', 'stock', 'tho', 'and', 'into', 'his', 'multibillion', 'dollar', 'company', 'If', 'you', 'look', 'at', 'the', 'stock', 'market', 'Tesla', 'stock', 'go', 'up', 'this', 'year']
1 ['Breaking', 'news', 'It', 'be', 'a', 'cool', 'annual', 'giveaway', 'DOGE', 'BTC', 'will', 'be', 'distribute', 'among', 'everyone', 'who', 'take', 'part', 'in', 'this', 'event', 'doge', 'dogecoin', 'Ethereum', 'Bitcoin', 'ETH', 'BTC', 'Join', 'heretesla', 'om']
2 ['TITS', 'EVERYONE', 'ELON', 'IS', 'MAKIMG', 'A', 'TITS', 'JOKE', 'OMG', 'SO', 'FUNNY', 'TSLA']
3 ['RT', 'Ive', 'talk', 'to', 'a', 'few', 'instit', 'PMs', 'about', 'TSLA', 'the', 'past', 'few', 'days', 'Apparently', 'none', 'of', 'the', 'analysts', 'be', 'assume', 'global', 'EV', 'adop']
4 ['RT', 'Bet', 'Hertz', 'run', 'a', 'Superbowl', 'ad', 'for', 'TSLA', 'fleet']


In [27]:
# Join the lemmatized tokens back into one space-separated string per tweet,
# which is the input format expected by the vectorizer below.
untokenized_docs = untokenizer(lemmatized_docs)

if verbose:
    list_sample(list=untokenized_docs)

0 Ah alright Good point But neither do any other company He give them stock tho and into his multibillion dollar company If you look at the stock market Tesla stock go up this year
1 Breaking news It be a cool annual giveaway DOGE BTC will be distribute among everyone who take part in this event doge dogecoin Ethereum Bitcoin ETH BTC Join heretesla om
2 TITS EVERYONE ELON IS MAKIMG A TITS JOKE OMG SO FUNNY TSLA
3 RT Ive talk to a few instit PMs about TSLA the past few days Apparently none of the analysts be assume global EV adop
4 RT Bet Hertz run a Superbowl ad for TSLA fleet


In [28]:
# ------------------------------------- STOPWORDS ------------------------------------------------------------------

# Fetch NLTK's English stop-word list (fetch_stop_words does not use a custom list).
stop_words = fetch_stop_words()
# Prints the complete list, not just a sample.
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [29]:
# ------------------------------------- VECTORIZE ------------------------------------------------------------------
# Build the unigram document-term matrix, filtering out the stop words fetched above.
vectorizer, dtm = create_dtm(untokenized_docs, stop_words, 1)

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

if verbose:
    print(vectorizer)
    list_sample(list=feature_names)
    print(dtm.shape)

CountVectorizer(min_df=0.0,
                stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
                            'ourselves', 'you', "you're", "you've", "you'll",
                            "you'd", 'your', 'yours', 'yourself', 'yourselves',
                            'he', 'him', 'his', 'himself', 'she', "she's",
                            'her', 'hers', 'herself', 'it', "it's", 'its',
                            'itself', ...])
0 aa
1 aaa
2 aaaaaand
3 aaaaah
4 aaaaand
(171454, 43266)


In [30]:
# ------------------------------------- TFIDF ----------------------------------------------------------------------

# Re-weight the raw term counts with a default TfidfTransformer.
# NOTE(review): the LDA cells shown below are built from `dtm`, not `tfidf` -
# confirm where `tfidf` is consumed.
tf_transformer, tfidf = tfidf_transformer(dtm)

if verbose:
    print(tf_transformer)
    print(tfidf.shape)


TfidfTransformer()
(171454, 43266)


In [31]:
# ------------------------------------- LDA SETUP ------------------------------------------------------------------

# Recover, for each tweet, the vocabulary terms that survived vectorization
# (lower-cased, stop words removed).
# NOTE(review): the sample output shows each term once per tweet (tweet 0
# contains 'stock' three times but lists it once), so term frequencies are
# lost before LDA - confirm this is intended.
lda_docs = vectorizer.inverse_transform(dtm)

if verbose:
    list_sample(list=lda_docs)

0 ['ah' 'alright' 'company' 'dollar' 'give' 'go' 'good' 'look' 'market'
 'multibillion' 'neither' 'point' 'stock' 'tesla' 'tho' 'year']
1 ['among' 'annual' 'bitcoin' 'breaking' 'btc' 'cool' 'distribute' 'doge'
 'dogecoin' 'eth' 'ethereum' 'event' 'everyone' 'giveaway' 'heretesla'
 'join' 'news' 'om' 'part' 'take']
2 ['elon' 'everyone' 'funny' 'joke' 'makimg' 'omg' 'tits' 'tsla']
3 ['adop' 'analysts' 'apparently' 'assume' 'days' 'ev' 'global' 'instit'
 'ive' 'none' 'past' 'pms' 'rt' 'talk' 'tsla']
4 ['ad' 'bet' 'fleet' 'hertz' 'rt' 'run' 'superbowl' 'tsla']


In [32]:
# Map every term to an integer id, then encode each tweet as a bag of words:
# a list of (term_id, count) pairs, as shown in the sample output.
dictionary = gensim.corpora.Dictionary(lda_docs)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in lda_docs]

if verbose:
    list_sample(list=doc_term_matrix)

0 [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1)]
1 [(16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1)]
2 [(28, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1)]
3 [(42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1)]
4 [(42, 1), (55, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1)]


In [33]:
# ------------------------------------- MODEL BUILDING -------------------------------------------------------------

# LDA training is stochastic; fixing the seed makes the topics reproducible
# across "Restart & Run All".
LDA_RANDOM_STATE = 42

# perf_counter is a monotonic clock intended for measuring elapsed time.
start = time.perf_counter()

# LDA model building
lda = gensim.models.ldamodel.LdaModel(
    corpus=doc_term_matrix,
    num_topics=10,
    id2word=dictionary,
    random_state=LDA_RANDOM_STATE,
)
end = time.perf_counter()
# Training time in seconds.
print(end-start)
# Compute Coherence Score using c_v
# TODO: disabled; re-enabling requires `from gensim.models import CoherenceModel`.
#lda_cv = CoherenceModel(model=lda, corpus=doc_term_matrix, texts=lda_docs, dictionary=dictionary,
#                        coherence='c_v', processes=1)
#coherence_list = lda_cv.get_coherence_per_topic()
#topic_coherence = np.asarray(coherence_list)

28.130894660949707


In [34]:
# Show the top 10 words of each topic.
# NOTE(review): the model is built with num_topics=10, so asking for 20
# presumably just returns all of them - confirm.
lda.print_topics(num_topics=20, num_words=10)

[(0,
  '0.078*"rt" + 0.057*"market" + 0.038*"nftgiveaway" + 0.036*"tesla" + 0.032*"billion" + 0.031*"cap" + 0.031*"tsla" + 0.027*"list" + 0.023*"week" + 0.019*"global"'),
 (1,
  '0.115*"tesla" + 0.069*"rt" + 0.022*"accept" + 0.019*"doge" + 0.018*"want" + 0.017*"like" + 0.017*"get" + 0.014*"people" + 0.014*"drive" + 0.013*"love"'),
 (2,
  '0.089*"shib" + 0.059*"eth" + 0.045*"join" + 0.045*"bitcoin" + 0.043*"btc" + 0.041*"twitter" + 0.035*"tesla" + 0.033*"raffle" + 0.032*"hold" + 0.030*"holders"'),
 (3,
  '0.075*"tesla" + 0.026*"buy" + 0.025*"rt" + 0.021*"make" + 0.017*"car" + 0.013*"electric" + 0.013*"take" + 0.013*"best" + 0.013*"one" + 0.012*"get"'),
 (4,
  '0.089*"tsla" + 0.026*"rt" + 0.024*"floki" + 0.024*"low" + 0.016*"stock" + 0.012*"production" + 0.012*"miss" + 0.012*"investment" + 0.011*"ethe" + 0.010*"call"'),
 (5,
  '0.055*"rt" + 0.052*"tesla" + 0.031*"teslas" + 0.023*"step" + 0.019*"new" + 0.018*"years" + 0.017*"tsla" + 0.015*"hit" + 0.015*"gold" + 0.014*"include"'),
 (6,
  '