In [113]:
import re, os
import numpy as np
import pandas as pd
from pprint import pprint
from tqdm import tqdm
import pickle

from wordcloud import WordCloud

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models.ldamodel import LdaModel

# spacy for lemmatization
import spacy

# Plotting tools
from matplotlib.colors import LinearSegmentedColormap, Normalize
from PIL import Image
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [15]:
home = os.getenv("HOME")
os.environ.update({'MALLET_HOME':r'/home/users/iasamori/tiktok/mallet-2.0.8/'})
mallet_path = f"{home}/tiktok/mallet-2.0.8/bin/mallet"

In [110]:
scratch = os.getenv('SCRATCH')
embedding_folder = f"{scratch}/samori/tiktok/embeddings"

In [3]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [4]:
data = pd.read_csv("../combined_comments_3.csv", encoding='utf-8')
data = data.dropna(subset=['text'])
data = data.rename(columns={'text': 'message', 'id':'message_id'})
data = data.reset_index()

threshold = 3
data['message_split'] = [i.split(' ') for i in data['message']]
lengths = [len(i) for i in data['message_split']]

drop_ids = [i for i in range(len(lengths)) if lengths[i] < threshold]
data = data.drop(drop_ids)
data = data.drop(['message_split', 'index'], axis=1)
data = data.reset_index().drop(['index'], axis=1)
data

Unnamed: 0,create_time,message_id,like_count,parent_comment_id,reply_count,message,video_id
0,1.698725e+09,7.295967e+18,0.0,7.260153e+18,0.0,many of them have no idea they are using fenta...,7.260153e+18
1,1.698725e+09,7.295968e+18,0.0,7.260153e+18,0.0,your children are spiritual warefare!,7.260153e+18
2,1.698725e+09,7.295967e+18,0.0,7.260153e+18,0.0,i pray they never havevto understand our pain,7.260153e+18
3,1.699370e+09,7.298738e+18,0.0,7.298692e+18,0.0,how ya doing,7.298692e+18
4,1.673488e+09,7.187574e+18,0.0,7.187541e+18,0.0,Thank you so much,7.187435e+18
...,...,...,...,...,...,...,...
69062,1.690226e+09,7.259465e+18,1.0,7.255254e+18,0.0,of course massive corporations are exploiting ...,7.255007e+18
69063,1.690226e+09,7.259465e+18,1.0,7.255254e+18,0.0,there is no justification for it. someone who ...,7.255007e+18
69064,1.690227e+09,7.259469e+18,1.0,7.255254e+18,0.0,I’m honestly not one to hate on anyone. I can ...,7.255007e+18
69065,1.690226e+09,7.259465e+18,1.0,7.255254e+18,0.0,in what world do you actually think it’s cool ...,7.255007e+18


In [5]:
def process(data_text):
    data_text = [re.sub('\S*@\S*\s?', '', sent) for sent in data_text] # remove emails
    data_text = [re.sub('\+?\d[\d .-]{8,}\d', '', sent) for sent in data_text] #remove phone numbers
    data_text = [re.sub('\s+', ' ', sent) for sent in data_text] # remove newlines
    data_text = [re.sub("\'", "", sent) for sent in data_text] # reomve single quotes

    tokenize_data_text = [gensim.utils.simple_preprocess(str(sent), deacc=True) for sent in data_text]

    return tokenize_data_text

In [6]:
data_text_tokenized = process(data.message)

print(len(data_text_tokenized))
print(data_text_tokenized[:5])

69067
[['many', 'of', 'them', 'have', 'no', 'idea', 'they', 'are', 'using', 'fentanyl'], ['your', 'children', 'are', 'spiritual', 'warefare'], ['pray', 'they', 'never', 'havevto', 'understand', 'our', 'pain'], ['how', 'ya', 'doing'], ['thank', 'you', 'so', 'much']]


In [7]:
bigram = gensim.models.Phrases(data_text_tokenized, min_count=5, threshold=100)
trigram = gensim.models.Phrases(bigram[data_text_tokenized], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_text_tokenized[5]]])

['happy_birthday', 'jackson', 'hope', 'you', 're', 'spending', 'it', 'visiting', 'your', 'momma', 'that', 'misses', 'you', 'more', 'and', 'more', 'every', 'day']


In [8]:
def get_ngram_lemmatize(data_text, make_bigram=True, make_trigram=False, 
                        allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    
    data_text = [[word for word in text if word not in stop_words] for text in data_text]
    
    if make_bigram:
        data_text = [bigram_mod[doc] for doc in tqdm(data_text, desc="Making Bigrams ...")]

    if make_bigram and make_trigram:
        data_text = [trigram_mod[bigram_mod[doc]] for doc in tqdm(data_text, desc="Making Trigrams ...")]

    data_text_lemm = []
    
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    for sent in tqdm(data_text, desc="Lemmatizing ..."):
        doc = nlp(" ".join(sent)) 
        data_text_lemm.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
        
    return data_text_lemm

In [9]:
data_text_lemmatized = get_ngram_lemmatize(data_text_tokenized)

Making Bigrams ...: 100%|██████████| 69067/69067 [00:00<00:00, 75179.92it/s]
Lemmatizing ...: 100%|██████████| 69067/69067 [01:49<00:00, 628.03it/s]


In [10]:
print(data_text_tokenized[:6], '\n')
print(data_text_lemmatized[:6], '\n')
print(len(data_text_lemmatized))

[['many', 'of', 'them', 'have', 'no', 'idea', 'they', 'are', 'using', 'fentanyl'], ['your', 'children', 'are', 'spiritual', 'warefare'], ['pray', 'they', 'never', 'havevto', 'understand', 'our', 'pain'], ['how', 'ya', 'doing'], ['thank', 'you', 'so', 'much'], ['happy', 'birthday', 'jackson', 'hope', 'you', 're', 'spending', 'it', 'visiting', 'your', 'momma', 'that', 'misses', 'you', 'more', 'and', 'more', 'every', 'day']] 

[['many', 'idea', 'use', 'fentanyl'], ['child', 'spiritual', 'warefare'], ['pray', 'never', 'havevto', 'understand', 'pain'], [], ['thank', 'much'], ['spending', 'visit', 'momma', 'miss', 'day']] 

69067


In [11]:
ids_to_drop = [i for i in range(len(data_text_lemmatized)) if len(data_text_lemmatized[i]) == 0]
data_text_lemmatized = [i for i in data_text_lemmatized if len(i) != 0]

print(data_text_lemmatized[:6],"\n")
print(len(data_text_lemmatized))

[['many', 'idea', 'use', 'fentanyl'], ['child', 'spiritual', 'warefare'], ['pray', 'never', 'havevto', 'understand', 'pain'], ['thank', 'much'], ['spending', 'visit', 'momma', 'miss', 'day'], ['absolutely', 'agree']] 

66983


In [12]:
data_dropped = data.drop(ids_to_drop).reset_index()
data_dropped = data_dropped.drop(['index'], axis=1)
data_dropped

Unnamed: 0,create_time,message_id,like_count,parent_comment_id,reply_count,message,video_id
0,1.698725e+09,7.295967e+18,0.0,7.260153e+18,0.0,many of them have no idea they are using fenta...,7.260153e+18
1,1.698725e+09,7.295968e+18,0.0,7.260153e+18,0.0,your children are spiritual warefare!,7.260153e+18
2,1.698725e+09,7.295967e+18,0.0,7.260153e+18,0.0,i pray they never havevto understand our pain,7.260153e+18
3,1.673488e+09,7.187574e+18,0.0,7.187541e+18,0.0,Thank you so much,7.187435e+18
4,1.673466e+09,7.187482e+18,0.0,7.187435e+18,1.0,"Happy Birthday Jackson, I hope you’re spending...",7.187435e+18
...,...,...,...,...,...,...,...
66978,1.690226e+09,7.259465e+18,1.0,7.255254e+18,0.0,of course massive corporations are exploiting ...,7.255007e+18
66979,1.690226e+09,7.259465e+18,1.0,7.255254e+18,0.0,there is no justification for it. someone who ...,7.255007e+18
66980,1.690227e+09,7.259469e+18,1.0,7.255254e+18,0.0,I’m honestly not one to hate on anyone. I can ...,7.255007e+18
66981,1.690226e+09,7.259465e+18,1.0,7.255254e+18,0.0,in what world do you actually think it’s cool ...,7.255007e+18


In [13]:
# Create Dictionary
id2word = corpora.Dictionary(data_text_lemmatized)

# Create Corpus
texts = data_text_lemmatized
corpus = [id2word.doc2bow(text) for text in texts]


print(corpus[:5])

[[(0, 1), (1, 1), (2, 1), (3, 1)], [(4, 1), (5, 1), (6, 1)], [(7, 1), (8, 1), (9, 1), (10, 1), (11, 1)], [(12, 1), (13, 1)], [(14, 1), (15, 1), (16, 1), (17, 1), (18, 1)]]


In [14]:
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:5]]

[[('fentanyl', 1), ('idea', 1), ('many', 1), ('use', 1)],
 [('child', 1), ('spiritual', 1), ('warefare', 1)],
 [('havevto', 1), ('never', 1), ('pain', 1), ('pray', 1), ('understand', 1)],
 [('much', 1), ('thank', 1)],
 [('day', 1), ('miss', 1), ('momma', 1), ('spending', 1), ('visit', 1)]]

In [16]:
def convertldaMalletToldaGen(mallet_model):
    model_gensim = LdaModel(
        id2word=mallet_model.id2word, num_topics=mallet_model.num_topics,
        alpha=mallet_model.alpha) 
    model_gensim.state.sstats[...] = mallet_model.wordtopics
    model_gensim.sync_state()
    return model_gensim

In [17]:
optimal_model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=65, id2word=id2word, random_seed=43)
optimal_gensim = convertldaMalletToldaGen(optimal_model)

Mallet LDA: 65 topics, 7 topic bits, 1111111 topic mask
Data loaded.
max tokens: 88
total tokens: 276833
<10> LL/token: -11.1099
<20> LL/token: -10.69366
<30> LL/token: -10.41789
<40> LL/token: -10.1984

0	0.76923	pain day give good medication management tylenol school high life lose prescribe make hear ignore heart prayer candy clinic natural 
1	0.76923	love bad send prescription alcohol start stuff type make kill suboxone weed day understand blood leave covid great die difference 
2	0.76923	great job loss mom work amazing fast rip story sweetheart god_bless judge angel friend son complete addict hug mama buddy 
3	0.76923	people deal happen stupid energy sound change anymore phone happy live call awareness facility jail literally short bill treatment substance 
4	0.76923	addict problem drug clean street eat food post control mom fact glad create kill place feel crisis choice feed lead 
5	0.76923	addict teach save happy people life point young learn thing song remember continue congrat

In [36]:
topics = optimal_model.show_topics(num_topics=66, num_words=16861, formatted=False)
len(topics[0][1])

16861

In [114]:
with open(f"{embedding_folder}/gensim_vocab_embedding_small.pkl", "rb") as f:
    emb_dict_small = pickle.load(f)

In [116]:
list(emb_dict_small.keys())[:5]

['fentanyl', 'idea', 'many', 'use', 'child']

In [118]:
len(emb_dict_small["fentanyl"])

1536

In [97]:
filtered_topics = {}
for i in range(len(topics)):

    topic_list = topics[i][1]
    word_dict = {i[0]: i[1] for i in topic_list}
    
    threshold = 0.80
    
    words = list(word_dict.keys())
    values = list(word_dict.values())

    cumsum = np.cumsum(values)
    n = np.argmin(cumsum <= threshold)

    
    filtered_dict = dict(zip(words[:n], values[:n]))
    filtered_topics[i] = filtered_dict

In [123]:
for i in range(len(topics)):
    topic_dict_i = filtered_topics[i]

    terms = list(topic_dict_i.keys())
    values = list(topic_dict_i.values())
    values = np.array(values) / np.sum(values)

    emb = np.zeros(1536)
    # print(len(emb))
    # break
    for i in range(len(terms)):
        
        

    
    # print(values, np.sum(values))

1536


In [104]:
a = np.array(list(filtered_topics[0].values())) / np.sum(list(filtered_topics[0].values()))
a

array([0.48407452, 0.16346154, 0.0718149 , 0.03245192, 0.02824519,
       0.02524038, 0.02463942, 0.01953125, 0.01682692, 0.01592548,
       0.0141226 , 0.01352163, 0.01201923, 0.01201923, 0.0093149 ,
       0.00871394, 0.00841346, 0.00691106, 0.00691106, 0.00691106,
       0.00661058, 0.0063101 , 0.00600962])

In [105]:
np.sum(a)

1.0

In [70]:
a = np.cumsum(values)
idx = a <= 0.80

for j in range(len(idx)):
    print(j, idx[j], a[j])

0 True 0.3864236027824418
1 True 0.5169105301031423
2 True 0.5742384264811705
3 True 0.6001439194051331
4 True 0.6226912928759895
5 True 0.6428400095946271
6 True 0.6625089949628209
7 True 0.6781002638522429
8 True 0.6915327416646679
9 True 0.7042456224514273
10 True 0.7155193091868555
11 True 0.7263132645718399
12 True 0.7359078915807149
13 True 0.7455025185895899
14 True 0.7529383545214681
15 True 0.7598944591029024
16 True 0.766610698009115
17 True 0.7721276085392181
18 True 0.7776445190693213
19 True 0.7831614295994245
20 True 0.7884384744543057
21 True 0.7934756536339651
22 True 0.7982729671384026
23 False 0.8021108179419526
24 False 0.8059486687455026
25 False 0.8097865195490527
26 False 0.8133845046773808
27 False 0.816742624130487
28 False 0.8201007435835933
29 False 0.8234588630366996
30 False 0.8268169824898058
31 False 0.8294555049172465
32 False 0.8320940273446872
33 False 0.8347325497721279
34 False 0.8371312065243466
35 False 0.8392899976013435
36 False 0.8414487886783404

In [53]:
values[106], values[88]

(0.0007279786459597186, 0.0007279786459597186)

In [32]:
np.sum(list(word_dict.values())[:15])

0.6832116788321166

In [40]:
corpus

[[(0, 1), (1, 1), (2, 1), (3, 1)],
 [(4, 1), (5, 1), (6, 1)],
 [(7, 1), (8, 1), (9, 1), (10, 1), (11, 1)],
 [(12, 1), (13, 1)],
 [(14, 1), (15, 1), (16, 1), (17, 1), (18, 1)],
 [(19, 1), (20, 1)],
 [(21, 1)],
 [(21, 1), (22, 1)],
 [(23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 1)],
 [(38, 1),
  (39, 1),
  (40, 1),
  (41, 2),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 1)],
 [(48, 1), (49, 1), (50, 1), (51, 1)],
 [(21, 1),
  (52, 1),
  (53, 1),
  (54, 1),
  (55, 1),
  (56, 1),
  (57, 1),
  (58, 1),
  (59, 1)],
 [(35, 1), (50, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1)],
 [(65, 1), (66, 1), (67, 1), (68, 1)],
 [(69, 1), (70, 1), (71, 1)],
 [(72, 1), (73, 1), (74, 1), (75, 1), (76, 1), (77, 1), (78, 1)],
 [(79, 1),
  (80, 1),
  (81, 1),
  (82, 1),
  (83, 1),
  (84, 1),
  (85, 1),
  (86, 1),
  (87, 1),
  (88, 1)],
 [(9, 1),
  (78, 1),
  (89, 1),
  (90, 1),

In [None]:
from collections import Counter
from itertools import chain

def corpus_probabilities(corpus):
    word_counts = Counter(chain.from_iterable(corpus))
    total_words = sum(word_counts.values())
    return {word: count / total_words for word, count in word_counts.items()}

probs = corpus_probabilities(your_corpus)
print(f"Probability of 'example': {probs['example']}")

In [44]:
from collections import Counter
from itertools import chain

word_counts = Counter(chain.from_iterable(corpus))
len(word_counts)

18699

In [39]:
for i in range(3):
    topic_list = topics[i][1]
    word_dict = {i[0]: i[1] for i in topic_list}
    total = np.sum(list(word_dict.values())[:20])
    print(word_dict, '\n\n\n')










