# GSDMM Code for Text Analysis

In [1]:
pip install git+https://github.com/rwalk/gsdmm.git

Collecting git+https://github.com/rwalk/gsdmm.git
  Cloning https://github.com/rwalk/gsdmm.git to /tmp/pip-req-build-p8smkd8g
  Running command git clone --filter=blob:none --quiet https://github.com/rwalk/gsdmm.git /tmp/pip-req-build-p8smkd8g
  Resolved https://github.com/rwalk/gsdmm.git to commit 4ad1b6b6976743681ee4976b4573463d359214ee
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [4]:
import numpy as np
import pandas as pd
from gsdmm import MovieGroupProcess

from gensim.utils import simple_preprocess
import gensim, spacy,re,os,nltk,random

nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])


data = pd.read_csv(r'/content/drive/MyDrive/Data/Course/Corona.csv',header = 0, encoding = 'cp437')
data = data[::100]

def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))


def make_n_grams(texts):
    bigram = gensim.models.Phrases(texts, min_count=5, threshold=100)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram = gensim.models.Phrases(bigram[texts], threshold=100)
    trigram_mod = gensim.models.phrases.Phraser(trigram)
    bigrams_text = [bigram_mod[doc] for doc in texts]
    trigrams_text =  [trigram_mod[bigram_mod[doc]] for doc in bigrams_text]
    return trigrams_text




def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in gensim.parsing.preprocessing.STOPWORDS] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out
def top_words(mgp,cluster_word_distribution, top_cluster, values):
  Text = ''
  TheseResults = []
  for cluster in top_cluster:
    sort_dicts =sorted(mgp.cluster_word_distribution[cluster].items(), key=lambda k: k[1], reverse=True)[:values]
    Text+= "\nCluster %s : %s"%(cluster,sort_dicts)
    TheseResults.append([cluster,sort_dicts])
  return Text,TheseResults

tokens_reviews = list(sent_to_words(data['OriginalTweet']))
tokens_reviews = make_n_grams(tokens_reviews)
reviews_lemmatized = lemmatization(tokens_reviews, allowed_postags=['NOUN', 'VERB', 'ADV'])
reviews_lemmatized = remove_stopwords(reviews_lemmatized)
print(reviews_lemmatized)
# np.random.seed(0)
Results =[]
X = 0
K = 5
for Alpha in list(np.linspace(0.05,1,5)) :  # using different values for alpha
  for Beta in list(np.linspace(0.05,1,5)) : # using different values for beta
    X+=1
    print(f'Phase Number {X}')
    mgp = MovieGroupProcess(K=K, alpha=Alpha, beta=Beta, n_iters=5)
    vocab = set(x for review in reviews_lemmatized for x in review)
    n_terms = len(vocab)
    model = mgp.fit(reviews_lemmatized, n_terms)
    doc_count = np.array(mgp.cluster_doc_count)
    top_index = doc_count.argsort()[-10:][::-1]
    Loss =0
    for i in range(K) :
      Loss += abs((1/K)-(doc_count[i]/sum(doc_count)))
    Results.append({'Parameters' : [Alpha,Beta],
                    'Loss':Loss,
                    'Doc Number':doc_count,
                    'Top Index':doc_count.argsort()[-10:][::-1],
                    'Top Words' : top_words(mgp,mgp.cluster_word_distribution, top_index, 10)[0],
                    'All Words' : top_words(mgp,mgp.cluster_word_distribution, top_index, 100)[1]})


ResultsDF = pd.DataFrame(columns = ['Parameters', 'Loss', 'Doc Number', 'Top Index', 'Top Words', 'All Words'])
for n,i in enumerate(Results) :
  ResultsDF.loc[n] = list(i.values())
ResultsDF
TotalCluster = [f'Cluster {i}' for i in range(K)]
for j in TotalCluster :
  ResultsDF[j] =ResultsDF['Top Words'].apply(lambda x : [i for i in str(x.strip('\n')).split('\n') if j in i][0].strip(f'{j} : ').strip('[]'))

for j in range(K) :
  ResultsDF[f'All Cluster {j}'] = ResultsDF['All Words'].apply(lambda x : [i[1] for i in x if i[0]==j])
# if not os.path.exists(os.path.join(MainPath,SelectedData,'01 GSDMMFiles')) :  os.makedirs(os.path.join(MainPath,SelectedData,'01 GSDMMFiles'))
ResultsDF.to_csv('1.csv')  # saving in 01 folder
Results

[[], ['hate', 'grocery', 'shopping', 'swear', 'online', 'shop', 'deal', 'swathe', 'panic', 'buyer', 'panicbuyinguk', 'moron'], ['drsanjaygupta', 'johnberman', 'newday', 'business', 'trump', 'tout', 'supply', 'online', 'order', 'supply', 'avail', 'need', 'help', 'arena', 'supply'], ['prisoner', 'release', 'view', 'coronavirus', 'prisoner', 'create', 'burn', 'prison', 'cell', 'news', 'suspect', 'covid', 'prison', 'prisoner', 'face', 'mask', 'amp', 'sell', 'extremely', 'price', 'bharat'], ['provider', 'feel', 'consumer', 'feel', 'seek', 'advice', 'attention', 'consumer', 'ihss', 'worker', 'work'], ['ære', 'community', 'ære', 'stay', 'home', 'stay'], ['thank', 'grocery_store', 'employee', 'farmer', 'trucker', 'healthcare', 'worker', 'employee', 'work', 'individual', 'offer', 'life', 'help', 'fight'], ['dtc', 'ecommerce', 'people', 'stay', 'home', 'fear', 'coronavirus', 'spread', 'amp', 'demand', 'pickup', 'delivery', 'accelerate', 'grocery', 'hit', 'sale', 'forecast', 'hit'], ['agree', 'av

[{'Parameters': [0.05, 0.05],
  'Loss': 0.11262135922330099,
  'Doc Number': array([81, 93, 66, 77, 95]),
  'Top Index': array([4, 1, 0, 3, 2]),
  'Top Words': "\nCluster 4 : [('consumer', 31), ('coronavirus', 30), ('food', 28), ('price', 25), ('business', 12), ('report', 12), ('store', 12), ('worker', 12), ('amp', 12), ('supply', 11)]\nCluster 1 : [('supermarket', 27), ('worker', 27), ('grocery_store', 17), ('food', 14), ('work', 14), ('staff', 14), ('coronavirus', 13), ('covid', 13), ('employee', 10), ('people', 9)]\nCluster 0 : [('price', 21), ('coronavirus', 21), ('covid', 13), ('people', 12), ('toiletpaper', 11), ('panic', 10), ('use', 9), ('oil', 8), ('shop', 7), ('supermarket', 7)]\nCluster 3 : [('coronavirus', 23), ('consumer', 22), ('price', 20), ('oil', 14), ('demand', 12), ('change', 12), ('food', 11), ('need', 11), ('supermarket', 10), ('covid', 9)]\nCluster 2 : [('price', 14), ('food', 14), ('coronavirus', 13), ('people', 13), ('supermarket', 13), ('stock', 8), ('covid', 8

In [5]:
ResultsDF

Unnamed: 0,Parameters,Loss,Doc Number,Top Index,Top Words,All Words,Cluster 0,Cluster 1,Cluster 2,Cluster 3,Cluster 4,All Cluster 0,All Cluster 1,All Cluster 2,All Cluster 3,All Cluster 4
0,"[0.05, 0.05]",0.112621,"[81, 93, 66, 77, 95]","[4, 1, 0, 3, 2]","\nCluster 4 : [('consumer', 31), ('coronavirus...","[[4, [('consumer', 31), ('coronavirus', 30), (...","('price', 21), ('coronavirus', 21), ('covid', ...","('supermarket', 27), ('worker', 27), ('grocery...","('price', 14), ('food', 14), ('coronavirus', 1...","('coronavirus', 23), ('consumer', 22), ('price...","('consumer', 31), ('coronavirus', 30), ('food'...","[[(price, 21), (coronavirus, 21), (covid, 13),...","[[(supermarket, 27), (worker, 27), (grocery_st...","[[(price, 14), (food, 14), (coronavirus, 13), ...","[[(coronavirus, 23), (consumer, 22), (price, 2...","[[(consumer, 31), (coronavirus, 30), (food, 28..."
1,"[0.05, 0.2875]",0.151456,"[65, 113, 83, 75, 76]","[1, 2, 4, 3, 0]","\nCluster 1 : [('supermarket', 30), ('coronavi...","[[1, [('supermarket', 30), ('coronavirus', 27)...","('price', 32), ('coronavirus', 25), ('oil', 22...","('supermarket', 30), ('coronavirus', 27), ('wo...","('food', 31), ('coronavirus', 30), ('price', 2...","('consumer', 32), ('supermarket', 18), ('busin...","('price', 20), ('consumer', 14), ('covid', 13)...","[[(price, 32), (coronavirus, 25), (oil, 22), (...","[[(supermarket, 30), (coronavirus, 27), (worke...","[[(food, 31), (coronavirus, 30), (price, 27), ...","[[(consumer, 32), (supermarket, 18), (business...","[[(price, 20), (consumer, 14), (covid, 13), (w..."
2,"[0.05, 0.525]",0.491262,"[40, 35, 141, 125, 71]","[2, 3, 4, 0, 1]","\nCluster 2 : [('supermarket', 46), ('worker',...","[[2, [('supermarket', 46), ('worker', 34), ('c...","('consumer', 23), ('business', 10), ('coronavi...","('hand', 9), ('avoid', 7), ('people', 6), ('co...","('supermarket', 46), ('worker', 34), ('coronav...","('price', 48), ('food', 36), ('consumer', 35),...","('price', 29), ('coronavirus', 23), ('food', 1...","[[(consumer, 23), (business, 10), (coronavirus...","[[(hand, 9), (avoid, 7), (people, 6), (coronav...","[[(supermarket, 46), (worker, 34), (coronaviru...","[[(price, 48), (food, 36), (consumer, 35), (co...","[[(price, 29), (coronavirus, 23), (food, 15), ..."
3,"[0.05, 0.7625]",0.884466,"[195, 33, 152, 28, 4]","[0, 2, 1, 3, 4]","\nCluster 0 : [('supermarket', 51), ('coronavi...","[[0, [('supermarket', 51), ('coronavirus', 45)...","('supermarket', 51), ('coronavirus', 45), ('wo...","('consumer', 11), ('coronavirus', 6), ('travel...","('price', 63), ('consumer', 52), ('coronavirus...","('toiletpaper', 8), ('coronavirus', 7), ('avoi...","('consider', 1), ('coronaviru', 1), ('doctor',...","[[(supermarket, 51), (coronavirus, 45), (worke...","[[(consumer, 11), (coronavirus, 6), (travel, 6...","[[(price, 63), (consumer, 52), (coronavirus, 4...","[[(toiletpaper, 8), (coronavirus, 7), (avoid, ...","[[(consider, 1), (coronaviru, 1), (doctor, 1),..."
4,"[0.05, 1.0]",1.166019,"[212, 193, 0, 0, 7]","[0, 1, 4, 3, 2]","\nCluster 0 : [('price', 79), ('consumer', 64)...","[[0, [('price', 79), ('consumer', 64), ('coron...","('price', 79), ('consumer', 64), ('coronavirus...","('supermarket', 54), ('coronavirus', 38), ('wo...",,,"('turn', 2), ('sanitizer', 2), ('terf', 2), ('...","[[(price, 79), (consumer, 64), (coronavirus, 6...","[[(supermarket, 54), (coronavirus, 38), (worke...",[[]],[[]],"[[(turn, 2), (sanitizer, 2), (terf, 2), (carro..."
5,"[0.2875, 0.05]",0.134951,"[83, 63, 101, 74, 91]","[2, 4, 0, 3, 1]","\nCluster 2 : [('coronavirus', 25), ('worker',...","[[2, [('coronavirus', 25), ('worker', 24), ('c...","('price', 29), ('consumer', 26), ('coronavirus...","('coronavirus', 16), ('people', 11), ('toiletp...","('coronavirus', 25), ('worker', 24), ('consume...","('supermarket', 26), ('covid', 19), ('grocery_...","('coronavirus', 27), ('price', 27), ('food', 2...","[[(price, 29), (consumer, 26), (coronavirus, 2...","[[(coronavirus, 16), (people, 11), (toiletpape...","[[(coronavirus, 25), (worker, 24), (consumer, ...","[[(supermarket, 26), (covid, 19), (grocery_sto...","[[(coronavirus, 27), (price, 27), (food, 21), ..."
6,"[0.2875, 0.2875]",0.345631,"[72, 60, 143, 93, 44]","[2, 3, 0, 1, 4]","\nCluster 2 : [('supermarket', 43), ('coronavi...","[[2, [('supermarket', 43), ('coronavirus', 30)...","('coronavirus', 28), ('toiletpaper', 20), ('pe...","('price', 46), ('coronavirus', 22), ('oil', 22...","('supermarket', 43), ('coronavirus', 30), ('fo...","('food', 29), ('price', 23), ('consumer', 19),...","('consumer', 22), ('coronavirus', 10), ('check...","[[(coronavirus, 28), (toiletpaper, 20), (peopl...","[[(price, 46), (coronavirus, 22), (oil, 22), (...","[[(supermarket, 43), (coronavirus, 30), (food,...","[[(food, 29), (price, 23), (consumer, 19), (de...","[[(consumer, 22), (coronavirus, 10), (check, 7..."
7,"[0.2875, 0.525]",0.578641,"[179, 38, 77, 105, 13]","[0, 3, 2, 1, 4]","\nCluster 0 : [('food', 56), ('coronavirus', 3...","[[0, [('food', 56), ('coronavirus', 38), ('sup...","('food', 56), ('coronavirus', 38), ('supermark...","('worker', 11), ('supermarket', 10), ('shop', ...","('coronavirus', 19), ('toiletpaper', 15), ('su...","('consumer', 46), ('price', 41), ('coronavirus...","('price', 6), ('coronavirus', 6), ('toiletpape...","[[(food, 56), (coronavirus, 38), (supermarket,...","[[(worker, 11), (supermarket, 10), (shop, 8), ...","[[(coronavirus, 19), (toiletpaper, 15), (super...","[[(consumer, 46), (price, 41), (coronavirus, 3...","[[(price, 6), (coronavirus, 6), (toiletpaper, ..."
8,"[0.2875, 0.7625]",0.738835,"[73, 152, 18, 4, 165]","[4, 1, 0, 2, 3]","\nCluster 4 : [('coronavirus', 39), ('people',...","[[4, [('coronavirus', 39), ('people', 39), ('s...","('worker', 30), ('supermarket', 22), ('employe...","('price', 71), ('coronavirus', 45), ('consumer...","('toiletpaper', 10), ('coronavirus', 6), ('pap...","('crisis', 2), ('carrot', 2), ('deal', 2), ('l...","('coronavirus', 39), ('people', 39), ('superma...","[[(worker, 30), (supermarket, 22), (employee, ...","[[(price, 71), (coronavirus, 45), (consumer, 4...","[[(toiletpaper, 10), (coronavirus, 6), (paper,...","[[(crisis, 2), (carrot, 2), (deal, 2), (let, 1...","[[(coronavirus, 39), (people, 39), (supermarke..."
9,"[0.2875, 1.0]",1.068932,"[276, 16, 109, 6, 5]","[0, 2, 1, 3, 4]","\nCluster 0 : [('coronavirus', 65), ('food', 5...","[[0, [('coronavirus', 65), ('food', 59), ('sup...","('coronavirus', 65), ('food', 59), ('supermark...","('toiletpaper', 9), ('coronavirus', 4), ('supe...","('consumer', 46), ('price', 44), ('coronavirus...","('relief', 3), ('request', 2), ('check', 2), (...","('line', 2), ('protocol', 2), ('cpsuk', 2), ('...","[[(coronavirus, 65), (food, 59), (supermarket,...","[[(toiletpaper, 9), (coronavirus, 4), (superma...","[[(consumer, 46), (price, 44), (coronavirus, 3...","[[(relief, 3), (request, 2), (check, 2), (hand...","[[(line, 2), (protocol, 2), (cpsuk, 2), (kit, ..."
