## Building Topic Models with the Gensim Library

In [47]:
import pandas as pd
from tqdm.notebook import tqdm

import gensim

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis


  and should_run_async(code)


In [48]:
train_tweets=pd.read_csv('../data/Dataset 1/clean_train_tweets.csv', encoding="utf-8")
train_tweets

  and should_run_async(code)


Unnamed: 0,id,label,tweet,length,count
0,1,0,father dysfunctional selfish drags kids dysfun...,55,7
1,2,0,thanks lyft credit use cause offer wheelchair ...,77,11
2,3,0,bihday majesty,14,2
3,4,0,model love u take u time ur,27,7
4,5,0,factsguide society motivation,29,3
...,...,...,...,...,...
31925,31958,0,ate isz youuu,13,3
31926,31959,0,see nina turner airwaves trying wrap mantle ge...,93,14
31927,31960,0,listening sad songs monday morning otw work sad,47,8
31928,31961,1,sikh temple vandalised calgary wso condemns act,47,7


In [78]:
offensive_tweets=train_tweets[train_tweets['label']==1]
offensive_tweets

  and should_run_async(code)


Unnamed: 0,id,label,tweet,length,count
13,14,1,cnn calls michigan middle school build wall ch...,54,9
14,15,1,comment australia opkillingbay seashepherd hel...,84,7
17,18,1,retweet agree,13,2
23,24,1,lumpy says prove lumpy,22,4
34,35,1,unbelievable 21st century need something like ...,65,8
...,...,...,...,...,...
31902,31935,1,lady banned kentucky mall jcpenny kentucky,42,6
31914,31947,1,omfg offended mailbox proud mailboxpride liber...,52,6
31915,31948,1,balls hashtag say weasel away lumpy tony dipshit,48,8
31916,31949,1,makes ask anybody god oh thank god,34,7


In [79]:
from nltk.tokenize import RegexpTokenizer

  and should_run_async(code)


In [80]:
docs = offensive_tweets['tweet'].tolist()

  and should_run_async(code)


In [81]:
# Split the documents into tokens.
tokenizer = RegexpTokenizer(r'\w+')
for idx in range(len(docs)):
    docs[idx] = docs[idx].lower()  # Convert to lowercase.
    docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.

# Remove numbers, but not words that contain numbers.
docs = [[token for token in doc if not token.isnumeric()] for doc in docs]

# Remove words that are only one character.
docs = [[token for token in doc if len(token) > 1] for doc in docs]

  and should_run_async(code)


In [82]:
# Lemmatize the documents.
from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]

  and should_run_async(code)


In [83]:
docs[5]

  and should_run_async(code)


['let', 'fight', 'love', 'peace']

In [84]:
# Compute bigrams.
from gensim.models import Phrases

# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(docs, min_count=20)
for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            docs[idx].append(token)

  and should_run_async(code)


In [85]:
bigram[docs[10]]

  and should_run_async(code)


['mocked', 'obama', 'black', 'brexit']

In [86]:
# Remove rare and common tokens.
from gensim.corpora import Dictionary

# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)

# Filter out words that occur less than 20 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_below=20, no_above=0.5)

  and should_run_async(code)


In [87]:
dictionary[61]

  and should_run_async(code)


'police'

In [88]:
# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]

  and should_run_async(code)


In [89]:
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 141
Number of documents: 2238


  and should_run_async(code)


In [90]:
corpus[0]

  and should_run_async(code)


[(0, 1), (1, 1)]

In [91]:
from gensim.models import LsiModel

  and should_run_async(code)


In [92]:
lsi = LsiModel(corpus, num_topics=2, id2word = dictionary.id2token)

  and should_run_async(code)


In [93]:
topicid = 7

lsi.show_topic(topicid)

  and should_run_async(code)


''

## Nonnegative Matrix Factorization

In [94]:
from gensim.models import Nmf

  and should_run_async(code)


In [95]:
nmf = Nmf(corpus, num_topics=2, random_state = 321, id2word = dictionary.id2token)

  and should_run_async(code)


In [96]:
topicid = 0

nmf.show_topic(topicid)

  and should_run_async(code)


[('like', 0.06069051378760061),
 ('retweet', 0.04655538754823224),
 ('black', 0.04253660921633582),
 ('feel', 0.042497479365314644),
 ('listen', 0.04021426287157503),
 ('stomping', 0.03723573884983201),
 ('miami', 0.03492031892793887),
 ('listen_retweet', 0.027550684854103405),
 ('people', 0.026370026103240653),
 ('like_stomping', 0.025774207462392965)]

In [97]:
paperid = 3
nmf.get_document_topics(corpus[paperid])

  and should_run_async(code)


[(1, 1.0)]

In [99]:
offensive_tweets.loc[34, 'tweet']

  and should_run_async(code)


'unbelievable 21st century need something like neverump xenophobia'

## Latent Dirichlet Allocation

In [100]:
# Train LDA model.
from gensim.models import LdaModel

# Set training parameters.
num_topics = 2
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make an index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every,
    random_state=321
)

  and should_run_async(code)


In [101]:
vis = gensimvis.prepare(model, corpus, dictionary, sort_topics=False)
pyLDAvis.save_html(vis, 'lda.html')

  and should_run_async(code)


In [74]:
#docs

  and should_run_async(code)


[['father', 'dysfunctional', 'selfish', 'drag', 'kid', 'dysfunction', 'run'],
 ['thanks',
  'lyft',
  'credit',
  'use',
  'cause',
  'offer',
  'wheelchair',
  'van',
  'pdx',
  'disapointed',
  'getthanked'],
 ['bihday', 'majesty'],
 ['model', 'love', 'take', 'time', 'ur', 'model_love', 'take_time'],
 ['factsguide', 'society', 'motivation'],
 ['huge',
  'fan',
  'fare',
  'big',
  'talking',
  'leave',
  'chaos',
  'pay',
  'dispute',
  'get',
  'allshowandnogo'],
 ['camping', 'tomorrow', 'danny'],
 ['next',
  'school',
  'year',
  'year',
  'exam',
  'think',
  'school',
  'exam',
  'hate',
  'imagine',
  'actorslife',
  'revolutionschool',
  'girl'],
 ['love',
  'land',
  'allin',
  'cavs',
  'champion',
  'cleveland',
  'clevelandcavaliers'],
 ['welcome', 'gr8'],
 ['ireland',
  'consumer',
  'price',
  'index',
  'mom',
  'climbed',
  'previous',
  'may',
  'blog',
  'silver',
  'gold',
  'forex',
  'may_blog',
  'silver_gold'],
 ['selfish',
  'orlando',
  'standwithorlando',
  'p