#  COMP4560 - Artefact
## Implementation of Topic model


Bokun Kong, u6342099

Supervisor: Dr. Dongwoo Kim

This jupyter notebook file is created by Bokun Kong.

Dataset of Russian troll: https://github.com/fivethirtyeight/russian-troll-tweets/


In [1]:
# Import libraries
import numpy as np
import pandas as pd
import glob
import string

import nltk
from nltk.tokenize import WordPunctTokenizer
from nltk.stem import WordNetLemmatizer, SnowballStemmer

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora

  (fname, cnt))
  (fname, cnt))


In [2]:
# Reading files and extract useful information
allfiles = glob.glob('training dataset/IRAhandle_tweets_*.csv')
allfiles.sort()

data = pd.concat([pd.read_csv(f) for f in allfiles], ignore_index = True)
print('Number of rows and columns of the whole dataset: {}'.format(data.shape))


df_en = data[(data['language'] == 'English') & (data['account_category'] != 'NonEnglish') & (~data['content'].isnull())]

df_lnr = df_en[((df_en['account_category'] == 'LeftTroll') | (df_en['account_category'] == 'RightTroll'))]

df_lnr = df_lnr.reset_index(drop=True)

# Creat a new dataset containing useful tweets
df_lnr.to_csv("left_right_news.csv", index=False)

  if (yield from self.run_code(code, result)):
  if (yield from self.run_code(code, result)):


Number of rows and columns of the whole dataset: (2435342, 21)


In [3]:
# Read the generated dataset
df_tweets = pd.read_csv("left_right_news.csv", parse_dates=['publish_date'])
df_tweets = df_tweets.drop(['external_author_id', 'author', 'region', 'language', 'harvested_date', 'following', 'followers', 'updates', 'post_type', 'account_type', 'retweet', 'new_june_2018', 'alt_external_id', 'article_url', 'tco1_step1', 'tco2_step1', 'tco3_step1', 'tweet_id'], axis=1)
df_tweets['index'] = df_tweets.index
df_tweets.shape

  interactivity=interactivity, compiler=compiler, result=result)


(984045, 4)

In [5]:
# Data pre-processing
punctuation = set(string.punctuation)
stemmer = SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()

def preprocessor(text):
    tokens = WordPunctTokenizer().tokenize(text.lower())
    stems = []
    for token in tokens:
        if token.isalpha() and token not in gensim.parsing.preprocessing.STOPWORDS and token not in punctuation and len(token) >= 3 and len(token) <= 14:
            stems.append(lemmatizer.lemmatize(token, pos='v'))
   
    return stems

docs = df_tweets['content'].map(preprocessor)

In [65]:
# Creating dictionary for corpus
dictionary = corpora.Dictionary(docs)
dictionary.filter_n_most_frequent(40)
dictionary.filter_extremes(no_below=6, keep_n=100000)

print(dictionary)
# Generate Document-Term matrix
doc_term_matrix = [dictionary.doc2bow(doc) for doc in docs]

Dictionary(40112 unique tokens: ['barely', 'corruption', 'democrat', 'hear', 'mainstream']...)


In [66]:
# Implementing TF-IDF
from gensim import corpora, models

tfidf = models.TfidfModel(doc_term_matrix)
corpus_tfidf = tfidf[doc_term_matrix]

# Print
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.36661796692922144),
 (1, 0.2687982153569831),
 (2, 0.24648225277166924),
 (3, 0.22712687135389736),
 (4, 0.3034109454103004),
 (5, 0.45942562653400876),
 (6, 0.37300551049882885),
 (7, 0.2660573236666947),
 (8, 0.2698457552997815),
 (9, 0.30843511314335414)]


In [61]:
# LDA using Bag of Words
# Generate model
Lda = gensim.models.ldamodel.LdaModel
lda_model = gensim.models.LdaMulticore(doc_term_matrix, num_topics=20, id2word=dictionary, iterations=50, decay=0.5, offset=1.0, gamma_threshold=0.001, passes=1)

In [62]:
for idx, topic in lda_model.print_topics(num_words=10):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.032*"tcot" + 0.020*"pjnet" + 0.014*"ccot" + 0.010*"wakeupamerica" + 0.010*"refugees" + 0.008*"islamkills" + 0.007*"love" + 0.007*"american" + 0.007*"come" + 0.006*"support"
Topic: 1 
Words: 0.016*"nowplaying" + 0.014*"music" + 0.011*"play" + 0.010*"spend" + 0.009*"million" + 0.009*"water" + 0.009*"best" + 0.007*"soundcloud" + 0.006*"action" + 0.006*"shit"
Topic: 2 
Words: 0.022*"great" + 0.011*"anti" + 0.010*"gun" + 0.008*"demndebate" + 0.007*"control" + 0.007*"demdebate" + 0.006*"attack" + 0.006*"blame" + 0.005*"fight" + 0.005*"muslim"
Topic: 3 
Words: 0.018*"love" + 0.011*"live" + 0.008*"check" + 0.007*"life" + 0.007*"sell" + 0.005*"god" + 0.005*"state" + 0.005*"girls" + 0.005*"business" + 0.005*"food"
Topic: 4 
Words: 0.012*"mind" + 0.010*"damn" + 0.009*"potus" + 0.009*"lose" + 0.008*"build" + 0.007*"way" + 0.007*"day" + 0.006*"heart" + 0.006*"fear" + 0.005*"power"
Topic: 5 
Words: 0.011*"free" + 0.009*"racist" + 0.008*"speech" + 0.007*"school" + 0.007*"illegal" +

In [67]:
# LDA using TF-IDF
Lda = gensim.models.ldamodel.LdaModel
lda_model_tfidf = gensim.models.LdaMulticore(doc_term_matrix, num_topics=20, id2word=dictionary, iterations=100, decay=0.7,  gamma_threshold=0.0005, passes=5)

In [68]:
for idx, topic in lda_model_tfidf.print_topics(num_words=10):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.009*"foxnews" + 0.009*"christmas" + 0.008*"best" + 0.007*"spend" + 0.007*"meet" + 0.007*"sessions" + 0.007*"open" + 0.006*"king" + 0.006*"guess" + 0.006*"start"
Topic: 1 
Words: 0.010*"nfl" + 0.010*"order" + 0.009*"work" + 0.009*"texas" + 0.008*"happen" + 0.007*"vegas" + 0.007*"sign" + 0.006*"wait" + 0.006*"good" + 0.006*"team"
Topic: 2 
Words: 0.013*"matter" + 0.012*"true" + 0.009*"soros" + 0.009*"anti" + 0.009*"judge" + 0.008*"stand" + 0.008*"country" + 0.007*"justice" + 0.006*"train" + 0.006*"blacktwitter"
Topic: 3 
Words: 0.011*"refugees" + 0.010*"islamkills" + 0.008*"islam" + 0.007*"demndebate" + 0.007*"demdebate" + 0.007*"muslim" + 0.007*"woman" + 0.007*"syria" + 0.007*"muslims" + 0.007*"terrorists"
Topic: 4 
Words: 0.052*"cnn" + 0.019*"fakenews" + 0.008*"attack" + 0.007*"democrats" + 0.006*"hurt" + 0.005*"officials" + 0.005*"worst" + 0.005*"stand" + 0.005*"leave" + 0.005*"enjoy"
Topic: 5 
Words: 0.019*"shoot" + 0.018*"year" + 0.018*"kill" + 0.015*"old" + 0.015

In [48]:
# another example of topic result using tf-idf
for idx, topic in lda_model_tfidf.print_topics(num_words=10):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.022*"pjnet" + 0.017*"job" + 0.015*"great" + 0.012*"ccot" + 0.012*"retweet" + 0.011*"illegal" + 0.010*"things" + 0.009*"trumptrain" + 0.007*"damn" + 0.007*"strong"
Topic: 1 
Words: 0.018*"korea" + 0.018*"north" + 0.015*"isis" + 0.014*"wrong" + 0.013*"target" + 0.012*"islam" + 0.011*"muslims" + 0.009*"law" + 0.008*"water" + 0.006*"perfect"
Topic: 2 
Words: 0.018*"mar" + 0.010*"claim" + 0.010*"bernie" + 0.008*"sanders" + 0.007*"drug" + 0.007*"worst" + 0.007*"border" + 0.006*"security" + 0.006*"terrorism" + 0.005*"candidate"
Topic: 3 
Words: 0.011*"years" + 0.007*"dog" + 0.007*"rice" + 0.006*"secret" + 0.006*"sexual" + 0.005*"imo" + 0.005*"susan" + 0.005*"assault" + 0.005*"worry" + 0.005*"ago"
Topic: 4 
Words: 0.012*"life" + 0.009*"agree" + 0.008*"dems" + 0.007*"jeff" + 0.006*"neverhillary" + 0.006*"sessions" + 0.006*"hate" + 0.006*"blame" + 0.006*"voters" + 0.005*"travel"
Topic: 5 
Words: 0.019*"music" + 0.012*"listen" + 0.011*"play" + 0.010*"supporters" + 0.009*"wow" +

In [52]:
# another example of topic result using tf-idf
for idx, topic in lda_model_tfidf.print_topics(num_words=10):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 25 
Words: 0.027*"government" + 0.022*"game" + 0.022*"million" + 0.019*"freedom" + 0.013*"play" + 0.011*"number" + 0.009*"hat" + 0.008*"problem" + 0.008*"constitution" + 0.008*"enlist"
Topic: 13 
Words: 0.024*"antifa" + 0.013*"forget" + 0.009*"comey" + 0.009*"jam" + 0.009*"blm" + 0.008*"respect" + 0.007*"intelligence" + 0.006*"attack" + 0.006*"violence" + 0.006*"house"
Topic: 38 
Words: 0.018*"hit" + 0.014*"season" + 0.013*"phone" + 0.013*"deep" + 0.010*"wanna" + 0.009*"veteran" + 0.008*"threaten" + 0.008*"chief" + 0.007*"secretary" + 0.007*"marine"
Topic: 33 
Words: 0.021*"voters" + 0.014*"imo" + 0.013*"base" + 0.012*"brother" + 0.009*"establishment" + 0.009*"admin" + 0.008*"carolina" + 0.008*"reject" + 0.007*"coach" + 0.007*"policies"
Topic: 6 
Words: 0.023*"country" + 0.020*"retweet" + 0.016*"sport" + 0.016*"best" + 0.014*"music" + 0.014*"beautiful" + 0.013*"soundcloud" + 0.013*"rock" + 0.011*"click" + 0.011*"play"
Topic: 12 
Words: 0.026*"illegal" + 0.016*"west" + 0.011*"kae