### Task 3

In [294]:
import pandas as pd
import numpy as np

import re
import json
import string
from datetime import datetime, timedelta

import translators as ts

# Natural language processing libraries
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Twitter library
import tweepy

#Data visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [295]:
import pyLDAvis 
import pyLDAvis.gensim_models
pyLDAvis.enable_notebook()

import gensim
from gensim import corpora
import pickle
import bz2
import json

import warnings
warnings.filterwarnings("ignore")

import spacy

In [296]:
influential_people_tweets = pd.read_csv('influential_people_tweets.csv', delimiter=',')
influential_people_tweets.head()

Unnamed: 0.1,Unnamed: 0,id,country_code,lang,user,tweet_text_orginal,tweet_text_en,tweet_date,context_annotations
0,0,1240013952861511680,RS,sr,Response(data=<User id=356450858 name=Александ...,Поносни смо на наше пријатељство.\r\nНикада не...,we are proud of our friendship we will never f...,2020-03-17 20:35:39+00:00,[]
1,1,1239873649999523845,RS,sr,Response(data=<User id=356450858 name=Александ...,Бескрајно хвала на свему нашој кинеској браћи ...,infinitely thank you for all our chinese broth...,2020-03-17 11:18:08+00:00,"[{'domain': {'id': '123', 'name': 'Ongoing New..."
2,2,1239310408760074240,RS,sr,Response(data=<User id=356450858 name=Александ...,"Предаја није, никада није била и никада неће б...",the surrender is not it was never and will nev...,2020-03-15 22:00:01+00:00,[]
3,3,1238813645385187328,RS,sr,Response(data=<User id=356450858 name=Александ...,"Pадимо, боримо се и урадићемо све што треба. С...",we fall we fight and we will do whatever you n...,2020-03-14 13:06:03+00:00,[]
4,4,1237796648161599491,RS,sr,Response(data=<User id=356450858 name=Александ...,"Одлуке доноси струка, не политика.\r\nХвала на...",decisions make a profession not politics thank...,2020-03-11 17:44:52+00:00,[]


In [297]:
country_code = 'RS'

In [298]:
tweets_from_country = influential_people_tweets.loc[influential_people_tweets['country_code']==country_code, 
                                                        'tweet_text_en']

tweets_se = []
for row in tweets_from_country:
    tweets_se.append(str(row).split())

In [299]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

data_words = tweets_se

print(data_words[:1])

[['we', 'are', 'proud', 'of', 'our', 'friendship', 'we', 'will', 'never', 'forget', 'the', 'help', 'of', 'our', 'chinese', 'friends']]


In [300]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

['we', 'are', 'proud', 'of', 'our', 'friendship', 'we', 'will', 'never', 'forget', 'the', 'help', 'of', 'our', 'chinese', 'friends']


In [301]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

In [302]:
# Initialize spaCy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spaCy download en
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [303]:
data_words_bigrams = make_bigrams(data_words)

# Perform lemmatization keeping noun, adjective, verb, and adverb
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN']) #, 'ADJ', 'VERB', 'ADV'

print(data_lemmatized[:1])

[['friendship', 'help', 'friend']]


In [304]:
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1)]]


In [305]:
id2word[0]

'friend'

In [306]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('friend', 1), ('friendship', 1), ('help', 1)]]

In [307]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=6, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=5,
                                           passes=10,
                                           alpha='auto',
                                           eval_every=5, 
                                           per_word_topics=True)

In [308]:
print(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0, '0.046*"moron" + 0.046*"bass" + 0.037*"gym" + 0.026*"money" + 0.026*"luxury" + 0.022*"thank" + 0.015*"doctor" + 0.013*"device" + 0.007*"hahahaha" + 0.007*"exercise"'), (1, '0.118*"debil" + 0.033*"corona" + 0.027*"pit" + 0.027*"hoce" + 0.024*"time" + 0.015*"virus" + 0.011*"device" + 0.010*"quarantine" + 0.009*"pm" + 0.006*"exercise"'), (2, '0.045*"debille" + 0.045*"photo" + 0.029*"week" + 0.016*"praise" + 0.016*"sapcac" + 0.016*"dog" + 0.009*"lot" + 0.008*"t" + 0.008*"skolism" + 0.007*"mountain"'), (3, '0.043*"people" + 0.040*"part" + 0.031*"debt" + 0.028*"year" + 0.028*"course" + 0.024*"album" + 0.024*"truth" + 0.024*"mystery" + 0.023*"voice" + 0.022*"motivational"'), (4, '0.058*"monkey" + 0.022*"insider" + 0.022*"match" + 0.022*"head" + 0.022*"ball" + 0.022*"’s" + 0.014*"plan" + 0.012*"device" + 0.010*"occasion" + 0.010*"murder"'), (5, '0.044*"parent" + 0.032*"brother" + 0.026*"verenica" + 0.026*"pineapple" + 0.025*"child" + 0.022*"hour" + 0.015*"night" + 0.014*"apple" + 0.014*"b

In [309]:
for  index,score in sorted(lda_model[corpus[2]][0], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.8771467208862305	 
Topic: 0.118*"debil" + 0.033*"corona" + 0.027*"pit" + 0.027*"hoce" + 0.024*"time" + 0.015*"virus" + 0.011*"device" + 0.010*"quarantine" + 0.009*"pm" + 0.006*"exercise"

Score: 0.048222366720438004	 
Topic: 0.043*"people" + 0.040*"part" + 0.031*"debt" + 0.028*"year" + 0.028*"course" + 0.024*"album" + 0.024*"truth" + 0.024*"mystery" + 0.023*"voice" + 0.022*"motivational"

Score: 0.019779805094003677	 
Topic: 0.046*"moron" + 0.046*"bass" + 0.037*"gym" + 0.026*"money" + 0.026*"luxury" + 0.022*"thank" + 0.015*"doctor" + 0.013*"device" + 0.007*"hahahaha" + 0.007*"exercise"

Score: 0.019081344828009605	 
Topic: 0.058*"monkey" + 0.022*"insider" + 0.022*"match" + 0.022*"head" + 0.022*"ball" + 0.022*"’s" + 0.014*"plan" + 0.012*"device" + 0.010*"occasion" + 0.010*"murder"

Score: 0.019027287140488625	 
Topic: 0.044*"parent" + 0.032*"brother" + 0.026*"verenica" + 0.026*"pineapple" + 0.025*"child" + 0.022*"hour" + 0.015*"night" + 0.014*"apple" + 0.014*"base" + 0.014*"mi

In [310]:
import pyLDAvis.gensim_models as gensimvi
from gensim.models.coherencemodel import CoherenceModel
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -9.446107391445409

Coherence Score:  nan


In [311]:
pyLDAvis.enable_notebook()
visualization = gensimvi.prepare(lda_model, corpus, id2word)
visualization

  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
