In [38]:
import spacy
import re
import nltk
import numpy as np
import pandas as pd
from pprint import pprint
import pickle
import gensim
import logging
import warnings
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models.ldamodel import LdaModel
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")
import pyLDAvis
from gensim.models.phrases import Phraser


In [10]:
# Environment check: print the NumPy version available to this kernel.
import numpy as np
print(np.__version__)


1.19.2


In [3]:
# Load the input dataset from Parquet; later cells read its 'paragraphs' column.
df = pd.read_parquet("../data/nation.parquet")

In [4]:
# English stop-word list from NLTK, used by the token-filtering code below.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to have been
# downloaded on this machine beforehand — confirm.
stopwords = nltk.corpus.stopwords.words('english')

In [5]:
def remove_emails_nl_quotes(sentence):
    """Strip e-mail addresses, collapse whitespace, drop single quotes, then tokenize.

    Parameters
    ----------
    sentence : str or missing value
        Raw text. ``None`` and float NaN (``np.nan``, ``float('nan')``,
        ``np.float64('nan')``) are treated as missing. Any other non-string
        value is converted with ``str()`` before cleaning.

    Returns
    -------
    list of str or None
        Tokens from ``gensim.utils.simple_preprocess(..., deacc=True)``, or
        ``None`` when ``sentence`` is missing.
    """
    # An identity test against np.nan only matches that one singleton object;
    # None or a NaN created elsewhere would fall through and crash in re.sub.
    if sentence is None or (isinstance(sentence, float) and np.isnan(sentence)):
        return None
    text = str(sentence)
    # Raw strings keep the regex escapes from being read as (invalid) string escapes.
    text = re.sub(r'\S*@\S*\s?', '', text)   # e-mail-like tokens
    text = re.sub(r'\s+', ' ', text)         # newlines / repeated whitespace
    text = re.sub("'", "", text)             # single quotes
    return gensim.utils.simple_preprocess(text, deacc=True)



In [None]:
def remove_urls(text):
    """Return ``text`` with ``http(s)://...`` and ``www....`` style URLs deleted.

    Only the matched URL characters are removed; surrounding whitespace is
    left as it was.
    """
    url_regex = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+|www\.[a-zA-Z0-9./]+'
    )
    return url_regex.sub('', text)

In [6]:
def word_tokenization(sentence):
    """Drop stop words from an already-tokenized sentence.

    Parameters
    ----------
    sentence : iterable of str or None
        Tokens to filter. ``None`` (what ``remove_emails_nl_quotes`` returns
        for missing text) yields an empty list instead of raising.

    Returns
    -------
    list of str
        The tokens that are not in the module-level ``stopwords`` collection,
        in their original order.
    """
    if sentence is None:
        return []
    # Build a set once per call so each membership test is O(1) rather than a
    # linear scan of the stop-word list for every token.
    stopword_set = set(stopwords)
    return [token for token in sentence if token not in stopword_set]


In [7]:
def preprocessing(df):
    """Clean, phrase-merge and lemmatize the ``'Review'`` column of ``df``.

    Steps: drop rows with missing values, tokenize each review, remove stop
    words, merge frequent bigrams/trigrams, keep only the lemmas of nouns,
    adjectives, verbs and adverbs, then run a final ``simple_preprocess`` /
    stop-word pass over those lemmas.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'Review'`` column. **Mutated in place**: rows with a
        missing value in any column are dropped, and the columns
        ``'Preprocessed'`` and ``'No_Stopwords'`` are added.

    Returns
    -------
    list of list of str
        One token list per remaining row of ``df``, in row order.
    """
    # Deliberately in place: train_lda reads df['Review'] after this call and
    # needs it to line up row-for-row with the returned documents.
    df.dropna(inplace=True)
    df['Preprocessed'] = df['Review'].apply(remove_emails_nl_quotes)
    df['No_Stopwords'] = df['Preprocessed'].apply(word_tokenization)
    data_words = df['No_Stopwords'].values

    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
    trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
    bigram_model = gensim.models.phrases.Phraser(bigram)
    trigram_model = gensim.models.phrases.Phraser(trigram)

    # The trigram model was trained on a single bigram pass, so apply the
    # bigram phraser exactly once before it (it used to be applied twice).
    texts = [trigram_model[bigram_model[doc]] for doc in data_words]

    nlp = spacy.load("en_core_web_sm")
    allowed_postags = {'NOUN', 'ADJ', 'VERB', 'ADV'}
    stopword_set = set(stopwords)

    data_ready = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        lemmas = [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        # Re-tokenize and re-filter this document only. The original re-ran
        # this pass over every accumulated document on each loop iteration,
        # which was quadratic in the number of documents.
        data_ready.append(
            [word for word in simple_preprocess(" ".join(lemmas))
             if word not in stopword_set])

    return data_ready


In [None]:
def train_lda(num_topics, df):
    """Train an LDA topic model on ``df['Review']`` and persist the artefacts.

    Parameters
    ----------
    num_topics : int
        Number of topics passed to ``LdaModel``.
    df : pandas.DataFrame
        Frame with a ``'Review'`` column. It is modified in place by
        ``preprocessing`` (rows dropped, helper columns added).

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``format_topic_sentences`` (defined outside this
        block), with a ``'Review'`` column added and the index reset.

    Side effects
    ------------
    Pickles the dictionary, the bag-of-words corpus and the trained model
    under ``lda_topic_modelling/`` and writes the result frame in Parquet
    format to ``../data/dominant_topic.csv``.
    """
    data = preprocessing(df)
    id2word = corpora.Dictionary(data)
    # Context managers guarantee the files are flushed and closed; the bare
    # open(...) calls used previously leaked the handles.
    with open('lda_topic_modelling/bagofwords.bow', 'wb') as dictionary_file:
        pickle.dump(id2word, dictionary_file)
    corpus = [id2word.doc2bow(text) for text in data]
    with open('lda_topic_modelling/corpus.corpora', 'wb') as corpus_file:
        pickle.dump(corpus, corpus_file)
    lda_model = LdaModel(corpus=corpus, id2word=id2word, num_topics=num_topics, random_state=100, update_every=1, chunksize=10,
                         passes=5, alpha='symmetric', iterations=100,
                         per_word_topics=True)
    with open('lda_topic_modelling/lda.sav', 'wb') as model_file:
        pickle.dump(lda_model, model_file)
    df_topics_sents_keywords = format_topic_sentences(
        ldamodel=lda_model, corpus=corpus, texts=data)
    # preprocessing() dropped incomplete rows from df in place, so these
    # values line up with the documents the model was trained on.
    df_topics_sents_keywords['Review'] = df['Review'].values
    df_dominant_topic = df_topics_sents_keywords.reset_index()
    # NOTE(review): the file is Parquet despite the ".csv" suffix. The path is
    # left unchanged because other code may read it — confirm before renaming.
    df_dominant_topic.to_parquet('../data/dominant_topic.csv')
    return df_dominant_topic

In [18]:
# Naive tokenization: split every paragraph on single space characters.
text_data = [paragraph.split(" ") for paragraph in df['paragraphs']]

In [19]:
# Build the token vocabulary, then encode each document as a bag-of-words vector.
dictionary = corpora.Dictionary(text_data)
corpus = list(map(dictionary.doc2bow, text_data))

In [20]:
NUM_TOPICS = 10
# LDA training is stochastic; without a fixed seed the topics change on every
# run. 100 is the same seed train_lda passes to LdaModel.
RANDOM_STATE = 100
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
    random_state=RANDOM_STATE,
)

In [25]:
# Prepare the pyLDAvis view of the fitted LDA model and display it.
# NOTE(review): the AttributeError recorded under this cell ("'PreparedData'
# object has no attribute 'display'") suggests the name `pyLDAvis` had been
# rebound to a prepare() result earlier in that kernel session — presumably
# stale output; re-run from a fresh kernel to confirm.
import pyLDAvis.gensim

# sort_topics=False is passed so pyLDAvis does not reorder the topics.
lda_display = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

AttributeError: 'PreparedData' object has no attribute 'display'

In [31]:
# Spot-check the tokenizer on a handful of rows only; printing every paragraph
# floods the notebook output and bloats the saved file.
PREVIEW_ROWS = 5
for sentence in df['paragraphs'].head(PREVIEW_ROWS):
    print(gensim.utils.simple_preprocess(str(sentence), deacc=True))

['no', 'purchases', 'pleaselog', 'in']
['daniel', 'mururu', 'is', 'accused', 'of', 'radicalisation', 'defiling', 'underage', 'girls', 'and', 'raping', 'women', 'daniel', 'mururu', 'is', 'accused', 'of', 'radicalisation', 'defiling', 'underage', 'girls', 'and', 'raping', 'women', 'the', 'senate', 'argues', 'that', 'the', 'high', 'court', 'order', 'violates', 'natural', 'justice', 'by', 'excluding', 'the', 'legislative', 'body', 'from', 'the', 'proceedings', 'pastor', 'bizarre', 'rituals', 'blood', 'sprinkling', 'forced', 'sexual', 'acts', 'on', 'the', 'pulpit', 'damaris', 'kathira', 'and', 'her', 'friends', 'planned', 'for', 'an', 'occasion', 'to', 'mark', 'new', 'chapter', 'in', 'their', 'lives', 'kenha', 'officials', 'outline', 'three', 'options', 'to', 'end', 'slaughter', 'at', 'the', 'killer', 'bridge', 'in', 'tharaka', 'nithi', 'the', 'van', 'was', 'part', 'of', 'convoy', 'carrying', 'group', 'of', 'friends', 'who', 'were', 'in', 'meru', 'for', 'parental', 'blessing', 'ceremony', '

In [32]:
# Tokenize every paragraph with gensim's simple_preprocess (deacc=True strips accents).
tokenized_sentences = [
    gensim.utils.simple_preprocess(str(paragraph), deacc=True)
    for paragraph in df['paragraphs'].values
]

In [46]:
# `stopwords` is a plain list, so `in` scans it linearly for every token;
# a set makes each lookup O(1) without changing which tokens are kept.
stopword_set = set(stopwords)
no_stopwords = [
    [word for word in sentence if word.lower() not in stopword_set]
    for sentence in tokenized_sentences
]


In [47]:
# Learn candidate two-word phrases from the stop-word-filtered documents.
bigram = gensim.models.Phrases(
    no_stopwords,
    min_count=5,
    threshold=100,
)

In [48]:
# Wrap the trained phrase model in a Phraser, then apply it to each document.
bigram_phraser = Phraser(bigram)
bigrams = [bigram_phraser[tokens] for tokens in no_stopwords]

In [49]:
# Show only the first few documents; rendering the full nested list produces
# an enormous cell output.
no_stopwords[:3]

[['purchases', 'pleaselog'],
 ['daniel',
  'mururu',
  'accused',
  'radicalisation',
  'defiling',
  'underage',
  'girls',
  'raping',
  'women',
  'daniel',
  'mururu',
  'accused',
  'radicalisation',
  'defiling',
  'underage',
  'girls',
  'raping',
  'women',
  'senate',
  'argues',
  'high',
  'court',
  'order',
  'violates',
  'natural',
  'justice',
  'excluding',
  'legislative',
  'body',
  'proceedings',
  'pastor',
  'bizarre',
  'rituals',
  'blood',
  'sprinkling',
  'forced',
  'sexual',
  'acts',
  'pulpit',
  'damaris',
  'kathira',
  'friends',
  'planned',
  'occasion',
  'mark',
  'new',
  'chapter',
  'lives',
  'kenha',
  'officials',
  'outline',
  'three',
  'options',
  'end',
  'slaughter',
  'killer',
  'bridge',
  'tharaka',
  'nithi',
  'van',
  'part',
  'convoy',
  'carrying',
  'group',
  'friends',
  'meru',
  'parental',
  'blessing',
  'ceremony',
  'four',
  'victims',
  'succumb',
  'deep',
  'machete',
  'cuts',
  'angry',
  'villagers',
  'beat