In [1]:
import pandas as pd
import numpy as np

#gensim
#pip install --upgrade gensim
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
import gensim.corpora as corpora
from gensim.models import CoherenceModel
from gensim.models import ldamodel

#nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import nltk
nltk.download('wordnet')

# Plotting tools
#pip install pyldavis
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)
from sklearn.utils import resample

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/janestout/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
df = pd.read_csv('data/df_suicide_upsampled_Train.csv', low_memory=False)

df.suicide.value_counts()

False    34708
True     34707
Name: suicide, dtype: int64

In [3]:
# df_majority = df[df.suicide==0]
# df_minority = df[df.suicide==1]
# #
# df_majority_downsampled = resample(df_majority, replace=False, n_samples=6633, random_state=123)
# df_downsampled = pd.concat([df_majority_downsampled, df_minority])
# df=df_downsampled.copy()

# df.suicide.value_counts()

In [4]:
pd.set_option('display.max_colwidth', -1)
pd.options.display.max_columns = 200
df['motive'].replace(to_replace=['Unknown', 'The specific motive for the attack is unknown.'],value=np.NaN, inplace=True)
data = df[['motive']]
data.dropna(inplace = True)
data_text = data[['motive']]
# data_text['index'] = data_text.index
documents = data_text
# print(len(documents))
print(documents[:70])

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                motive
8    Attack on police camp                                                                                                                                                                                                                                                                                                                                                                                                                       

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [5]:
my_stopwords = {'motive', 'attack', 'Unknown', 'unknown', 'however', 'sources', 'specific', 'stated', 'statement', 'States', 'state', 'target', 'speculate', 'incident', 'targeted', 'targeting', 'speculated', 'suicide', 'bomb', 'bombing', 'bomber', 'responsibility', 'claim', 'claimed', 'noted', 'State', 'carried', 'majority', 'minority'}
#consider making a single set

def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and token not in my_stopwords and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [6]:
stemmer = SnowballStemmer('english')
processed_docs = documents['motive'].map(preprocess)

In [7]:
processed_docs[:70]

8      [polic, camp]                                                                                                                                                                                                                                                                                                                               
10     [tehrik, taliban, pakistan, lashkar, jhangvi, separ, retali, death, malik, ishaq, leader, lashkar, jhangvi, addit, lashkar, islam, pakistan, reveng, militari, oper, group]                                                                                                                                                                 
20     [destroy, lankan, navi, gunpoint]                                                                                                                                                                                                                                                                                        

In [8]:
### Bag of words

In [9]:
#create corpus
texts = [[''.join(item) for item in document] for document in processed_docs]
# texts = processed_docs

#create dictionary
dictionary = corpora.Dictionary(texts)
# id2word = gensim.corpora.Dictionary(processed_docs)

#Term Document Frequency
corpus = [dictionary.doc2bow(text) for text in texts]

# bow_corpus = [id2word.doc2bow(text) for text in texts] 
corpus

[[(0, 1), (1, 1)],
 [(2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 2),
  (8, 3),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 2),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1)],
 [(19, 1), (20, 1), (21, 1), (22, 1)],
 [(15, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1)],
 [(11, 1),
  (13, 1),
  (17, 1),
  (18, 1),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 1)],
 [(3, 1),
  (4, 1),
  (13, 1),
  (14, 1),
  (17, 1),
  (18, 1),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 1)],
 [(42, 1)],
 [(43, 1), (44, 1), (45, 2), (46, 1), (47, 1), (48, 1), (49, 1)],
 [(34, 1), (35, 1), (36, 1), (37, 1), (50, 1), (51, 1)],
 [(52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1)],
 [(59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1)],
 [(1, 1), (54, 1), (59, 1), (66, 1), (67, 1), (68, 1)],
 [(24, 1), (26, 1), (27, 1), (69, 1), (70, 1), (71, 1)],
 [(6, 1),
  (9, 1),
  (72, 1),
  (73, 1),
  (74, 1),
  (75, 1),

In [10]:
model = ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=3)
# lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=2, id2word=id2word, passes=2, workers=2)

#get docs and topics matrix: should be same # rows as there are motive data: 
# https://stackoverflow.com/questions/43357247/get-document-topics-and-get-term-topics-in-gensim

# tune the model by adjusting alpha or maybe beta, in order to make the words really distinct/far apart

In [11]:
# bow = lda_model.id2word.doc2bow(texts) # convert to bag of words format first
# gamma, phis, other = lda_model.get_document_topics(bow, 

topics = model.show_topics()
# lda_model.show_topics()
topics

[(0,
  '0.027*"retali" + 0.025*"islam" + 0.022*"taliban" + 0.016*"oper" + 0.015*"isil" + 0.014*"iraq" + 0.013*"levant" + 0.012*"pakistan" + 0.012*"militari" + 0.012*"member"'),
 (1,
  '0.020*"group" + 0.017*"govern" + 0.014*"forc" + 0.013*"victim" + 0.011*"polic" + 0.011*"state" + 0.011*"militari" + 0.011*"secur" + 0.009*"armi" + 0.009*"peopl"'),
 (2,
  '0.059*"shiit" + 0.053*"violenc" + 0.049*"larger" + 0.049*"sunni" + 0.042*"trend" + 0.041*"communiti" + 0.039*"iraq" + 0.038*"sectarian" + 0.017*"suspect" + 0.014*"govern"')]

In [12]:
from pprint import pprint

# Print the Keyword in the 6 topics
#probability that the term belongs to the topic; prob of seeing that word given that topic
pprint(model.print_topics())
doc_lda = model[corpus]

[(0,
  '0.027*"retali" + 0.025*"islam" + 0.022*"taliban" + 0.016*"oper" + '
  '0.015*"isil" + 0.014*"iraq" + 0.013*"levant" + 0.012*"pakistan" + '
  '0.012*"militari" + 0.012*"member"'),
 (1,
  '0.020*"group" + 0.017*"govern" + 0.014*"forc" + 0.013*"victim" + '
  '0.011*"polic" + 0.011*"state" + 0.011*"militari" + 0.011*"secur" + '
  '0.009*"armi" + 0.009*"peopl"'),
 (2,
  '0.059*"shiit" + 0.053*"violenc" + 0.049*"larger" + 0.049*"sunni" + '
  '0.042*"trend" + 0.041*"communiti" + 0.039*"iraq" + 0.038*"sectarian" + '
  '0.017*"suspect" + 0.014*"govern"')]


In [13]:
### Compute Model Perplexity and Coherence Score (interpretability of the model)

In [14]:
# Compute Perplexity
print('\nPerplexity: ', model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=model, texts=processed_docs, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -6.660648054366201

Coherence Score:  0.29984587666001555


In [15]:
### Visualize the topics-keywords

In [16]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(model, corpus, dictionary)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [17]:
# Source: https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/
# So far you have seen Gensim’s inbuilt version of the LDA algorithm. Mallet’s version, however, often gives a better quality of topics. 
# Gensim provides a wrapper to implement Mallet’s LDA from within Gensim itself. 
# Download File: http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
# install java development kit: https://www.oracle.com/technetwork/java/javase/downloads/jdk11-downloads-5066655.html

mallet_path = '/Users/janestout/Dropbox/Galvanize/DSI/Capstones/Capstone2_working/GTD/mallet-2.0.8/bin/mallet'
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=10, id2word=dictionary)

In [18]:
# Show Topics
pprint(ldamallet.show_topics(formatted=False))

# Compute Coherence Score
coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=processed_docs, dictionary=id2word, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)

[(0,
  [('taliban', 0.10999818544728725),
   ('pakistan', 0.07461440754853929),
   ('kill', 0.046960624206133186),
   ('campaign', 0.04463799673380512),
   ('tehrik', 0.03908546543277082),
   ('secur', 0.03810560696788242),
   ('govern', 0.028488477590273998),
   ('forc', 0.028307022318998367),
   ('milit', 0.022246416258392306),
   ('afghan', 0.018435855561604066)]),
 (1,
  [('group', 0.0891415330026431),
   ('state', 0.08566566494080162),
   ('unit', 0.04804663456316304),
   ('respons', 0.040298345341974726),
   ('spokesperson', 0.039900068793222054),
   ('militari', 0.037293167746840945),
   ('reveng', 0.029436257648720085),
   ('attack', 0.026539700930518845),
   ('haram', 0.025453492161193383),
   ('boko', 0.025453492161193383)]),
 (2,
  [('victim', 0.0731634851073312),
   ('polic', 0.06933818613979054),
   ('posit', 0.054296962044120926),
   ('believ', 0.03617321547946223),
   ('intimid', 0.03609893782960707),
   ('assail', 0.033610636559459256),
   ('suspect', 0.0286711728440912

NameError: name 'id2word' is not defined

In [None]:


def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, id2word=dictionary)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [None]:
# Can take a long time to run.
model_list, coherence_values = compute_coherence_values(dictionary=dictionary, corpus=corpus, texts=processed_docs, start=2, limit=10, step=2)

In [None]:
# Show graph
limit=10; start=2; step=2;
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.ylim(.25, .45)
# plt.legend(("coherence_values"), loc='best')
# plt.show()
plt.savefig('coherence')

In [None]:
# Print the coherence scores
for m, cv in zip(x, coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))