# Topic Modelling on terroristic tweets

## Business Understanding
* Dataset: Data from https://www.kaggle.com/datasets/fifthtribe/how-isis-uses-twitter
* Key models:
    1. Data Processing: Bag of words (BOW)
    2. Topic Modelling: LDA, LSA and BERTopic

In [1]:
# For consistency
num_topics = 7
num_words = 25

seed = 42
# Output folder
out_dir = 'final_topicmodel_output'

### Imports

In [2]:
import pandas as pd
pd.set_option('display.precision', 5)
import numpy as np
import scipy
from scipy.stats import randint, uniform
from functools import partial
import collections
from collections import defaultdict
import os
import re #regex
import pickle
import joblib
from tqdm import tqdm
import datetime

#Visualization
import matplotlib.pyplot as plt
import seaborn as sns
custom_params = {'figure.figsize':(12,6), "axes.spines.right": False, "axes.spines.top": False}
sns.set_theme(style="ticks", rc=custom_params)

#Machine Learning imports
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical 
import keras_tuner as kt

import torch
import torch.nn.functional as F

from transformers import AutoTokenizer, AutoModelForSequenceClassification

# NLTK imports
import nltk.corpus
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
nltk.download('punkt') # for the tokenizer
nltk.download('averaged_perceptron_tagger') # for POS
nltk.download('wordnet')
nltk.download('omw-1.4')

from nltk.stem.porter import PorterStemmer

# Gensim imports
import gensim
from gensim import corpora
from gensim.models import LsiModel
from gensim.models import LdaModel
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
# from gensim.models.wrappers import LdaMallet
from gensim.models.phrases import Phrases, ENGLISH_CONNECTOR_WORDS
from gensim import similarities

#from bertopic import BERTopic

import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

# Initialize spacy 'en' model, with only tagger component (for efficiency)
import spacy
nlp = spacy.load("en_core_web_lg", disable=['parser', 'ner'])

# Create the output directory
if not os.path.exists(out_dir):
    os.makedirs(out_dir)

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\20184116\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\20184116\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\20184116\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\20184116\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\20184116\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
  from imp import reload


## Data Understanding

In [3]:
# Tweets 
import pandas as pd
data_df = pd.read_csv('tweets.csv', index_col=0)
data_df.dropna(inplace = True)
data_df.reset_index(drop=True, inplace=True)
data_df.drop(inplace=True, columns=['followers', 'numberstatuses', 'time', 'location'], axis=1)

data = data_df


In [4]:
print(data.info())
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11280 entries, 0 to 11279
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   username     11280 non-null  object
 1   description  11280 non-null  object
 2   tweets       11280 non-null  object
dtypes: object(3)
memory usage: 264.5+ KB
None


Unnamed: 0,username,description,tweets
0,AbuLaythAlHindi,Kik: abulayth2014. Ex South African. Currently...,‍‌‌‌‌‍‌‌‌🔻‌‌[ الإثنين 1436/9/19 هـ ]‍‌‌‌‌‍‍‍‍...
1,AbuLaythAlHindi,Kik: abulayth2014. Ex South African. Currently...,#BREAKING #CONFIRMED Islamic State takes contr...
2,warreporter2,"Reporting, analysing and discussing conflicts ...",@hxhassan You apostate how many re tweets did...
3,warreporter2,"Reporting, analysing and discussing conflicts ...",@hxhassan you kafir your burning aren't you th...
4,warreporter2,"Reporting, analysing and discussing conflicts ...",@jsiwat LOL but you saudi arabian tawagheet we...


## Data Preparation

In [49]:
stop_words = stopwords.words('english')
stop_words.extend(['see', 'say', 'said', 'could', 'make', 'made', 'want', 'use', 'get', 'put', 'take', 'would', 'may', 'go', 'find', 'become', 'refer', 'come', 'give',
                   'also', 'suddenly', 'right', 'like', 'il', 'back', 'o', 'herrein', 'self', 'really',
                   'one', 'many', 'several', 'line', 'cite', 'inc', 'first', 'reuters', 'zacks', 'nick', 'dim', 'carl', 'u',
                   'day', 'week', 'month', 'quarter', 'estimate', 'year', 'per', 'eps', 'last', 'past', 'long', 'since', 'today',
                  'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saterday', 'sunday'
                  'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten', 'https', 'u'
                   'de', 'le', 'la', 'est', 'un', 'et', 'je', 'pa', 'pour', 'en', 'qui', 'que', 'dans', 'isi',
                   'va', 'du', 'une', 'avec', 'ce', 'أعماق', 'من', 'هذا', 'اللي', 'لا', 'أبو', 'اللي', 'التي', 'حتى', 'شمال', 'أو', 'على', 
                    'أنا', 'sz', 'هذه', 'كان',]) 

# translations of these arabic words are: أعماق - Depths, من - From, هذا - This, الا - No,
# اللي - Who/That, التي - Which, حتى - Until, أو - Or, على - On, أنا - I, هذه - This, كان - Was

keep_list = [] # ['no', 'not'] 
for word in keep_list:
    try:
        stop_words.remove(word)
    except:
        pass

# Checking if there are still words in the list:
# any(item in keep_list for item in stop_words)

def preprocess_text(text, normalize = True, remove_stopwords = True, stop_words = stop_words, 
               tokenizer = False, lemmatizer = WordNetLemmatizer()):
    '''Return tokenized text, with the options:
    Normalization, Removal of unwanted characters, stopwords
    Attenttion: Lemmatizer needs tokenizer
    '''
    # Separate words capitalized together eg. NewsToday -> News Today
    text = re.sub( r"([A-Z]+[a-zA-Z])", r" \1", text)
    
    if normalize: # Convert words to lower case
        text = text.lower()
    
    # Tabs
    text = re.sub(r"[\n][\t]*|[\t]", " ", text) 
    # Puctuation
    #text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', '', text)
    # Numbers
    text = re.sub(r"[0-9]", "",text)
    # too long words: Incomprehensibilities is longest word “in common usage" or single letters
    text = re.sub(r"\b\w{25,90}\b|\b\w{1}\b", "",text)
    # Removing mentions (@person) and Hashtags #topic, 
    #text = re.sub(r"(@|#\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|^rt", "", text)
    # HTML
    #text = re.sub(r'(\<a href)|(<br />)', '', text)
    # Links: http://
    #text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)    
    # Extra space
    #text = re.sub('\s+',' ',text)
    
    # Tokenizer    
    tokens = nltk.word_tokenize(text)
    
    if remove_stopwords:
        text = " ".join([word for word in text.split() if word not in (stop_words)])
    
    if remove_stopwords:
        tokens = [word for word in tokens if word not in (stop_words)]

    if lemmatizer != False: # Lemmatization
        tokens = [lemmatizer.lemmatize(word) for word in tokens]
            
    if tokenizer:
        return tokens
    
    else:
        text = " ".join([word for word in tokens])
        # Extra space
        text = re.sub(r"\s+"," ",text)
        return text

text_tweets = data['tweets'].apply(lambda x: preprocess_text(x))
text_tweets.reset_index(drop = True, inplace = True)

# Tokenizing
text_tweets_tokenized = text_tweets.apply(lambda x: re.compile(r'\w+').findall(x))

In [50]:
def prepare_corpus(text_tweets_tokenized):

    """
  Input  : clean document
  Purpose: create term dictionary of our courpus and Converting list of documents (corpus) into Document Term Matrix
  Output : term dictionary and Document Term Matrix
  """
  # Creating the term dictionary of our courpus, where every unique term is assigned an index. dictionary = corpora.Dictionary(doc_clean)
    # be sure to split sentence before feed into Dictionary
#     doc_clean = [d.split() for d in doc_clean]
    dictionary = corpora.Dictionary(text_tweets_tokenized)
    #dic = corpora.Dictionary([a.split()])

  # Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in text_tweets_tokenized]
  
  # generate LSA model
    return dictionary,doc_term_matrix

dictionary, doc_term_matrix = prepare_corpus(text_tweets_tokenized)
print(f"Len dictionary: {len(dictionary)}\n")
dictionary.most_common(20)

Len dictionary: 27246



[('co', 5289),
 ('rt', 3216),
 ('isi', 1767),
 ('al', 1442),
 ('syria', 1341),
 ('killed', 886),
 ('amp', 801),
 ('iraq', 644),
 ('army', 625),
 ('assad', 620),
 ('u', 568),
 ('state', 505),
 ('breaking', 494),
 ('islamic', 486),
 ('aleppo', 476),
 ('news', 473),
 ('attack', 453),
 ('soldier', 423),
 ('de', 414),
 ('war', 390)]

In [51]:
# https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html

def lemmatization(text_tweets_tokenized, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    text_lemmatized = []
    for sent in text_tweets_tokenized:
        doc = nlp(" ".join(sent))
        if allowed_postags:
            text_lemmatized.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
        else:
            text_lemmatized.append([token.lemma_ for token in doc])
    return text_lemmatized

# Do lemmatization keeping only noun, adj, vb, adv
text_lemmatized = lemmatization(text_tweets_tokenized, allowed_postags=False)

print(text_lemmatized[:1][0][:30])

['الإثنين', 'هـ', 'الن', 'شرة', 'صوتي', 'مكتوبة', 'co', 'gvzl', 'evt', 'دولة_الخلافة']


The translations of the above arabic words are: 
الإثنين - Monday
هـ - Hijri calendar (Islamic calendar) era notation
الن - The (incomplete word, context needed)
شرة - News (contextual meaning, can also mean "marketplace" or "pleasure" in different contexts)
صوتي - Audio
مكتوبة - Written
co - Abreviation
gvzl - Abreviation
evt - Abreviation
دولة_الخلافة - Caliphate State

## Modelling

### Topic models
From NLP course session 6 by Dr. Gemma Catolino and Nemania Borovits.

#### LDA

In [23]:
def create_gensim_lda_model(num_topics = num_topics, dictionary = dictionary, 
                            doc_term_matrix = doc_term_matrix, num_words = num_words):
    """
  Input  : clean text, number of topics and number of words associated with each topic
  Purpose: create LDA model using gensim
  Output : return LDA model
  """
    
  # Generate LDA model
    ldamodel = LdaModel(doc_term_matrix, num_topics=num_topics, id2word = dictionary,passes =10, update_every = 1, chunksize=10, alpha='auto')  # Train model
    
    lda_topic_words = ldamodel.print_topics(num_topics=num_topics, num_words=num_words)
    print(lda_topic_words)
    for i,topic in enumerate(lda_topic_words):
        print('Topic ' , i , ':', re.findall(r'"(.*?)"', topic[1]))
        
    return ldamodel

lda_model = create_gensim_lda_model(num_topics, dictionary, doc_term_matrix, num_words)

[(0, '0.061*"iraq" + 0.031*"muslim" + 0.023*"ramus" + 0.023*"lolah" + 0.018*"dead" + 0.016*"saudi" + 0.014*"know" + 0.014*"top" + 0.013*"ago" + 0.011*"using" + 0.009*"government" + 0.009*"sam" + 0.009*"uncle_" + 0.009*"coco" + 0.008*"plan" + 0.008*"place" + 0.007*"never" + 0.007*"khilafah" + 0.007*"please" + 0.007*"die" + 0.006*"year" + 0.006*"minister" + 0.006*"community" + 0.006*"post" + 0.005*"ah"'), (1, '0.048*"abu" + 0.037*"source" + 0.031*"west" + 0.021*"في" + 0.020*"medium" + 0.014*"join" + 0.009*"الله" + 0.008*"noire" + 0.007*"say" + 0.007*"seen" + 0.005*"banniere" + 0.003*"witness" + 0.003*"خريطة" + 0.003*"kasimf" + 0.003*"أردوغان" + 0.003*"أوغلو" + 0.003*"العراق" + 0.003*"العسكري" + 0.003*"ما" + 0.002*"قريب" + 0.002*"هم" + 0.002*"البنتاغون" + 0.002*"للأنبار" + 0.002*"القائد" + 0.002*"وهيب"'), (2, '0.054*"rt" + 0.036*"isi" + 0.022*"killed" + 0.016*"soldier" + 0.014*"amaq" + 0.014*"agency" + 0.013*"army" + 0.012*"near" + 0.011*"amp" + 0.011*"iraqi" + 0.010*"aleppo" + 0.010*"hom

In [24]:
def create_gensim_ldamc_model(num_topics = num_topics, dictionary = dictionary, 
                            doc_term_matrix = doc_term_matrix, num_words = num_words):
    """
  Input  : clean text, number of topics and number of words associated with each topic
  Purpose: create LDA model using gensim
  Output : return LDA model
  """
    
  # Generate LDA model
    ldamc = LdaMulticore(doc_term_matrix, num_topics=num_topics, id2word = dictionary,passes =10, chunksize=10)  # Train model
    
    ldamc_topic_words = ldamc.print_topics(num_topics=num_topics, num_words=num_words)
    print(ldamc_topic_words)
    for i,topic in enumerate(ldamc_topic_words):
        print('Topic ' , i , ':', re.findall(r'"(.*?)"', topic[1]))
        
    return ldamc

ldamc_model = create_gensim_ldamc_model(num_topics, dictionary, doc_term_matrix, num_words)

[(0, '0.039*"co" + 0.035*"rt" + 0.027*"al" + 0.022*"soldier" + 0.015*"isi" + 0.015*"syrian" + 0.014*"iraq" + 0.012*"people" + 0.011*"syria" + 0.008*"amp" + 0.008*"ramus" + 0.008*"abu" + 0.008*"lolah" + 0.008*"في" + 0.008*"destroyed" + 0.007*"fight" + 0.006*"allah" + 0.006*"assad" + 0.006*"vso" + 0.005*"world" + 0.005*"anbar" + 0.004*"israeli" + 0.004*"support" + 0.004*"man" + 0.004*"saa"'), (1, '0.024*"co" + 0.022*"amaq" + 0.022*"agency" + 0.021*"city" + 0.013*"area" + 0.011*"clash" + 0.010*"wilayat" + 0.010*"time" + 0.009*"yesterday" + 0.008*"east" + 0.008*"الله" + 0.008*"rt" + 0.006*"targeted" + 0.006*"body" + 0.006*"khan" + 0.005*"weapon" + 0.005*"martyrdom" + 0.005*"map" + 0.005*"iranian" + 0.004*"free" + 0.004*"caliphate_" + 0.004*"region" + 0.003*"province" + 0.003*"used" + 0.003*"security"'), (2, '0.054*"co" + 0.020*"ypg" + 0.014*"captured" + 0.013*"al" + 0.011*"rt" + 0.009*"killing" + 0.009*"leader" + 0.008*"southern" + 0.008*"bombing" + 0.008*"nusra" + 0.007*"group" + 0.007*"h

#### LSA

In [25]:
def create_gensim_lsa_model(num_topics = num_topics, dictionary = dictionary, 
                            doc_term_matrix = doc_term_matrix, num_words = num_words):

    """
  Input  : clean document, number of topics and number of words associated with each topic
  Purpose: create LSA model using gensim
  Output : return LSA model
  """
 
  # Generate LSA model
    lsamodel = LsiModel(doc_term_matrix, num_topics=num_topics, id2word = dictionary)  # Train model
    
    lsa_topic_words = lsamodel.print_topics(num_topics=num_topics, num_words=num_words)
    print(lsa_topic_words)
    for i,topic in enumerate(lsa_topic_words):
        print('Topic ' , i , ':', re.findall(r'"(.*?)"', topic[1]))

    return lsamodel

lsa_model=create_gensim_lsa_model(num_topics, dictionary, doc_term_matrix, num_words)

  sparsetools.csc_matvecs(


[(0, '0.799*"co" + 0.390*"rt" + 0.220*"isi" + 0.173*"al" + 0.158*"syria" + 0.094*"killed" + 0.078*"amp" + 0.071*"iraq" + 0.068*"assad" + 0.064*"army" + 0.058*"state" + 0.057*"islamic" + 0.056*"u" + 0.051*"attack" + 0.050*"aleppo" + 0.045*"news" + 0.044*"soldier" + 0.043*"breaking" + 0.041*"near" + 0.037*"force" + 0.035*"rebel" + 0.034*"russia" + 0.033*"war" + 0.033*"usa" + 0.032*"syrian"'), (1, '0.660*"rt" + -0.566*"co" + 0.293*"al" + 0.152*"isi" + 0.148*"syria" + 0.131*"amp" + 0.102*"killed" + 0.096*"ramus" + 0.094*"lolah" + 0.083*"breaking" + 0.077*"assad" + 0.059*"army" + 0.054*"nidalgazaui" + 0.047*"aleppo" + 0.045*"attack" + 0.041*"iraq" + 0.036*"near" + 0.034*"ypg" + 0.034*"https" + 0.034*"news" + 0.033*"rebel" + 0.032*"war" + 0.032*"u" + 0.031*"report" + 0.030*"soldier"'), (2, '0.632*"isi" + -0.548*"rt" + 0.288*"syria" + 0.216*"al" + 0.155*"army" + 0.128*"killed" + 0.126*"iraq" + -0.122*"co" + 0.110*"assad" + 0.073*"news" + 0.072*"amp" + 0.068*"breaking" + 0.058*"soldier" + 0.05

#### Bertopic

Based on the code from https://github.com/MaartenGr/BERTopic/blob/master/notebooks/BERTopic.ipynb and https://towardsdatascience.com/topic-modeling-with-bert-779f7db187e6

In [26]:
from bertopic import BERTopic

In [27]:
# Create the right input format for BERTopic
bert_string = [' '.join(item) for item in text_lemmatized]
bert_string[1]

'break confirm islamic state take control al jusiya border post link jurud al qaa lebanon qusayr homs countryside'

In [102]:
# Train BERTopic
bert_model = BERTopic(language="english")
topics, probs = bert_model.fit_transform(bert_string)

In [103]:
# The most frequent topic that was generated (-1 refers to all outliers and should typically be ignored):
bert_model.get_topic_freq().head(10)

Unnamed: 0,Topic,Count
0,-1,5183
1,0,454
2,1,353
3,2,275
4,3,229
5,4,205
6,5,185
7,6,162
8,7,138
9,8,138


In [104]:
# Show 10 most coherent topics
bert_model.get_topic(8)[:10]

[('aleppo', 0.07632757794822638),
 ('tuman', 0.03690343600817106),
 ('khan', 0.03473459404550309),
 ('southern', 0.034205884215279585),
 ('recapture', 0.03306420278537196),
 ('rebel', 0.03286930458797517),
 ('north', 0.03208290516862498),
 ('fateh', 0.031068147546317686),
 ('jaish', 0.028939550443184848),
 ('northern', 0.027085834471829492)]

In [105]:
# To access the predicted topics for the first 10 documents
bert_model.topics_[:10]

[2, -1, -1, -1, 18, 79, -1, -1, -1, -1]

In [106]:
bert_model.visualize_topics()


distutils Version classes are deprecated. Use packaging.version instead.


distutils Version classes are deprecated. Use packaging.version instead.



In [107]:
similar_topics, similarity = bert_model.find_topics("gpu", top_n=5); similar_topics # Does not seem so see code below?

[110, 15, 135, 105, 71]

In [128]:
bert_model.get_topic(10)

[('muslim', 0.06646840587192938),
 ('islam', 0.054076591175014434),
 ('allah', 0.023972171115800717),
 ('kuffar', 0.023138815675816903),
 ('enemy', 0.022154090001847353),
 ('mujahideen', 0.02157497735235504),
 ('help', 0.021052344404251733),
 ('sharia', 0.01875174504762974),
 ('hate', 0.018707771547675816),
 ('rule', 0.018527747689969225)]

In [90]:
bert_model.get_topic(6)

[('hacker', 0.31849308161651246),
 ('hack', 0.24612804632064902),
 ('hacking', 0.14930926436852018),
 ('password', 0.10436554759226803),
 ('backdoor', 0.09953950957901346),
 ('co', 0.09529017756190662),
 ('isi', 0.08798624332834515),
 ('course', 0.07480371520320717),
 ('government', 0.06539913344683743),
 ('see', 0.0593951781292665)]

In [68]:
bert_model.get_topic(5) 

[('block', 1.1757959899805615),
 ('abu', 0.4430308315794197),
 ('list', 0.16987647660194444),
 ('barakallahu', 0.1261303367632595),
 ('mycatsarecool', 0.1261303367632595),
 ('knot', 0.1261303367632595),
 ('correctly', 0.1261303367632595),
 ('caucasian', 0.1261303367632595),
 ('zahit', 0.1261303367632595),
 ('teamblockedbyhater', 0.1261303367632595)]

In [109]:
# Save model
bert_model.save("bert_model")
# Load model
# my_model = BERTopic.load("bert_model")

### Model Selection and Evaluation

In [30]:
# Choosing the number of topics through the elbow technique.
# choose a point after which the diminishing increase of coherence score is no longer worth the additional increase of the number of topics. 
# topic_num = 7

In [63]:
def compute_coherence_values(dictionary=dictionary, doc_term_matrix=doc_term_matrix, text_lemmatized = text_lemmatized, 
                             num_words= 25, start=2, seed = seed, stop=5, tm='lsa'):

    """
  Input   : dictionary : Gensim dictionary
            corpus : Gensim corpus
            texts : List of input texts
            stop : Max num of topics
  purpose : Compute c_v coherence for various number of topics
  Output  : model_list : List of LSA topic models
            coherence_values : Coherence values corresponding to the LDA model with respective number of topics
  """
    coherence_values = []
    model_list = []

    for topic_num in range(2, stop):                      
        if tm == 'lsa':
            model = LsiModel(doc_term_matrix, num_topics = topic_num, id2word = dictionary)  # train model
        else:
            model = LdaModel(doc_term_matrix, num_topics=topic_num, id2word = dictionary,
                             passes =20, iterations = 20, update_every = 1, chunksize=2000, alpha='auto')

        model_list.append(model)
        
        topics = []
        topics_raw = model.show_topics(num_words = num_words, formatted = False)
        for topic_num, content in topics_raw:
            topics.append(list(dict(content).keys()))
            
        coherencemodel = CoherenceModel(model=model, texts=text_lemmatized, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
  
    coherence_df = pd.DataFrame(list(zip(model_list,coherence_values)), columns=['model','coherence']).sort_values(by = 'coherence', ascending = False)
    model = coherence_df.iloc[0]['model']
    print(f"Best coherence at {coherence_df.iloc[0]['model']} with {coherence_df.iloc[0]['coherence']:.3f} coherence\n")
    
    plt.plot(range(2,stop), coherence_values)
    plt.title('Number of Topics with Coherence score')
    plt.xlabel("Number of Topics")
    plt.ylabel("Coherence score")
    plt.legend(("coherence_values"), loc='best')
    plt.savefig('{}/{}_coherence.pdf'.format(out_dir, tm), dpi=300)
    plt.show()
    
    return model, coherence_df                         

In [67]:
#model_lda, coherence_df_lda = compute_coherence_values(tm = 'lda')

In [61]:
#lda_mc_model, coherence_df_lda = compute_coherence_values(tm = 'lda')

In [62]:
#lsa_model, coherence_df_lsa = compute_coherence_values(tm = 'lsa')

#### Model visualisation and Evaluation

In [39]:
# Visualize the topics for LDA
def vis_topic (model, doc_term_matrix, dictionary):
    pyLDAvis.enable_notebook()
    vis = pyLDAvis.gensim_models.prepare(model, doc_term_matrix, dictionary)
    pyLDAvis.save_html(vis, '{}/ldavis'.format(out_dir) + '.html')
    return vis

In [40]:
vis_topic(ldamc_model, doc_term_matrix, dictionary)

In [57]:
# Coherence per topic for LSA model
topics = []
topics_raw = lsa_model.show_topics(num_words = num_words, formatted = False)
for k, content in topics_raw:
    topics.append(list(dict(content).keys()))

cm = CoherenceModel(topics=topics, texts=text_lemmatized, coherence='c_v',  dictionary=dictionary)

coherence_per_topic = cm.get_coherence_per_topic()

topics_str = [ ' '.join(t) for t in topics ]
data_topic_score = pd.DataFrame(data=zip(topics_str, coherence_per_topic), columns=['Topic', 'Coherence'] )

#data_topic_score


divide by zero encountered in double_scalars


invalid value encountered in double_scalars



In [58]:
# Coherence per topic for LDA model
topics = []
topics_raw = lda_model.show_topics(num_words = num_words, formatted = False)
for k, content in topics_raw:
    topics.append(list(dict(content).keys()))

cm = CoherenceModel(topics=topics, texts=text_lemmatized, coherence='c_v',  dictionary=dictionary)

coherence_per_topic = cm.get_coherence_per_topic()

topics_str = [ ' '.join(t) for t in topics ]
data_topic_score = pd.DataFrame(data=zip(topics_str, coherence_per_topic), columns=['Topic', 'Coherence'] )

#data_topic_score


divide by zero encountered in double_scalars


invalid value encountered in double_scalars



In [59]:
# Coherence per topic for LDA Multicore model
topics = []
topics_raw = ldamc_model.show_topics(num_words = num_words, formatted = False)
for k, content in topics_raw:
    topics.append(list(dict(content).keys()))

cm = CoherenceModel(topics=topics, texts=text_lemmatized, coherence='c_v',  dictionary=dictionary)

coherence_per_topic = cm.get_coherence_per_topic()

topics_str = [ ' '.join(t) for t in topics ]
data_topic_score = pd.DataFrame(data=zip(topics_str, coherence_per_topic), columns=['Topic', 'Coherence'] )

#data_topic_score


divide by zero encountered in double_scalars


invalid value encountered in double_scalars



## Store topics
Store the topics in a .CSV file

In [65]:
def get_top_topics(bert_model, top_n=10):
    topics = []
    for i in range(1, top_n+1):
        topic = bert_model.get_topic(i)
        if topic:
            topics.append(topic)
    return topics

top_topics = get_top_topics(bert_model, top_n=10)

# Print the top topics
for topic in top_topics:
    print(topic)


[('de', 0.19227587139255753), ('le', 0.09329993164494922), ('est', 0.07327551527516411), ('pa', 0.07033660019192547), ('il', 0.04905800181240652), ('mais', 0.044085287239210895), ('sont', 0.04006429348821551), ('sa', 0.03812442323437658), ('tu', 0.03657051044197577), ('aussi', 0.032637027091420835)]
[('gaza', 0.08676822802842761), ('hamas', 0.07396478541356258), ('shia', 0.0737537173779271), ('sunni', 0.06025861952805246), ('co', 0.05865011952992209), ('israeli', 0.05858939546703095), ('israel', 0.05497709429786765), ('militia', 0.048290690570116014), ('rt', 0.043281872869154255), ('palestinian', 0.037230619590520264)]
[('ibn', 0.29150823861963854), ('quran', 0.10479466438731974), ('ibneaadam', 0.06007822379798517), ('hijaz', 0.06007822379798517), ('rt', 0.05964773661546211), ('al', 0.0575194565676869), ('qayyim', 0.05405994635637245), ('el', 0.046267967078042725), ('allh', 0.045205122856614405), ('fear', 0.04136198062557789)]
[('leithfadel', 0.8195885317916611), ('syrmukhabarat', 0.33

In [66]:
import csv

# data_objects = [
#     [('taliban', 0.21859648845049312),
#      ('kashmir', 0.11698981062656587),
#      ('afghanistan', 0.09039690629917005)],
#     [('pakistan', 0.085052315888115),
#      ('afghan', 0.06400937027709527),
#      ('http', 0.060175426989499306)],
#     [('co', 0.05744136003462341),
#      ('kashmiri', 0.04222276861708699),
#      ('khaama', 0.038010120892845445)]
# ]

filename = 'topic_words.csv'

# Save the data to a CSV file
with open(filename, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['word', 'score'])  # Write the header

    for data in top_topics:
        writer.writerows(data)  # Write the data rows for each data object

print(f"The data has been saved to {filename}.")


The data has been saved to topic_words.csv.
