In [2]:
!pip install gensim



In [3]:
from gensim.corpora import Dictionary
from gensim.models import Phrases, LdaModel
from gensim.utils import simple_preprocess
import os
import pandas as pd
import numpy as np
import re

In [1]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem. The dataset and model paths used
# in this notebook are all located under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [22]:
# Location of the cleaned tweet dataset on the mounted Drive.
CLEAN_TWEETS_CSV = '/content/drive/MyDrive/FYP/Data/Islamophobic-Tweets/islamophobic-tweets-clean.csv'
df_clean = pd.read_csv(CLEAN_TWEETS_CSV)

# Binarise the label: 0 stays 0, every other value collapses to 1.
df_clean['class'] = df_clean['class'].ne(0).astype('int64')
df_clean

Unnamed: 0,document,class
0,start early,0
1,jumaat kareem muslim brother world alhamdulill...,1
2,thorpe england head coach pakistan silverwood ...,0
3,plymouth cricket transmission,0
4,jungkook catch eat food ramadan imaam friend m...,1
...,...,...
8366,cricket scoreline,0
8367,qanon terrorist organisation trump well call m...,1
8368,franchise cricket tournament well fielding cpl...,0
8369,muslim apj abdul kalam sir terrorist eye proph...,1


In [25]:
from gensim.models import CoherenceModel

# Tokenise every document with gensim's simple_preprocess (one token list per row).
tokenized_data = df_clean['document'].apply(simple_preprocess)

# Build the vocabulary, then prune its extremes (thresholds passed below),
# reporting the vocabulary size before and after the pruning step.
dictionary = Dictionary(tokenized_data)
vocab_size_before = len(dictionary)
print("Before removing extremes", vocab_size_before)

dictionary.filter_extremes(no_below=5, no_above=0.50)
vocab_size_after = len(dictionary)
print("After removing extremes", vocab_size_after)

# Bag-of-words representation of each document against the pruned vocabulary.
corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_data]

Before removing extremes 9338
After removing extremes 1651


In [26]:
from sklearn.feature_extraction.text import CountVectorizer

def doc_term_matrix(df):
    """Fit a CountVectorizer on ``df['document']`` and report the matrix size.

    Prints the number of documents and the vocabulary size, i.e. the two
    dimensions of the resulting document-term matrix.

    Args:
        df: Table-like object exposing a 'document' column (presumably text
            strings, as produced by the cleaning step -- verify against caller).

    Returns:
        A ``(vectorizer, matrix)`` tuple: the fitted CountVectorizer and the
        document-term matrix returned by its ``fit_transform``.
    """
    vectorizer = CountVectorizer()

    # Rows are documents, columns are vocabulary terms, entries are frequencies.
    matrix = vectorizer.fit_transform(df['document'])

    # Report the matrix dimensions.
    n_documents, vocabulary_size = matrix.shape
    print("Total no. of documents:", n_documents)
    print("\nSize of the vocabulary:", vocabulary_size, "\n")
    return vectorizer, matrix

# Split the tweets by label so each class can be analysed separately.
df_class_0 = df_clean.loc[df_clean['class'].eq(0)]
df_class_1 = df_clean.loc[df_clean['class'].eq(1)]

In [28]:
# Vectorise the class-0 documents; doc_term_matrix also prints the matrix dimensions.
cv_clean, dtm_clean = doc_term_matrix(df_class_0)

# The vocabulary terms, and the per-term frequency totals over all documents
# (the column-wise sum comes back nested, so take its first row).
terms = cv_clean.get_feature_names_out().tolist()
sums = dtm_clean.sum(axis=0).tolist()[0]

# Pair each term with its total and rank from most to least frequent.
sumdf = (
    pd.DataFrame({'Term': terms, 'Freq Sum': sums})
    .sort_values(by='Freq Sum', ascending=False)
)
sumdf.head(20)

Total no. of documents: 3493

Size of the vocabulary: 6023 



Unnamed: 0,Term,Freq Sum
1312,cricket,2340
4089,play,215
3931,pakistan,201
5344,test,164
2813,ipl,148
5298,team,138
1836,england,124
3395,match,113
1180,congress,112
1570,dhoni,104


In [29]:
# Vectorise the class-1 documents; doc_term_matrix also prints the matrix dimensions.
# Note: this rebinds cv_clean / dtm_clean / sums / terms / sumdf from the class-0 cell.
cv_clean, dtm_clean = doc_term_matrix(df_class_1)

# The vocabulary terms, and the per-term frequency totals over all documents
# (the column-wise sum comes back nested, so take its first row).
terms = cv_clean.get_feature_names_out().tolist()
sums = dtm_clean.sum(axis=0).tolist()[0]

# Pair each term with its total and rank from most to least frequent.
sumdf = (
    pd.DataFrame({'Term': terms, 'Freq Sum': sums})
    .sort_values(by='Freq Sum', ascending=False)
)
sumdf.head(20)

Total no. of documents: 4878

Size of the vocabulary: 5627 



Unnamed: 0,Term,Freq Sum
3399,muslim,1966
2173,happy,1377
3371,muharram,1297
2584,islamic,1253
3490,new,897
5064,terrorist,849
4090,radical,628
2667,jehadi,526
798,brother,517
5059,terrorism,353


In [32]:
# Fit the LDA topic model on the bag-of-words corpus built from the full dataset.
ldamodel = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=50,
    chunksize=2000,
    passes=10,
    iterations=20,
    alpha='auto',
    eta="auto",
    random_state=0,  # fixed seed so repeated runs are comparable
)

In [33]:
# u_mass coherence, computed here from the bag-of-words corpus.
coherence_model = CoherenceModel(
    model=ldamodel,
    corpus=corpus,
    dictionary=dictionary,
    coherence='u_mass',
)
umass_score = coherence_model.get_coherence()
print("Umass:", umass_score)

# c_v coherence, computed here from the tokenised texts.
# After this cell, coherence_model refers to the c_v model.
coherence_model = CoherenceModel(
    model=ldamodel,
    dictionary=dictionary,
    texts=tokenized_data,
    coherence='c_v',
)
cv_score = coherence_model.get_coherence()
print("Coherence:", cv_score, "\n")

Umass: -17.48143220866108
Coherence: 0.5762647097339983 



In [16]:
from gensim.test.utils import datapath  # NOTE(review): no longer used in this cell; import retained only in case other cells reference it
import os

# Derive the folder name from the model that is actually in memory, so the label
# cannot drift from what gets saved. The previous hard-coded name said
# "80topics ... 0.499cv" while this notebook trains a 50-topic model and reports
# a c_v coherence of about 0.576.
cv_score = coherence_model.get_coherence()  # coherence_model is the c_v model after the evaluation cell
folder_name = (
    f"lda-twitter-islamaphobia-{ldamodel.num_topics}topics-standardfilter-"
    f"{ldamodel.passes}pass-{ldamodel.iterations}iters-{cv_score:.3f}cv"
)

# Build the target directory path once and reuse it for both mkdir and save.
model_dir = f"/content/drive/MyDrive/FYP/Models/{folder_name}"
os.makedirs(model_dir, exist_ok=True)

# Use a plain path join: gensim's datapath() is a helper for its bundled test
# data and only worked here because an absolute argument overrides its prefix.
temp_file = os.path.join(model_dir, "model")
ldamodel.save(temp_file)