In [1]:
!pip install gensim



In [2]:
from gensim.corpora import Dictionary
from gensim.models import LsiModel, Phrases, LdaModel, TfidfModel
from gensim.utils import simple_preprocess
import os
import pandas as pd
import numpy as np
import re

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so that paths under that prefix can be read and written.
drive.mount('/content/drive')

In [3]:
# Preview the reddit-nonislamic CSV; the frame is only displayed, not bound to a name.
nonislamic_csv_path = '/content/drive/MyDrive/FYP/Data/reddit-nonislamic/clean-v2.csv'
pd.read_csv(nonislamic_csv_path)

Unnamed: 0,document
0,instead make individually casserole ahead time...
1,amp peanut butter amp exactly customary countr...
2,like cook vegetable mix spaghetti sauce great ...
3,know meal grow eat take grant eventually reali...
4,pinze traditional easter bread area eastern eu...
...,...
49962,try alpha male overload make statement force h...
49963,hey currently artist entertainment industry wa...
49964,slowly recover low injury past month give oppo...
49965,title describe lift year probably get big prob...


In [4]:
# Load the CSV used by every later cell into `df_clean`; the cells below read its
# 'document' and 'class' columns.
train_csv_path = '/content/drive/MyDrive/FYP/Data/train-test-data/train-clean-customstopwords.csv'
df_clean = pd.read_csv(train_csv_path)
df_clean

Unnamed: 0,document,class
0,time movie tie rush lazy cash grab company exp...,0
1,visit spiritual center yesterday brahma kumari...,0
2,major belief system hindu bad describe name we...,0
3,yesterday possibly form group online expect ma...,0
4,salaam alaikum spiritual young man mormon memb...,1
...,...,...
89869,despair seclude private consultation eld fathe...,1
89870,follow lengthy start paragraph turn proud form...,0
89871,party yesterday conversation friend wickedpixe...,0
89872,core belief islam iman acknowledge god sinceri...,1


In [5]:
from gensim.models import CoherenceModel

# Tokenise every document with gensim's simple_preprocess.
tokenized_data = df_clean['document'].apply(simple_preprocess)

# Build the token -> id vocabulary and report its size before and after pruning.
dictionary = Dictionary(tokenized_data)
print("Before removing extremes", len(dictionary))
# Prune by document frequency: no_below is an absolute document count,
# no_above is a fraction of the corpus.
dictionary.filter_extremes(no_below=5, no_above=0.50)
print("After removing extremes", len(dictionary))

# Bag-of-words representation of each tokenised document, using the pruned vocabulary.
corpus = list(map(dictionary.doc2bow, tokenized_data))

Before removing extremes 141726
After removing extremes 33704


In [7]:
from sklearn.feature_extraction.text import CountVectorizer

def doc_term_matrix(df):
    """Fit a CountVectorizer on ``df['document']`` and return it with its matrix.

    Prints the number of documents (matrix rows) and the vocabulary size
    (matrix columns) as a side effect.

    Args:
        df: DataFrame with a 'document' column of text.

    Returns:
        tuple: ``(vectorizer, matrix)`` where ``vectorizer`` is the fitted
        CountVectorizer and ``matrix`` is the document-term matrix it produced
        (one row per document, one column per term, values are term counts).
    """
    vectorizer = CountVectorizer()
    matrix = vectorizer.fit_transform(df['document'])

    # Rows are documents, columns are vocabulary terms.
    n_documents, vocabulary_size = matrix.shape
    print("Total no. of documents:", n_documents)
    print("\nSize of the vocabulary:", vocabulary_size, "\n")

    return vectorizer, matrix

# Split the documents by their 'class' label (0 or 1) for per-class term statistics.
df_class_0 = df_clean.loc[df_clean['class'].eq(0)]
df_class_1 = df_clean.loc[df_clean['class'].eq(1)]

In [8]:
# Most frequent terms among the class-0 documents.
cv_clean, dtm_clean = doc_term_matrix(df_class_0)

# Column sums of the document-term matrix give each term's total count.
sums = np.asarray(dtm_clean.sum(axis=0)).ravel().tolist()
# Vocabulary terms, in the same column order as `sums`.
terms = cv_clean.get_feature_names_out().tolist()

# Pair every term with its total count and rank from most to least frequent.
sumdf = pd.DataFrame({'Term': terms, 'Freq Sum': sums}).sort_values(
    by='Freq Sum', ascending=False
)
sumdf.head(20)

Total no. of documents: 49896

Size of the vocabulary: 105179 



Unnamed: 0,Term,Freq Sum
92987,time,33986
52664,live,29467
22071,day,23561
37016,god,22912
103109,work,20396
87372,start,16404
35204,game,16206
52174,life,14667
53333,love,14354
62366,new,14306


In [9]:
# Most frequent terms among the class-1 documents.
# Note: this rebinds cv_clean / dtm_clean / sums / terms / sumdf to the class-1 results.
cv_clean, dtm_clean = doc_term_matrix(df_class_1)

# Column sums of the document-term matrix give each term's total count.
sums = np.asarray(dtm_clean.sum(axis=0)).ravel().tolist()
# Vocabulary terms, in the same column order as `sums`.
terms = cv_clean.get_feature_names_out().tolist()

# Pair every term with its total count and rank from most to least frequent.
sumdf = pd.DataFrame({'Term': terms, 'Freq Sum': sums}).sort_values(
    by='Freq Sum', ascending=False
)
sumdf.head(20)

Total no. of documents: 39978

Size of the vocabulary: 70370 



Unnamed: 0,Term,Freq Sum
2902,allah,49273
41591,muslim,47833
30916,islam,34580
62850,time,28957
49809,quran,22196
15026,day,18765
48505,prophet,18249
24076,god,18002
35649,life,17641
49686,question,14875


In [None]:
# Train the 80-topic LDA model on the bag-of-words corpus.
# random_state is fixed so repeated runs use the same seed.
ldamodel = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=80,
    chunksize=2000,
    passes=10,
    iterations=20,
    alpha='auto',
    eta="auto",
    random_state=0,
)

In [None]:
# Report two coherence measures for `ldamodel`:
# u_mass is computed from the bag-of-words corpus, c_v from the tokenised texts.
umass_score = CoherenceModel(
    model=ldamodel, corpus=corpus, dictionary=dictionary, coherence='u_mass'
).get_coherence()
print("Umass:", umass_score)

coherence_model = CoherenceModel(
    model=ldamodel, dictionary=dictionary, texts=tokenized_data, coherence='c_v'
)
cv_score = coherence_model.get_coherence()
print("Coherence:", cv_score, "\n")

Umass: -3.697778306216916
Coherence: 0.5609450785185486 



In [None]:
# Sweep the number of topics from 70 to 100 and report u_mass and c_v coherence
# for each candidate model.
#
# The candidate is bound to `sweep_model`, not `ldamodel`: reusing `ldamodel`
# here would replace the model trained earlier, and the `ldamodel.save(...)`
# call later in the notebook would then persist the last sweep candidate
# instead of the model it is meant to save.
sweep_scores = []  # one dict per candidate: num_topics, u_mass, c_v
for num_topics in range(70, 101):
    sweep_model = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        chunksize=2000,
        passes=10,
        iterations=50,
        alpha='auto',
        eta="auto",
        random_state=0,
    )

    umass_score = CoherenceModel(
        model=sweep_model, corpus=corpus, dictionary=dictionary, coherence='u_mass'
    ).get_coherence()
    print(f"Topic {num_topics}")
    print("Umass:", umass_score)

    cv_score = CoherenceModel(
        model=sweep_model, dictionary=dictionary, texts=tokenized_data, coherence='c_v'
    ).get_coherence()
    print("Coherence:", cv_score, "\n")

    # Keep the numbers so they can be compared without re-reading the printed log.
    sweep_scores.append({'num_topics': num_topics, 'u_mass': umass_score, 'c_v': cv_score})

Topic 70
Umass: -3.7941830837141945
Coherence: 0.5510791258383062 

Topic 71
Umass: -3.6085525836968277
Coherence: 0.5796698674746138 

Topic 72
Umass: -3.9447324709466747
Coherence: 0.5346437077728535 

Topic 73
Umass: -3.838183246708077
Coherence: 0.5427662125037066 

Topic 74
Umass: -3.7228660995819403
Coherence: 0.5521122754727319 

Topic 75
Umass: -3.805746444218769
Coherence: 0.5564653464496686 

Topic 76
Umass: -3.746197013360337
Coherence: 0.5667846130563118 

Topic 77
Umass: -3.7410546777617886
Coherence: 0.5500397091901911 



In [None]:
from gensim.test.utils import datapath
import os

# Persist `ldamodel` to Google Drive.
# The folder name is written by hand and encodes the training configuration and
# c_v score. NOTE(review): confirm `ldamodel` currently holds the model this
# name describes before running this cell.
folder_name = "lda-combinedv2-80topics-standardfilter-10pass-20iters-0.56cv"
model_dir = os.path.join("/content/drive/MyDrive/FYP/Models", folder_name)
os.makedirs(model_dir, exist_ok=True)

# Build the target path directly. gensim.test.utils.datapath is a helper for
# gensim's own bundled test data; it joins its argument onto that directory and
# only yielded this path because the argument was absolute.
temp_file = os.path.join(model_dir, "model")
ldamodel.save(temp_file)

# Stuff

In [None]:
# islamic cc
# nonislamic c
# 20 iterations, 10 passes

Topic 30
Umass: -2.3318985095320834
Coherence: 0.6389016457574692

Topic 35
Umass: -2.7445361030286035
Coherence: 0.611417395143216

Topic 40
Umass: -3.218818926133237
Coherence: 0.6126672395213195

Topic 45
Umass: -3.190339735724582
Coherence: 0.6216660621555852

Topic 50
Umass: -3.315046098300495
Coherence: 0.5826914147005909

Topic 55
Umass: -3.6968907142596326
Coherence: 0.5876203689564127

Topic 60
Umass: -3.5163564764897166
Coherence: 0.589442844735515

Topic 65
Umass: -3.6201825440754796
Coherence: 0.5939832345741837

Topic 70
Umass: -3.753999746030975
Coherence: 0.5785155194243962

Topic 75
Umass: -4.011428225755204
Coherence: 0.5497337451235206

In [None]:
Topic 50
Umass: -3.4849449955690743
Coherence: 0.5978870112758177

Topic 51
Umass: -3.383653020438186
Coherence: 0.5815379170295788

Topic 52
Umass: -3.4791071890971037
Coherence: 0.5833348972331247

Topic 53
Umass: -3.043295326726094
Coherence: 0.6120812427038057

Topic 54
Umass: -3.191870921967857
Coherence: 0.602310002394736

Topic 55
Umass: -3.6082071587230313
Coherence: 0.5694278995391375

Topic 56
Umass: -3.4036553913342686
Coherence: 0.5923411330844591

Topic 57
Umass: -3.8994160460369622
Coherence: 0.5645958781467241

Topic 58
Umass: -3.4862583780142673
Coherence: 0.5963657141699222

Topic 59
Umass: -3.7614072027160756
Coherence: 0.5693762708775798

Topic 60
Umass: -3.347049390072008
Coherence: 0.5880605130027443

Topic 61
Umass: -3.3486813444631522
Coherence: 0.5956209356680653

Topic 62
Umass: -3.4943289843533725
Coherence: 0.5798419840887635

Topic 63
Umass: -3.896075808512371
Coherence: 0.5693331552951464

Topic 64
Umass: -4.001799307982365
Coherence: 0.5545774260866119

Topic 65
Umass: -3.648739356642238
Coherence: 0.5726094041318446

Topic 66
Umass: -3.8119825026727234
Coherence: 0.560302035224527

Topic 67
Umass: -4.1115637358738715
Coherence: 0.5450382896863778

Topic 68
Umass: -3.868882639974538
Coherence: 0.5514457760212226

Topic 69
Umass: -3.759433357614923
Coherence: 0.5693134895700054

Topic 71
Umass: -4.060752163815541
Coherence: 0.5587661146348574

Topic 72
Umass: -4.067399107287997
Coherence: 0.5740528088835077