In [4]:
import re

import nltk
import numpy as np
import pandas as pd
from gensim import corpora
from gensim.models import CoherenceModel, LdaModel, Phrases, TfidfModel, phrases
from nltk import PorterStemmer
from nltk.corpus import stopwords
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score
from utils import load_and_describe_raw_data

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\40752\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\40752\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
#Basic operations
class ProcessData:
    def __init__(self,dataset):
        self.dataframe = dataset

    def eliminate_labels(self):
        self.dataframe.drop(columns = ['Id','Score','ViewCount','LabelNum'])

    def clean_text(self, text):
        pattern = re.compile('<[^>]*>|\'|\(|\)|\"|”|“|\?|\.|,|:|;|&|[|]|-|\\\\')
        text = text.lower()
        text = re.sub(pattern, " ", text);
        text = nltk.word_tokenize(text)
        stop_words = stopwords.words('english')
        text = [word for word in text if word not in stop_words]

        stemmer = PorterStemmer()
        try:
            text = [stemmer.stem(word) for word in text]
            text = [word for word in text if len(word) > 1]
        except IndexError:
            pass
        return text

    def merge_text_labels(self):
        self.dataframe['Content'] = self.dataframe['Title'] + self.dataframe['Body']
        self.dataframe['Content'] = self.dataframe['Content'].apply(self.clean_text)


    def eliminate_small_posts(self):
        self.dataframe = self.dataframe[self.dataframe['Content'].map(len) >= 30]
        self.dataframe = self.dataframe[self.dataframe['Content'].map(type) == list]
        self.dataframe.reset_index(drop=True, inplace=True)

In [5]:
# Process data
X_train,X_valid,X_test = load_and_describe_raw_data()
dataset_train = ProcessData(X_train)
dataset_train.eliminate_labels()
dataset_train.merge_text_labels()
dataset_train.eliminate_small_posts()

Labels:Index(['Id', 'Title', 'Body', 'Score', 'ViewCount', 'Label', 'LabelNum'], dtype='object')
See dataset balance:
Label
android    37153
ios        14217
Name: count, dtype: int64
       Id                                              Title  \
0  197234  Drop\stop mobile data connection (non-wifi) by...   
1  114800  How to automatically crop text messages when S...   
2  124532        Can't find text message that was to a group   
3  193875           Can't store contacts on my Android phone   
4   50332  Dropbox on Samsung Galaxy - where is the Setti...   

                                                Body  Score  ViewCount  \
0  <p>Can I set Android 4.4.2 to drop mobile data...      0         34   
1  <p>Is there a way to prevent the Messages app ...      0        836   
2  <p>When John Doe texts to a group that include...      1         28   
3  <p>I was going through all of my installed app...      0        158   
4  <p>On a Sony Xperia, the settings button in Dr...      1  

In [6]:
#Model class
class LDA():

    def __init__(self,data):
        self.data = data
        self.corpus = []
        self.id2word = None
        self.model = None


    def tfidf(self):
        self.id2word = corpora.Dictionary(self.data)
        print(self.id2word)
        for text in self.data:
            self.corpus.append(self.id2word.doc2bow(text))
        tfidf = TfidfModel(self.corpus,id2word=self.id2word)

        
        low_value = 0.025
        for i in range(0, len(self.corpus)):
            bow = self.corpus[i]
            low_value_words = [id for id, value in tfidf[bow] if value < low_value]
            new_bow = [b for b in bow if b[0] not in low_value_words]
            self.corpus[i] = new_bow

    def ngram(self):
        bigram = Phrases(self.data, min_count=5, threshold=100)  
        bigram_mod = phrases.Phraser(bigram)

        def make_bigrams(texts):
            return [bigram_mod[doc] for doc in texts]

        data_bigram = make_bigrams(self.data)
        print(data_bigram[0])
        self.id2word = corpora.Dictionary(data_bigram)
        for text in self.data:
            self.corpus.append(self.id2word.doc2bow(text))

        tfidf = TfidfModel(self.corpus,id2word=self.id2word)
        
        low_value = 0.25
        for i in range(0, len(self.corpus)):
            bow = self.corpus[i]
            low_value_words = [id for id, value in tfidf[bow] if value < low_value]
            new_bow = [b for b in bow if b[0] not in low_value_words]
            self.corpus[i] = new_bow

    def cluster(self,topic=2,a=1e-2,b=0.5e-2):
        self.model = LdaModel(corpus=self.corpus,
                            id2word=self.id2word,
                            num_topics=2,
                            alpha="asymmetric",
                            eta="symmetric",
                            chunksize=1000,
                            minimum_probability=0.0,
                            passes=2)
        return self.model

In [8]:
# LDA with Tf-idf
LDAmodel = LDA(X_train['Content'])
LDAmodel.tfidf()
LDAtest_tfidf = LDAmodel.cluster()
print(LDAtest.show_topics())
print('\nPerplexity: ', LDAtest_tfidf.log_perplexity(LDAmodel.corpus))
coherence_model_lda = CoherenceModel(model=LDAtest_tfidf, texts=LDAmodel.data, dictionary=LDAmodel.id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

Dictionary<77449 unique tokens: ['app', 'automat', 'charact', 'charg', 'crop']...>
Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "C:\Users\40752\anaconda3\Lib\site-packages\IPython\core\interactiveshell.py", line 3526, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\40752\AppData\Local\Temp\ipykernel_17760\3668446276.py", line 5, in <module>
    print(LDAtest.show_topics())
          ^^^^^^^
NameError: name 'LDAtest' is not defined

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\40752\anaconda3\Lib\site-packages\IPython\core\interactiveshell.py", line 2120, in showtraceback
    stb = self.InteractiveTB.structured_traceback(
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\40752\anaconda3\Lib\site-packages\IPython\core\ultratb.py", line 1435, in structured_traceback
    return FormattedTB.structured_traceback(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\40752\anaconda3\Lib\site-packages\IPython\core\ultratb.py", line 1326, in st

In [None]:
#Test data
dataset_test = ProcessData(X_test)
dataset_test.eliminate_labels()
dataset_test.merge_text_labels()
dataset_test.eliminate_small_posts()

In [None]:
#Test
LDA_test_tf = LDA(dataset_test.dataframe['Content'])
LDA_test_tf.ngram()
y_pred_tf = []

for index,corpus in enumerate(LDA_test_tf.corpus):
    if 1 - LDAtest_tfidf[corpus][0][1] < 0.5:
        y_pred_tf.append(0)
    else:
        y_pred_tf.append(1)

accuracy= accuracy_score(dataset_test.dataframe['LabelNum'], y_pred_tf)
print (accuracy)
score = f1_score(dataset_test.dataframe['LabelNum'], y_pred_tf, average="macro")
print("Macro F1-Score: ", score)

In [None]:
#Visualize
import pyLDAvis.gensim

pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(LDAtest_tfidf, LDAmodel.corpus, LDAmodel.id2word)

In [None]:
#LDA with ngram
LDAmodel = LDA(dataset_train.dataframe['Content'])
LDAmodel.ngram()
LDAtest = LDAmodel.cluster()
print(LDAtest.show_topics())
print('\nPerplexity: ', LDAtest.log_perplexity(LDAmodel.corpus))
coherence_model_lda = CoherenceModel(model=LDAtest, texts=LDAmodel.data, dictionary=LDAmodel.id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

In [None]:
#Visualize
import pyLDAvis.gensim

pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(LDAtest, LDAmodel.corpus, LDAmodel.id2word)

In [None]:
# Test
dataset_test = ProcessData(X_test)
dataset_test.eliminate_labels()
dataset_test.merge_text_labels()
dataset_test.eliminate_small_posts()

In [None]:
#Test
LDA_test = LDA(dataset_test.dataframe['Content'])
LDA_test.ngram()
y_pred = []

for index,corpus in enumerate(LDA_test.corpus):
    if 1 - LDAtest[corpus][0][1] < 0.5:
        y_pred.append(0)
    else:
        y_pred.append(1)
 
accuracy= accuracy_score(dataset_test.dataframe['LabelNum'], y_pred)
print (accuracy)
score = f1_score(dataset_test.dataframe['LabelNum'], y_pred, average="macro")
print("Macro F1-Score: ", score)

In [None]:
#Hypertuning
LDAmodel = LDA(dataset_train.dataframe['Content'])
LDAmodel.ngram()

num_topics = [4,2,6,14]
alpha = ["asymmetric","symmetric",0.01,0.16,0.5,0.1]
eta = ["symmetric",0.01,0.5,0.16,1,0.01]
model_results = {
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

for topics in num_topics:
    for a in alpha:
        for b in eta:
            model = LDAmodel.cluster(topics,a,b)
            coherence_model_lda = CoherenceModel(model=model, texts=LDAmodel.data, dictionary=LDAmodel.id2word, coherence='c_v')
            coherence_lda = coherence_model_lda.get_coherence()
            model_results['Topics'].append(topics)
            model_results['Alpha'].append(a)
            model_results['Beta'].append(b)
            model_results['Coherence'].append(coherence_lda)
            print(coherence_lda)
pd.DataFrame(model_results).to_csv('./results/lda_tuning_results.csv', index=False)