In [2]:
import pymc as pm
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder
from pymc.CommonDeterministics import CompletedDirichlet as CompDir
from pymc import Dirichlet as Dir
from pymc import Categorical as Cat
from pymc import Model, MCMC
import math
## This is to create a small dataset from the bbc new dataset
import nltk
import gensim
from gensim.parsing.preprocessing import STOPWORDS
np.random.seed(42)
from collections import Counter
from gensim.utils import simple_preprocess
from nltk.stem import WordNetLemmatizer, SnowballStemmer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/tudor/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
class LDA(object):
    def __init__(self, data, nr_of_topics, a =1, b=1, iterations=4000):

        self.K = nr_of_topics
        self.M = len(data) # nr_of_documents
#         print(self.M)
        self.N = [len(document) for document in data] # nr of words for each document
        self.iter = iterations
        self.burnin = iterations/4
        
        # create a list of unique words
        words = []
        for document in data:
            for word in document:
                if word not in words:
                    words.append(word)
        
        
        self.V = len(words)
        self.encoder = LabelEncoder()
        
        # a,b are used to assign different values to alpha and beta
        self.alpha = np.ones(self.K) * a
        self.beta = np.ones(self.V) * b
        
        self.vocabulary = words
        self.documents =  self.preprocess(data, words)

    def compileModel(self):
    
        # Create the probabilistic variables
        self.prior_phi = pm.Container([Dir(f'prior_phi_{k}', self.beta) for k in range(self.K)])
        self.phi = pm.Container([CompDir(f'phi_{k}', self.prior_phi[k]) for k in range(self.K)])

        self.prior_theta = pm.Container([Dir(f'prior_theta_{m}', self.alpha) for m in range(self.M)])
        self.theta = pm.Container([CompDir(f'theta_{m}', self.prior_theta[m]) for m in range(self.M)])

        self.z = pm.Container([Cat(f"z_{m}",
                                p=self.theta[m],
                                size=self.N[m],
                                value=np.random.randint(self.K, size=self.N[m])) for m in range(self.M)])

        self.w = pm.Container([Cat(f"w_{m}_{n}",
                                p=pm.Lambda(f"phi_{m}_{n}", lambda z=self.z[m][n],phi=self.phi:phi[z]),
                                value=self.documents[m][n],
                                observed=True,
                                verbose=False) for m in range(self.M) for n in range(self.N[m])])

        # create the model
        self.model = pm.Model([self.prior_phi, self.prior_theta, self.phi, self.theta, self.z, self.w])
        self.mcmc = pm.MCMC(self.model)
        self.mcmc.sample(self.iter,self.burnin, thin=1)
     

    def preprocess(self, data, words):
        # encode the data
        self.encoder = LabelEncoder().fit(words)
        documents = []
        for document in data:
                documents.append(self.encoder.transform(document).tolist())
        return documents
    
    def __del__(self):
        pass

    def showResults(self):
        # print the results of theta, phi and z
        theta_values = [self.mcmc.trace(f"theta_{m}")[:].mean(axis=0) for m in range(self.M)]
        phi_values = [self.mcmc.trace(f"phi_{k}")[:].mean(axis=0) for k in range(self.K)]
        z_values = [np.round(self.mcmc.trace(f"z_{m}")[:].mean(axis=0)) for m in range(self.M)]
        
        print(f"Theta is\n")
        for thetas in theta_values:
            print(thetas)
        print(f"Phi is \n")
        for phis in phi_values:
            print(phis)
        print(f"Z is \n")
        for zs in z_values:
            print(zs)

    
    def mostImportantNWords(self, n=5):
        # print the most important words
        phi_values = [self.mcmc.trace(f"phi_{k}")[:].mean(axis=0) for k in range(self.K)]
        for i, values in enumerate(phi_values):
            print(f"Topic {i}")
            ids  = np.argsort(phi_values[i][0])[::-1]
            ids = ids[:n]
            restult = ""
            for id_ in ids:
                restult +=   str(self.vocabulary[id_]) + ", "
            print(restult)
            
    ## Task 2
    def documentSimilarity(self, threshold):
        # compare paris of documents and compute the hellinger distance
        theta_values = [self.mcmc.trace(f"theta_{m}")[:].mean(axis=0) for m in range(self.M)]
        similarities = []
        for index_doc1, theta1 in enumerate(theta_values):
            for index_doc2, theta2 in enumerate(theta_values):
                if index_doc1 != index_doc2:
                    similarities.append([index_doc1, index_doc2, 1 - self.hellingerDistance(theta1, theta2)])

        similarities = [similarity for similarity in similarities if similarity[2] > threshold]
        
        return similarities

    @staticmethod
    def hellingerDistance(document1, document2):
        score = 0
        for index in range(len(document1[0])):
            score = score + pow((math.sqrt(document1[0][index]) - (math.sqrt(document2[0][index]))), 2)

        return round(score,4)

                
    def assignNewTopic(self, document):
        # assign to a new document a topic
        seen_words = [x for x in document if x in self.vocabulary]
        encoded = self.encoder.transform(seen_words)
        phi_values = [self.mcmc.trace(f"phi_{k}")[:].mean(axis=0) for k in range(self.K)]
        probs = []
        for topic in range(self.K):
            topic_prob = 0
            for index in encoded:
                topic_prob += phi_values[topic][0][index]
            probs.append(topic_prob)
        
        max_index = np.argsort(probs)[-1]
        print("The topic of the document is ",max_index)

## Sanity check 1

In [4]:
ex1 = [["aaa", "bbb", "aaa"],
       ["bbb", "aaa", "bbb"],
        ["aaa", "bbb", "bbb", "aaa"],
        ["uuu", "vvv"],
        ["uuu", "vvv","vvv"],
        ["uuu", "vvv", "vvv", "uuu"]]


test1 = LDA(data=ex1, nr_of_topics=2 , a=0.75, b=0.75, iterations=1000)
test1.compileModel()



 [-----------------100%-----------------] 1000 of 1000 complete in 1.3 sec

In [5]:
test1.mostImportantNWords(2)

Topic 0
bbb, aaa, 
Topic 1
vvv, uuu, 


In [6]:
test1.showResults()
test1.documentSimilarity(0.99)

Theta is

[[0.74704154 0.25295846]]
[[0.78559115 0.21440885]]
[[0.68558918 0.31441082]]
[[0.25369908 0.74630092]]
[[0.03471068 0.96528932]]
[[0.40922949 0.59077051]]
Phi is 

[[0.31595751 0.47970243 0.08612931 0.11821074]]
[[0.21522033 0.00903309 0.30815887 0.46758771]]
Z is 

[0. 0. 0.]
[0. 0. 0.]
[0. 0. 0. 0.]
[1. 1.]
[1. 1. 1.]
[1. 1. 1. 1.]


[[0, 1, 0.9979], [0, 2, 0.9953], [1, 0, 0.9979], [2, 0, 0.9953]]

In [8]:
test_Text = ["uuu", "uuu", "uuuuu", "CCC"]
test1.assignNewTopic(test_Text)

The topic of the document is  1


## Topic Modeling

In order to use a real dataset of text, it needs to be preprocessed in order to remove punctuations, common words and transform the nouns into singular form.

In [7]:
my_stop_words = STOPWORDS.union(set(['say']))

def lemmatize(text):
    return WordNetLemmatizer().lemmatize(text, pos='v')

def process_text(text, action="lemmatization_and_stemming"):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in my_stop_words and len(token) > 3:
            processed_token = lemmatize(token)
            if processed_token not in my_stop_words:
                result.append(processed_token)
    return result

In [8]:
data = pd.read_csv("./bbc-text.csv")
data.head(20)

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...
5,politics,howard hits back at mongrel jibe michael howar...
6,politics,blair prepares to name poll date tony blair is...
7,sport,henman hopes ended in dubai third seed tim hen...
8,sport,wilkinson fit to face edinburgh england captai...
9,entertainment,last star wars not for children the sixth an...


### For the dataset, the BBC News dataset will be used. It contains news text from 5 categories.


The dataset is taken from [Kaggle](https://www.kaggle.com/balatmak/newsgroup20bbcnews)

In [1]:
categories = data.category.unique()
SAMPLES = 4
LENGTH = 30
input_data = []
# Sample from the big dataset, only the first SAMPLE lines from each category
# I'm making sure that the dataset is balanced, equal number of samples from each category
for cat in categories:

    category_df = data[data.category == cat].head(SAMPLES)
    for i, text in enumerate(category_df.text):
        np.random.randint(0,i+6)
        sentence = process_text(text)
        input_data.append(sentence[:LENGTH+i])


lda_model = LDA(data=input_data, nr_of_topics=len(categories),iterations=40000, a=0.75,b=0.75)

NameError: name 'data' is not defined

In [95]:
lda_model.compileModel()

 [-                 2%                  ] 1082 of 40000 complete in 8.7 secHalting at iteration  1103  of  40000


In [96]:
lda_model.showResults()

Theta is

[[nan nan nan nan nan]]
[[nan nan nan nan nan]]
[[nan nan nan nan nan]]
[[nan nan nan nan nan]]
[[nan nan nan nan nan]]
[[nan nan nan nan nan]]
[[nan nan nan nan nan]]
[[nan nan nan nan nan]]
[[nan nan nan nan nan]]
[[nan nan nan nan nan]]
[[nan nan nan nan nan]]
[[nan nan nan nan nan]]
[[nan nan nan nan nan]]
[[nan nan nan nan nan]]
[[nan nan nan nan nan]]
[[nan nan nan nan nan]]
[[nan nan nan nan nan]]
[[nan nan nan nan nan]]
[[nan nan nan nan nan]]
[[nan nan nan nan nan]]
Phi is 

[[nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
  nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
  nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
  nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
  nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
  nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
  nan nan nan nan nan nan nan nan nan nan nan nan nan nan

  ret, rcount, out=ret, casting='unsafe', subok=False)


In [97]:
lda_model.documentSimilarity(threshold= 0.9)



[]

In [98]:
lda_model.mostImportantNWords(n=10)

Topic 0
alzheimer, victims, december, crude, price, cold, weather, unite, state, europe, 
Topic 1
alzheimer, victims, december, crude, price, cold, weather, unite, state, europe, 
Topic 2
alzheimer, victims, december, crude, price, cold, weather, unite, state, europe, 
Topic 3
alzheimer, victims, december, crude, price, cold, weather, unite, state, europe, 
Topic 4
alzheimer, victims, december, crude, price, cold, weather, unite, state, europe, 




In [99]:
# Generate a random text from the database
categories = data.category.unique()
random_col = np.random.randint(len(data.index))
test_Text = process_text(data.iloc[random_col][1])
real_category = data.iloc[random_col][0]
print(test_Text)


['clarke', 'press', 'card', 'home', 'secretary', 'charles', 'clarke', 'vow', 'plough', 'plan', 'card', 'despite', 'pause', 'think', 'charles', 'kennedy', 'leader', 'david', 'blunkett', 'resignation', 'good', 'opportunity', 'question', 'legislation', 'necessary', 'clarke', 'support', 'plan', 'blunkett', 'argue', 'cabinet', 'support', 'card', 'mean', 'create', 'secure', 'society', 'clarke', 'acknowledge', 'measure', 'introduce', 'remain', 'matter', 'debate', 'legislation', 'significantly', 'influence', 'recommendations', 'commons', 'home', 'affairs', 'committee', 'issue', 'debate', 'parliament', 'monday', 'schedule', 'insist', 'earlier', 'kennedy', 'party', 'oppose', 'card', 'plan', 'deeply', 'flaw', 'christmas', 'come', 'home', 'secretary', 'time', 'think', 'tell', 'radio', 'today', 'programme', 'clarke', 'report', 'enthusiastic', 'card', 'predecessors', 'wouldn', 'good', 'opportunity', 'home', 'secretary', 'broom', 'sweep', 'clean', 'respect', 'need', 'legislation', 'place', 'ask', 'ea

In [100]:
lda_model.assignNewTopic(test_Text)

The topic of the document is  4




## Task 2 


### Topic based similarity

The similarity is calculated using the [helinger distance](https://en.wikipedia.org/wiki/Hellinger_distance) among each document. In this case I have managed to compare all pairs of documents and if the similiarity is above a certain treshold, it is printed.

### Assigning new topics

In order to assign a new topic to a document, the document is split into individual words, then each word is encoded (previously unseen words are skipped). Then for each topic we calculate the sum of probabilities for each word. To get the topic we return the index of the highest sum of probabilities. 