# IMPORTS

In [1]:
# all the relevant imports are done here
%load_ext autoreload
%autoreload 2
%matplotlib inline
import sys
basedir = 'C:\\Users\\rpetr\\OneDrive\\Desktop\\DISS_CODE\\ms2ldaviz\\ms2ldaviz'
sys.path.append(basedir)
import django
import json
django.setup()
from basicviz.models import Experiment, Alpha, Mass2MotifInstance, FeatureInstance, Feature, Document, Mass2Motif, DocumentMass2Motif, FeatureMass2MotifInstance
import numpy as np
import pylab as plt
import csv
from scipy.special import polygamma as pg
from scipy.special import psi as psi

MEDIA_ROOT is C:\Users\rpetr\OneDrive\Desktop\DISS_CODE\ms2ldaviz\ms2ldaviz\media


# VARIABLES

###### Choose an experiment. In this case it is experiment 190. It has 500 topics, 27923 words and 2132 unique docs. These were tested against the database using appropriate queries. 

In [2]:
experiment_id=190 
experiment = Experiment.objects.get(id=experiment_id)
min_prob_beta = 1e-3
SMALL_NUMBER = 1e-100
eta = 0.1 #needed for beta m-step

# CORPUS (features for 1 document in experiment 190)

###### First we get all features in the database for our experiment. 

In [3]:
# Get all features in the database relevant for our experiment. 
features = Feature.objects.filter(experiment_id=experiment)
experiment_words = []
for f in features:
     if f.id not in experiment_words: 
        experiment_words.append(f.id)

In [14]:
# Unique words lists all the features as {feature_id:incremental_id} word value pairs. 
unique_words = {}
index = 0
for word in experiment_words:
    if word not in unique_words.keys():
        unique_words.update({word:index})
        index+=1

###### Then we get a random document for our experiment from the database. 

In [16]:
#We will use a single document for our experiment. Modify here if more documents are needed. 
experiment_docs=[269323]

In [17]:
#unique documents is the dictionary -> doc: id 
unique_docs = {}
index = 0 
for doc in experiment_docs: 
    unique_docs.update({doc:index})
    index+=1

###### Get the features only for the specific documents and create the corpus dictionary {DOC:{WORD:COUNT}}. 

In [18]:
# Get features for all documents chosen. The output columns are doc_id, word_id and intensity.
feature_instances = FeatureInstance.objects.filter(document_id__in=unique_docs.keys(), feature_id__in=unique_words.keys())
doc_word_data = []
for f in feature_instances:
    doc_word_data.append([unique_docs[int(f.document_id)], unique_words[int(f.feature_id)], f.intensity])

In [19]:
# Output a csv for the corpus in order to create a dictionary made up of {document_id:{word_id:intensity}}. 
# Intensity in this case is an integer (count).
doc_word_array = np.array(doc_word_data)
np.savetxt("corpus_data.csv", doc_word_array, delimiter=",", fmt="%s")
np.save("corpus_data",doc_word_array)

In [25]:
#CREATE THE CORPUS - a dictionary where key is document id and value is a dict of the count of words
corpus_dict = {}
with open("corpus_data.csv", 'r') as data_file:
    data = csv.DictReader(data_file, delimiter=",")
    for row in data:
        item = corpus_dict.get(row["doc_id"], dict())
        item[row["word_id"]] = int(row["count"])
        corpus_dict[row["doc_id"]] = item
#Get the corpus dict whenever this is necessary

# UNIQUE TOPICS

In [31]:
# Get the 500 unique topics for the experiment. 
mi = Mass2Motif.objects.filter(experiment=experiment)
unique_topics = {}
index=0
for m in mi: 
    unique_topics.update({m.id:index})
    index+=1

# ALPHA

In [35]:
# get the alphas from the database (it is a vector)
al = Alpha.objects.filter(mass2motif__experiment=experiment).order_by('mass2motif')
alphas = {}
for a in al:
    alphas.update({unique_topics[a.mass2motif_id]: a.value})

In [42]:
alpha_list = alphas.values()

In [44]:
alpha_vector = np.array(alpha_list)

In [45]:
# save to text if necessary 
# np.savetxt("alpha.csv", alpha_vector, delimiter=",", fmt="%s")

# BETA

In [94]:
# Get beta from the database (it is a topic * words 2d matrix - each cell is a probability)
beta_pre_pivot = []
mi = Mass2MotifInstance.objects.filter(mass2motif__experiment=experiment)
for m in mi:
    beta_pre_pivot.append([unique_topics[m.mass2motif_id], unique_words[m.feature_id], m.probability]) 

In [95]:
# Some topics may have 0 words - these have been reincluded 
# Creating array from the beta data and subsequently creating a pivot (matrix)
output_arr_beta = np.array(beta_pre_pivot)
K = len(unique_topics)
W = len(unique_words)
pivot_table = np.zeros((K, W)).astype('float')
i = 0
max = len(beta_pre_pivot)
while i<max:
    pivot_table[int(output_arr_beta[i][0]),int(output_arr_beta[i][1])]=output_arr_beta[i][2]
    i+=1

In [96]:
# Use this to get beta csv. 
# np.savetxt("beta.csv", pivot_table, delimiter=",")

In [97]:
# Normalise the beta pivot table matrix. 
pivot_table_normalised = pivot_table
i = 0
while i<500: 
    row = pivot_table_normalised[i, :]
    adjusted_row = row + 1e-8
    normalised_row = adjusted_row / np.sum(adjusted_row)
    np.sum(normalised_row)
    pivot_table_normalised[i, :] = normalised_row
    i+=1

In [98]:
# Use this to output a csv for the beta matrix if needed. 
# np.savetxt("beta_matrix.csv", pivot_table_normalised, delimiter=",")

# VISUALISATION

In [100]:
# use for visualisation if necessary 
# my_dpi=150
# plt.figure(figsize=(2000/my_dpi, 2000/my_dpi), dpi=my_dpi)
# plt.imshow(pivot_table_normalised, aspect="auto")

# GET ORIGINAL THETA(NORM GAMMA)

In [101]:
# get the original theta from the database for subsequent comparison 
# remember theta is just normalised gamma and represents a docs * topics matrix 
theta = DocumentMass2Motif.objects.filter(document_id__in=experiment_docs)
output_data_theta = []
for t in theta:
    output_data_theta.append([unique_docs[int(t.document_id)], unique_topics[int(t.mass2motif_id)], t.probability])

In [103]:
output_data_theta

[[0, 94, 0.902611552679438],
 [0, 286, 0.0340671553770332],
 [0, 200, 0.0186782260549867],
 [0, 414, 0.0109110112103501],
 [0, 499, 0.0158815907979228]]

# GET ORIGINAL PHI 

In [104]:
# get the features related to the instances
feature_instance = FeatureInstance.objects.filter(document_id__in=experiment_docs)
feature_instance_join = {}
for i in feature_instance:
    feature_instance_join.update({int(i.id):[int(i.document_id), int(unique_words[i.feature_id])]})

In [105]:
# connect docs, topics and features into a list of lists
feature_m2m_instance = FeatureMass2MotifInstance.objects.filter(mass2motif__experiment=experiment)
phi_list = []
for i in feature_m2m_instance:
    if i.featureinstance_id in feature_instance_join.keys():
        phi_list.append([feature_instance_join[int(i.featureinstance_id)][0], unique_topics[int(i.mass2motif_id)], feature_instance_join[int(i.featureinstance_id)][1],i.probability])

In [107]:
# This gives the original phi, which in abstract terms is a 3D matrix -> docs * topics * words
phi_original = []
for line in phi_list: 
    phi_original.append([line[0],line[2],line[1],line[3]])
phi_original_array = np.array(phi_original)

In [108]:
# np.savetxt("phi_original_array.csv", phi_original_array, delimiter=",", fmt="%s")
# np.save("phi_original_array", phi_original_array)

# E-STEP (has 9 steps)

## Step 0 - E-step variables

In [110]:
# alpha_vector is already mentioned above
# beta_matrix is created here from pivot_table_normalised
# K and W are from above for total unique topics, total unique words respectively 
# you need a corpus (created above)
corpus = corpus_dict
beta_matrix = pivot_table_normalised

## Step 1 - Initialise phi matrix

In [132]:
# initialise the 3D matrix phi with zeroes
phi_matrix={}
for doc in corpus: 
    d = int(doc)
    phi_matrix[d] = {}
    for word in corpus[doc]:
        w = int(word)
        phi_matrix[d][w]=np.zeros(K)

## Step 2 - initialise gamma matrix

In [140]:
# create a gamma matrix with rows as documents and columns as topics 
# this will later be transposed in order to create the phi matrix in the steps 3-9 below
# doc_total is the count of words per doc, and each gamma is alpha plus that

gamma_matrix=np.zeros((int(len(corpus)),int(K))) #3x500 shape
for doc in corpus:
    doc_total=0.0
    for word in corpus[doc]:
        doc_total += corpus[doc][word]
    gamma_matrix[int(doc),:] = alpha_vector + 1.0*(doc_total/K)

In [141]:
gamma_matrix

array([[14.732, 14.732, 14.732, 14.732, 14.732, 14.732, 14.732, 14.732,
        14.732, 14.732, 14.732, 14.732, 14.732, 14.732, 14.732, 14.732,
        14.732, 14.732, 14.732, 14.732, 14.732, 14.732, 14.732, 14.732,
        14.732, 14.732, 14.732, 14.732, 14.732, 14.732, 14.732, 14.732,
        14.732, 14.732, 14.732, 14.732, 14.732, 14.732, 14.732, 14.732,
        14.732, 14.732, 14.732, 14.732, 14.732, 14.732, 14.732, 14.732,
        14.732, 14.732, 14.732, 14.732, 14.732, 14.732, 14.732, 14.732,
        14.732, 14.732, 14.732, 14.732, 14.732, 14.732, 14.732, 14.732,
        14.732, 14.732, 14.732, 14.732, 14.732, 14.732, 14.732, 14.732,
        14.732, 14.732, 14.732, 14.732, 14.732, 14.732, 14.732, 14.732,
        14.732, 14.732, 14.732, 14.732, 14.732, 14.732, 14.732, 14.732,
        14.732, 14.732, 14.732, 14.732, 14.732, 14.732, 14.732, 14.732,
        14.732, 14.732, 14.732, 14.732, 14.732, 14.732, 14.732, 14.732,
        14.732, 14.732, 14.732, 14.732, 14.732, 14.732, 14.732, 

## Step 3 - 9: repeat until convergence loop

In [155]:
# initialise phi and do Blei's loop 
n_words = int(len(unique_words))
temp_beta = np.zeros((K, n_words))
current_gamma = gamma_matrix
for i in range(10):   
    prev_gamma = current_gamma
    for doc in corpus:
        d = int(doc)
        doc_dict = corpus[doc]
        temp_gamma = np.zeros(K) + alpha_vector
        for word in doc_dict: #the word is actually column positioning so we do not need n^3 complexity 
            w = int(word)
            log_phi_matrix = np.log(beta_matrix[:,w]) + psi(gamma_matrix[d,:]).T
            log_phi_matrix = np.exp(log_phi_matrix - log_phi_matrix.max())
            phi_matrix[d][w] = log_phi_matrix/log_phi_matrix.sum()
            temp_gamma += phi_matrix[d][w]*corpus[doc][word]
            temp_beta[:,w] += phi_matrix[d][w] * corpus[doc][word]
        gamma_matrix[d,:] += temp_gamma[0,]
        pos = np.where(gamma_matrix[d,:]<SMALL_NUMBER)[0]
        gamma_matrix[d,pos] = SMALL_NUMBER
    current_gamma = gamma_matrix
    gamma_diff = ((current_gamma - prev_gamma)**2).sum()
#     beta_matrix = temp_beta
    print(i)
    print(gamma_matrix.sum())

0
55511.27194821689
1
56105.65802165169
2
56700.04409508646
3
57294.43016852122
4
57888.816241955996
5
58483.202315390794
6
59077.58838882556
7
59671.97446226033
8
60266.3605356951
9
60860.7466091299


In [148]:
lola = phi_matrix[0][10001]*corpus['0']['10001']
lola

array([1.19365850e-06, 7.15192127e-01, 1.27815026e-06, 1.18438250e-06,
       1.15722327e-06, 1.19413969e-06, 1.14302167e-06, 1.24730235e-06,
       1.20425179e-06, 1.29807873e-06, 1.25402051e-06, 1.22122806e-06,
       1.20743707e-06, 1.36273091e-06, 1.28684497e-06, 1.38617915e-06,
       1.21318732e-06, 1.17359787e-06, 1.62376000e-01, 1.22702012e-06,
       1.13064617e-06, 1.21655649e-06, 1.30498855e-06, 1.23680168e-06,
       1.65296936e-06, 3.92740357e-01, 1.62761222e-01, 1.17924176e-06,
       1.47230043e-06, 1.14763635e-06, 1.22349590e-06, 1.24557351e-06,
       1.27846424e-06, 5.96719681e-01, 1.23555066e-06, 1.21639975e-06,
       1.18197511e-06, 1.21753982e-06, 1.23233747e-06, 1.31651875e-06,
       1.41202588e-06, 1.15294789e-06, 1.15618268e-06, 1.21598682e-06,
       1.28912583e-06, 1.35785931e-06, 1.18804881e-06, 1.28041389e-06,
       1.18722998e-06, 1.29774883e-06, 1.31427024e-06, 2.18019747e+00,
       1.14315596e-06, 1.51250111e-06, 1.01820944e-06, 1.30542041e-06,
      

# COMPARISON (phi gamma original vs calculated) 

## Gamma comparison / actually Theta comparison 

## Phi comparison

In [None]:
len(unique_words)

In [None]:
#check phi_matrix vs phi_list 
#first bring to same format

phi_calculated = []
for line in phi_list: 
    phi_calculated.append([line[0],line[2],line[1],phi_matrix[line[0]][line[2]][line[1]]])

phi_calculated_array = np.array(phi_calculated)
np.savetxt("phi_calculated_array.csv", phi_calculated_array, delimiter=",", fmt="%s")          
np.save("phi_calculated_array", phi_calculated_array)       