In [1]:
from __future__ import absolute_import, division, unicode_literals  # noqa
import logging
import sys
import os
from datetime import datetime
import numpy as np
from user import user
from Model import Model
import Stopwords
 
    
class TwitterLDAmain:
    """Driver for one Twitter-LDA run over the data of a single month.

    The instance only holds paths and settings; all work happens in
    :meth:`main`, which loads the model parameters, builds the ``user``
    documents, fits ``Model`` and writes the result files.
    """

    def __init__(self, mon, data_dir = "/data"):
        """Derive every input and output path used by the run.

        Args:
            mon: Month label (e.g. ``'Jul'``). It selects the file list
                ``<mon>_files.txt`` and the per-month data/result folders.
            data_dir: Path suffix appended to the current working directory
                to form the data root.
        """
        self.base = os.getcwd() + data_dir
        self.name = "test"
        self.mon = mon
        self.filelist = self.base + "/files/" + self.mon + "_files.txt"
        self.dataDir = self.base + "/Data4Model/" + self.name + "/" + self.mon + "/"
        self.outputDir = self.base + "/ModelRes/" + self.name + "/" + self.mon + "/"
        self.modelParas = self.base + "/modelParameters-" + self.name + ".txt"
        self.stopfile = self.base + "/stoplist.txt"
        self.modelSettings = {}

    def getModelPara(self, modelParas, modelSettings):
        """Fill ``modelSettings`` with defaults, then apply file overrides.

        The parameter file is read as UTF-8 with one ``key: value`` entry per
        line. Lines without a colon (blank lines, free text) are skipped,
        as are unknown keys and entries with an empty value.

        Args:
            modelParas: Path of the parameter file.
            modelSettings: Dictionary updated in place.

        Raises:
            OSError: If the parameter file cannot be opened.
            ValueError: If the value of a known key is not a valid float.
        """
        modelSettings['topics'] = 40
        modelSettings['alpha_g'] = 1.25
        modelSettings["beta_word"] = 0.01
        modelSettings["beta_b"] = 0.01
        modelSettings["gamma"] = 20
        modelSettings["iteration"] = 20

        with open(modelParas, 'r', encoding='utf-8') as f:
            inputlines = f.read().splitlines()

        for item in inputlines:
            fields = item.split(":")
            if len(fields) < 2:
                # No "key: value" separator on this line; nothing to parse.
                continue
            key = fields[0].strip()
            value = fields[1].strip()
            if value and key in modelSettings:
                # NOTE: every override is stored as a float, including the
                # count-like 'topics' and 'iteration' entries.
                modelSettings[key] = float(value)

    def errprint(self, *args, **kwargs):
        """Print to standard error; accepts the same arguments as ``print``."""
        print(*args, file=sys.stderr, **kwargs)

    def _runOutputStep(self, model, method, *args):
        """Call ``model.<method>(*args)`` as a best-effort export step.

        A failure is reported and swallowed so the remaining exports still
        run. Only ``Exception`` subclasses are caught, so interrupts and
        ``SystemExit`` propagate.
        """
        try:
            getattr(model, method)(*args)
        except Exception as exc:
            print("An exception occurred in " + method + ": ", sys.exc_info()[0])
            # The line above shows only the exception class; keep the detail too.
            self.errprint(repr(exc))

    def main(self):
        """Run the full pipeline: parameters, documents, model, outputs.

        Raises:
            SystemExit: With status 1 when the word-to-id map and the list
                of unique words end up with different sizes.
        """
        # 1. get model parameters
        self.getModelPara(self.modelParas, self.modelSettings)
        A_all = self.modelSettings['topics']
        alpha_g = self.modelSettings['alpha_g']
        beta_word = self.modelSettings['beta_word']
        beta_b = self.modelSettings['beta_b']
        gamma = self.modelSettings['gamma']
        nIter = self.modelSettings['iteration']

        self.errprint("Topics:" + str(A_all) + ", alpha_g:" + str(alpha_g) + ", beta_word:" + str(beta_word) + ", beta_b:" + str(beta_b) + ", gamma:" + str(gamma) + ", iteration:" + str(nIter))
        self.modelSettings.clear()

        # Presumably sets up the module-level stop-word list and extends it
        # from the stop file - confirm against the Stopwords module.
        Stopwords.Stopwords()
        Stopwords.addStopfile(self.stopfile)

        outputTopicwordCnt = 30
        outputBackgroundwordCnt = 50

        outputWordsInTopics = self.outputDir + "WordsInTopics.txt"
        outputBackgroundWordsDistribution = self.outputDir + "BackgroundWordsDistribution.txt"
        outputTextWithLabel = self.outputDir + "/TextWithLabel/"

        # Creating the nested folder also creates self.outputDir, which the
        # map files below are written into.
        os.makedirs(outputTextWithLabel, exist_ok=True)

        # 2. get documents (users)
        wordMap = {}
        uniWordMap = []
        users = []

        with open(self.filelist, 'r', encoding='utf-8') as f:
            files = f.read().splitlines()

        for fname in files:
            if not fname.strip():
                # A blank entry would resolve to the data directory itself.
                continue
            tweetuser = user(self.dataDir + fname, fname, wordMap, uniWordMap)
            tweetuser.user()
            wordMap = tweetuser.wordMap
            uniWordMap = tweetuser.uniWordMap
            users.append(tweetuser)

        if len(uniWordMap) != len(wordMap):
            print(len(wordMap))
            print(len(uniWordMap))
            self.errprint("uniqword size is not the same as the hashmap size!")
            # Non-zero status: this is a failure, not a normal termination.
            sys.exit(1)

        # output wordMap and itemMap
        with open(self.outputDir + "wordMap.txt", 'w', encoding='utf-8') as f:
            for k, v in wordMap.items():
                f.write(str(k) + '\t' + str(v) + '\n')

        with open(self.outputDir + "uniWordMap.txt", 'w', encoding='utf-8') as f:
            for k in uniWordMap:
                f.write(str(k) + '\n')

        # Both vocab structures are emptied before fitting; only the size is
        # kept, and the word list is re-read from disk for the exports.
        uniWordMapSize = len(uniWordMap)
        wordMap.clear()
        uniWordMap.clear()

        # 3. run the model
        model = Model(A_all, len(users), uniWordMapSize, nIter, alpha_g, beta_word, beta_b, gamma)
        model.initialize(users)
        model.estimate(users, nIter)

        # 4. output model results
        print("Record Topic Distributions/Counts")
        model.outputTopicDistributionOnUsers(self.outputDir, users)
        print("read uniwordmap")

        with open(self.outputDir + "uniWordMap.txt", 'r', encoding='utf-8') as f:
            uniWordMap = f.read().splitlines()

        self._runOutputStep(model, "outputTextWithLabel", outputTextWithLabel, users, uniWordMap)
        print("write text with labels done")
        users.clear()

        self._runOutputStep(model, "outputWordsInTopics", outputWordsInTopics, uniWordMap, outputTopicwordCnt)
        print("write topics with keywords done")

        self._runOutputStep(model, "outputBackgroundWordsDistribution", outputBackgroundWordsDistribution, uniWordMap, outputBackgroundwordCnt)
        print("Record Background done")
        print("Final Done")
        
        
        

In [2]:
# Fit and export the model for the July data set.
twitterLDA = TwitterLDAmain(mon='Jul')
twitterLDA.main()

Topics:20.0, alpha_g:0.5, beta_word:0.01, beta_b:0.01, gamma:20.0, iteration:10.0


20113
initializing...
Intialize Done
iteration 1 ...
iteration 2 ...
iteration 3 ...
iteration 4 ...
iteration 5 ...
iteration 6 ...
iteration 7 ...
iteration 8 ...
iteration 9 ...
iteration 10 ...
Record Topic Distributions/Counts
read uniwordmap
write text with labels done
write topics with keywords done
Record Background done
Final Done


In [27]:
# Check parameter loading on its own: defaults first, then file overrides.
base = os.getcwd() + "/data"
name = "test"
modelParas = base + "/modelParameters-" + name + ".txt"
modelSettings = {}
# getModelPara is a method of TwitterLDAmain and reads no instance state,
# so a throwaway instance is enough to call it from a fresh kernel.
TwitterLDAmain('Jul').getModelPara(modelParas, modelSettings)
modelSettings

{'topics': 40, 'alpha_g': 1.25, 'beta_word': 0.01, 'beta_b': 0.01, 'gamma': 20, 'iteration': 20}


{'topics': 20.0,
 'alpha_g': 0.5,
 'beta_word': 0.01,
 'beta_b': 0.01,
 'gamma': 20.0,
 'iteration': 100.0}

In [44]:
# Scratch check of Model on tiny dimensions.
# NOTE(review): the positional order presumably mirrors the call in
# TwitterLDAmain.main (topics, user count, vocabulary size, iterations,
# alpha_g, beta_word, beta_b, gamma) - confirm against Model.
model = Model(5, 7, 6, 3, 2.3, 0.05, 0.01, 0.47)
model.test()

[[0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0]]


In [72]:
# Scratch check: copy a list element by element, then bump one slot of a
# flat zero-initialised list in place.
AA = [1, 5, 6, 7, 8]
rn = 5

test = []
for i in range(rn):
    test += [AA[i]]
    print(test[-1])

theta_general = [0 for _ in range(5)]
theta_general[2] = theta_general[2] + 2
theta_general

1
5
6
7
8


[0, 0, 2, 0, 0]

In [74]:
# Scratch check of boolean flags and range iteration.
buff = 2 > 1

if buff:
    print("yaya")

for i in (0, 1):
    print(i)

yaya
0
1


In [3]:
# 5 rows x 4 columns of zeros; each row is its own list object.
C_word = [[0] * 4 for _ in range(5)]
print(C_word)

[[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]


In [85]:
# Smallest positive normalized float on this platform (value shown in the cell output).
sys.float_info.min

2.2250738585072014e-308