In [1]:
import numpy as np
import codecs
import jieba
import re
import random
import math
from scipy.special import psi

In [2]:
import os

In [3]:
def get_data(DATA_DIR):
    """Read every file of the four sport sub-folders of ``DATA_DIR`` into memory.

    The sub-folders are visited in the fixed order soccer, futbol, cricket,
    basketball; inside each sub-folder the files are taken in the order
    returned by ``os.listdir``.

    Args:
        DATA_DIR: path of the directory that contains the four sub-folders.

    Returns:
        list[str]: the full text of each file, decoded as latin-1.
    """
    subfolders = ['soccer', 'futbol', 'cricket', 'basketball']

    data = []
    for subfolder in subfolders:
        folder_path = os.path.join(DATA_DIR, subfolder)
        for file_name in os.listdir(folder_path):
            with open(os.path.join(folder_path, file_name), encoding="latin-1") as f:
                data.append(f.read())

    return data


In [4]:
def get_inference_data(DATA_DIR,sports):
    """Read every file found in the ``sports`` sub-folder of ``DATA_DIR``.

    Args:
        DATA_DIR: path of the directory that contains the sport sub-folders.
        sports: name of the sub-folder to read.

    Returns:
        list[str]: the full text of each file (latin-1 decoded), in
        ``os.listdir`` order.
    """
    sport_dir = os.path.join(DATA_DIR, sports)
    texts = []
    for entry in os.listdir(sport_dir):
        with open(os.path.join(sport_dir, entry), encoding="latin-1") as handle:
            texts.append(handle.read())
    return texts


In [5]:
# itemIdList : the list of ids of the distinct terms in the document
# itemCountList : the number of occurrences of each corresponding term
# wordCount : the total number of words (not distinct terms)
class Document:
    """Bag-of-words view of one document.

    The three constructor arguments are stored unchanged as attributes of the
    same name.
    """

    def __init__(self, itemIdList, itemCountList, wordCount):
        # ids of the distinct terms of the document
        self.itemIdList = itemIdList
        # occurrence count of the term at the same position in itemIdList
        self.itemCountList = itemCountList
        # total number of words in the document
        self.wordCount = wordCount


In [6]:
# preprocessing (segmentation, stopwords filtering, represent documents as objects of class Document)
def preprocessing(stopwords_path='Desktop/lda/stopwords.dic',
                  corpus_dir='Desktop/project/collection'):
    """Tokenise the training corpus and turn every document into a ``Document``.

    Each text returned by ``get_data(corpus_dir)`` is segmented with
    ``jieba.cut``. A token is lower-cased and stripped, and kept only when it
    is longer than one character, contains no digit and is not a stopword.

    Args:
        stopwords_path: utf-8 file with one stopword per line.
        corpus_dir: directory handed to ``get_data``.

    Returns:
        tuple: ``(docs, word2id, id2word)`` where ``docs`` is a list of
        ``Document`` objects, ``word2id`` maps each kept term to an integer id
        and ``id2word`` is the inverse mapping.
    """
    # A set makes the per-token stopword test O(1) instead of a list scan, and
    # the with-block closes the file even if reading fails.
    with codecs.open(stopwords_path, 'r', 'utf-8') as file:
        stopwords = {line.strip() for line in file}

    documents = [document.strip() for document in get_data(corpus_dir)]

    # compiled once instead of on every token
    has_digit = re.compile('[0-9]')

    docs = []
    word2id = {}
    id2word = {}

    for document in documents:
        word2Count = {}
        # segmentation
        for word in jieba.cut(document):
            word = word.lower().strip()
            # filter short tokens, tokens with digits and stopwords
            if len(word) <= 1 or has_digit.search(word) or word in stopwords:
                continue
            if word not in word2id:
                # ids are dense and assigned in order of first appearance
                newId = len(word2id)
                word2id[word] = newId
                id2word[newId] = word
            word2Count[word] = word2Count.get(word, 0) + 1

        # dicts keep insertion order, so ids and counts stay aligned
        itemIdList = [word2id[word] for word in word2Count]
        itemCountList = list(word2Count.values())
        wordCount = sum(itemCountList)

        docs.append(Document(itemIdList, itemCountList, wordCount))

    return docs, word2id, id2word

In [7]:
def maxItemNum():
    """Return the largest number of distinct terms held by any of the first
    ``N`` entries of the global ``docs`` list (0 when ``N`` is 0)."""
    return max((len(docs[d].itemIdList) for d in range(N)), default=0)

In [8]:
def initialLdaModel(seed=None):
    """Randomly initialise the global topic-word counts and refresh ``varphi``.

    Every ``nzw[z, w]`` is set to ``1/M`` plus a uniform random number in
    [0, 1), ``nz[z]`` is set to the sum of row ``z``, and ``updateVarphi`` is
    then called. The arrays are overwritten (not added to), so calling the
    function again starts from a clean state instead of stacking new noise on
    top of earlier values.

    Args:
        seed: optional seed passed to ``random.seed`` so that the
            initialisation is reproducible; ``None`` (the default) leaves the
            random generator untouched.
    """
    if seed is not None:
        random.seed(seed)
    for z in range(0, K):
        # reset before accumulating so a re-run does not reuse stale totals
        nz[z] = 0
        for w in range(0, M):
            nzw[z, w] = 1.0/M + random.random()
            nz[z] += nzw[z, w]
    updateVarphi()


In [9]:
def updateVarphi():
    """Recompute the global ``varphi`` from the counts ``nzw`` and ``nz``.

    For every topic/term pair with a positive count the entry becomes
    ``log(nzw[z, w]) - log(nz[z])``; all other entries are set to -100.
    """
    for topic in range(K):
        for term in range(M):
            count = nzw[topic, term]
            if count > 0:
                varphi[topic, term] = math.log(count) - math.log(nz[topic])
            else:
                # stand-in for log(0)
                varphi[topic, term] = -100

In [10]:
# update variational parameters : gamma and phi
def variationalInference(docs, d, gamma, phi):
    """Update the variational parameters of document ``d`` in place.

    Args:
        docs: sequence of ``Document`` objects.
        d: index of the document to process.
        gamma: array whose row ``d`` receives the K topic parameters.
        phi: array whose first ``len(docs[d].itemIdList)`` rows receive the
            per-term topic responsibilities (each of those rows is normalised
            to sum to 1).

    Reads the globals ``K``, ``alpha``, ``iterInference`` and ``varphi``.
    Returns nothing; ``gamma`` and ``phi`` are modified in place.
    """
    doc = docs[d]
    termIds = doc.itemIdList
    termCounts = doc.itemCountList
    numTerms = len(termIds)

    previousPhi = np.zeros([K])
    digammaOfGamma = np.zeros([K])

    # start from a uniform assignment of the document's words to the topics
    for topic in range(K):
        gamma[d][topic] = alpha + doc.wordCount * 1.0 / K
        digammaOfGamma[topic] = psi(gamma[d][topic])
        for term in range(numTerms):
            phi[term, topic] = 1.0 / K

    for _ in range(iterInference):
        for term in range(numTerms):
            # logNorm accumulates log(sum_z exp(phi[term, z])) one topic at a time
            logNorm = 0
            for topic in range(K):
                previousPhi[topic] = phi[term, topic]
                phi[term, topic] = digammaOfGamma[topic] + varphi[topic, termIds[term]]
                if topic == 0:
                    logNorm = phi[term, topic]
                else:
                    logNorm = math.log(math.exp(logNorm) + math.exp(phi[term, topic]))
            for topic in range(K):
                # leave log space and normalise
                phi[term, topic] = math.exp(phi[term, topic] - logNorm)
                # gamma is corrected incrementally by the change in phi
                gamma[d][topic] = gamma[d][topic] + termCounts[term] * (phi[term, topic] - previousPhi[topic])
                digammaOfGamma[topic] = psi(gamma[d][topic])



In [11]:
# calculate the gamma parameter of new document
def inferTopicOfNewDocument_futbol():
    """Infer the variational gamma parameters of the 'futbol' test documents.

    The documents are read with ``get_inference_data``, up to the first ten
    are printed, each one is reduced to a ``Document`` restricted to the
    training vocabulary ``word2id``, and ``variationalInference`` is run on it.

    Returns:
        numpy.ndarray of shape (number of test documents, K): one row of
        gamma values per document.
    """
    # read the corpus to be inferred
    testDocuments = [document.strip() for document in get_inference_data('Desktop/project/testSet','futbol')]
    # NOTE(review): drops the entry at position 2 of the os.listdir order --
    # presumably a file that is not an article. listdir order is not
    # guaranteed, so confirm this index against the data set.
    testDocuments.pop(2)
    print("Number of test documents= ", len(testDocuments))
    print("**")
    # Show at most the first ten documents; indexing 0..9 directly raised
    # IndexError whenever the folder held fewer than ten documents.
    for document in testDocuments[:10]:
        print(document)
        print("*******")

    testDocs = []
    for document in testDocuments:
        word2Count = {}
        # segmentation; only words known from training are counted
        for word in jieba.cut(document):
            word = word.lower().strip()
            if word in word2id:
                word2Count[word] = word2Count.get(word, 0) + 1

        # dicts keep insertion order, so ids and counts stay aligned
        itemIdList = [word2id[word] for word in word2Count]
        itemCountList = list(word2Count.values())
        testDocs.append(Document(itemIdList, itemCountList, sum(itemCountList)))

    gamma = np.zeros([len(testDocs), K])
    for d in range(len(testDocs)):
        phi = np.zeros([len(testDocs[d].itemIdList), K])
        variationalInference(testDocs, d, gamma, phi)

    return gamma

In [12]:
# calculate the gamma parameter of new document
def inferTopicOfNewDocument_cricket():
    """Infer the variational gamma parameters of the 'cricket' test documents.

    The documents are read with ``get_inference_data``, up to the first ten
    are printed, each one is reduced to a ``Document`` restricted to the
    training vocabulary ``word2id``, and ``variationalInference`` is run on it.

    Returns:
        numpy.ndarray of shape (number of test documents, K): one row of
        gamma values per document.
    """
    # read the corpus to be inferred
    testDocuments = [document.strip() for document in get_inference_data('Desktop/project/testSet','cricket')]
    # NOTE(review): drops the entry at position 2 of the os.listdir order --
    # presumably a file that is not an article. listdir order is not
    # guaranteed, so confirm this index against the data set.
    testDocuments.pop(2)
    print("Number of test documents= ", len(testDocuments))
    print("**")
    # Show at most the first ten documents; indexing 0..9 directly raised
    # IndexError whenever the folder held fewer than ten documents.
    for document in testDocuments[:10]:
        print(document)
        print("*******")

    testDocs = []
    for document in testDocuments:
        word2Count = {}
        # segmentation; only words known from training are counted
        for word in jieba.cut(document):
            word = word.lower().strip()
            if word in word2id:
                word2Count[word] = word2Count.get(word, 0) + 1

        # dicts keep insertion order, so ids and counts stay aligned
        itemIdList = [word2id[word] for word in word2Count]
        itemCountList = list(word2Count.values())
        testDocs.append(Document(itemIdList, itemCountList, sum(itemCountList)))

    gamma = np.zeros([len(testDocs), K])
    for d in range(len(testDocs)):
        phi = np.zeros([len(testDocs[d].itemIdList), K])
        variationalInference(testDocs, d, gamma, phi)

    return gamma

In [13]:
# calculate the gamma parameter of new document
def inferTopicOfNewDocument_basketball():
    """Infer the variational gamma parameters of the 'basketball' test documents.

    The documents are read with ``get_inference_data``, up to the first ten
    are printed, each one is reduced to a ``Document`` restricted to the
    training vocabulary ``word2id``, and ``variationalInference`` is run on it.

    Returns:
        numpy.ndarray of shape (number of test documents, K): one row of
        gamma values per document.
    """
    # read the corpus to be inferred
    testDocuments = [document.strip() for document in get_inference_data('Desktop/project/testSet','basketball')]
    # NOTE(review): drops the first entry of the os.listdir order --
    # presumably a file that is not an article. listdir order is not
    # guaranteed, so confirm this index against the data set.
    testDocuments.pop(0)
    print("Number of test documents= ", len(testDocuments))
    print("**")
    # Show at most the first ten documents; indexing 0..9 directly raised
    # IndexError whenever the folder held fewer than ten documents.
    for document in testDocuments[:10]:
        print(document)
        print("*******")

    testDocs = []
    for document in testDocuments:
        word2Count = {}
        # segmentation; only words known from training are counted
        for word in jieba.cut(document):
            word = word.lower().strip()
            if word in word2id:
                word2Count[word] = word2Count.get(word, 0) + 1

        # dicts keep insertion order, so ids and counts stay aligned
        itemIdList = [word2id[word] for word in word2Count]
        itemCountList = list(word2Count.values())
        testDocs.append(Document(itemIdList, itemCountList, sum(itemCountList)))

    gamma = np.zeros([len(testDocs), K])
    for d in range(len(testDocs)):
        phi = np.zeros([len(testDocs[d].itemIdList), K])
        variationalInference(testDocs, d, gamma, phi)

    return gamma

In [14]:
# calculate the gamma parameter of new document
def inferTopicOfNewDocument_soccer():
    """Infer the variational gamma parameters of the 'soccer' test documents.

    The documents are read with ``get_inference_data``, up to the first ten
    are printed, each one is reduced to a ``Document`` restricted to the
    training vocabulary ``word2id``, and ``variationalInference`` is run on it.

    Returns:
        numpy.ndarray of shape (number of test documents, K): one row of
        gamma values per document.
    """
    # read the corpus to be inferred
    testDocuments = [document.strip() for document in get_inference_data('Desktop/project/testSet','soccer')]
    print("Number of test documents= ", len(testDocuments))
    print("**")
    # Show at most the first ten documents; indexing 0..9 directly raised
    # IndexError whenever the folder held fewer than ten documents.
    for document in testDocuments[:10]:
        print(document)
        print("*******")

    testDocs = []
    for document in testDocuments:
        word2Count = {}
        # segmentation; only words known from training are counted
        for word in jieba.cut(document):
            word = word.lower().strip()
            if word in word2id:
                word2Count[word] = word2Count.get(word, 0) + 1

        # dicts keep insertion order, so ids and counts stay aligned
        itemIdList = [word2id[word] for word in word2Count]
        itemCountList = list(word2Count.values())
        testDocs.append(Document(itemIdList, itemCountList, sum(itemCountList)))

    gamma = np.zeros([len(testDocs), K])
    for d in range(len(testDocs)):
        phi = np.zeros([len(testDocs[d].itemIdList), K])
        variationalInference(testDocs, d, gamma, phi)

    return gamma

In [15]:
# Build the training corpus: the Document list plus the word->id and id->word vocabularies.
docs, word2id, id2word = preprocessing() 

Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/f7/1z1tyggs5099pqk_cf32v4fh0000gn/T/jieba.cache
Loading model cost 1.249 seconds.
Prefix dict has been built succesfully.


In [16]:
# number of documents for training
N = len(docs)
# number of distinct terms
M = len(word2id)
# number of topics
K = 4
# number of variational-inference iterations per document; the convergence
# check based on the likelihood is omitted
iterInference = 30 
# number of variational EM iterations; the convergence check based on the
# likelihood is omitted
iterEM = 80

In [17]:
# initial value of hyperparameter alpha
alpha = 1
# sufficient statistic of alpha
alphaSS = 0
# the topic-word distribution (beta in D. Blei's paper); updateVarphi stores
# it as log-probabilities
varphi = np.zeros([K, M])
# topic-word count, this is a sufficient statistic to calculate varphi
nzw = np.zeros([K, M])
# topic count, sum of nzw with w ranging over [0, M-1], for calculating varphi
nz = np.zeros([K])


In [18]:
# inference parameter gamma: one row of K values per training document
gamma = np.zeros([N, K])
# inference parameter phi: a single buffer, large enough for the document
# with the most distinct terms, that is passed to every variationalInference call
phi = np.zeros([maxItemNum(), K])


In [19]:
# initialization of the model parameter varphi, the update of alpha is omitted
initialLdaModel()


In [20]:
# variational EM Algorithm
for iteration in range(iterEM):
    # start every EM round with empty sufficient statistics
    nzw = np.zeros([K, M])
    nz = np.zeros([K])
    alphaSS = 0

    # E-Step
    print("E-Step started...")
    for d in range(N):
        variationalInference(docs, d, gamma, phi)

        # statistic for alpha (collected, but alpha itself is never updated)
        gammaSum = 0
        for z in range(K):
            gammaSum += gamma[d, z]
            alphaSS += psi(gamma[d, z])
        alphaSS -= K * psi(gammaSum)

        # accumulate the expected topic-word counts of this document
        for w, termId in enumerate(docs[d].itemIdList):
            termCount = docs[d].itemCountList[w]
            for z in range(K):
                expectedCount = termCount * phi[w, z]
                nzw[z][termId] += expectedCount
                nz[z] += expectedCount

    # M-Step
    print("M-Step started...")
    updateVarphi()
print("EM Algorithm Ends Here.....")

E-Step started...
M-Step started...
E-Step started...
M-Step started...
E-Step started...
M-Step started...
E-Step started...
M-Step started...
E-Step started...
M-Step started...
E-Step started...
M-Step started...
E-Step started...
M-Step started...
E-Step started...
M-Step started...
E-Step started...
M-Step started...
E-Step started...
M-Step started...
E-Step started...
M-Step started...
E-Step started...
M-Step started...
E-Step started...
M-Step started...
E-Step started...
M-Step started...
E-Step started...
M-Step started...
E-Step started...
M-Step started...
E-Step started...
M-Step started...
E-Step started...
M-Step started...
E-Step started...
M-Step started...
E-Step started...
M-Step started...
E-Step started...
M-Step started...
E-Step started...
M-Step started...
E-Step started...
M-Step started...
E-Step started...
M-Step started...
E-Step started...
M-Step started...
E-Step started...
M-Step started...
E-Step started...
M-Step started...
E-Step started...
M-Step sta

In [21]:
# calculate the top 10 terms of each topic
# maximum number of terms reported per topic
maxTopicWordsNum = 10
topicwords = []
for z in range(0, K):
    # argsort is ascending, so reverse it to put the highest varphi values
    # first, then keep only the leading maxTopicWordsNum ids
    topIds = varphi[z, :].argsort()[::-1][:maxTopicWordsNum]
    topicwords.append([id2word[j] for j in topIds])


In [22]:
# show the top terms of every topic
print(topicwords)

[['game', 'season', 'games', 'scored', 'rebounds', 'quarter', 'time', 'night', 'team', 'friday'], ['league', 'team', 'season', 'goal', 'match', 'win', 'players', 'games', 'goals', 'barcelona'], ['test', 'game', 'cricket', 'bangladesh', 'ball', 'wickets', 'day', 'play', 'series', 'team'], ['game', 'week', 'yards', 'nfl', 'season', 'games', 'injury', 'win', 'team', 'play']]


In [23]:
# infer the topic of each new document
inferGamma_soccer = inferTopicOfNewDocument_soccer()
# the predicted topic of a document is the index of its largest gamma value
inferZ_soccer = [topicRow.argmax() for topicRow in inferGamma_soccer]
    

Number of test documents=  10
**
Fresh off a 3-0 loss to England, the United States men's national team hit Belgium on Tuesday for its final soccer game of 2018, a showdown with Italy.

Anyone tuning in for USA's international friendly (you can stream the match on fuboTV) got themselves dinner with their show, however, thanks to a specific striker playing for Roberto Mancini's squad.

It turns out that a Udinese veteran was called up to Italy's national team after an injury in October. He made his debut in a UEFA Nations League match. He took the field against the Americans on Tuesday. And his name is Kevin Lasagna.

Lasagna's story is rather profound considering he ascended from amateur football to international competition in a matter of four years, but it's his name that made waves upon Italy's clash with USA. And who can blame the people responsible for that? I mean, how often have you ever known a Kevin Lasagna? Imagine the family this guy could have -- Larry Lasagna, Lindsey Lasa

In [24]:
# gamma values of the soccer test documents (one row per document)
print(inferGamma_soccer)

[[ 14.72223279  40.63560485   9.23884537   2.403317  ]
 [  1.66735691  65.43168964   1.34786984   1.55308361]
 [  6.02582198  58.7293827    1.24809469   3.99670063]
 [ 17.44518502  45.27656998   1.99637348   7.28187152]
 [  6.70779645  27.06474709   1.44196899  11.78548747]
 [  4.92725838  58.58590096   7.9725789   11.51426176]
 [ 32.78118282 155.81875318  20.5554241   47.8446399 ]
 [  1.8165069   22.59398241   3.00583524   8.58367544]
 [  4.20290697 190.21253084  11.63436086  17.95020133]
 [  1.75899287  78.17202461  16.11492487   5.95405765]]


In [25]:
# predicted topic index of every soccer test document
print(inferZ_soccer)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [26]:
# infer the topic of each new document
inferGamma_futbol = inferTopicOfNewDocument_futbol()
# the predicted topic of a document is the index of its largest gamma value
inferZ_futbol = [topicRow.argmax() for topicRow in inferGamma_futbol]
    

Number of test documents=  10
**
Baker Mayfield and the Browns scored a win Sunday in a matchup you don't see too often in pro football.

Bengals-Browns is a usual meeting, but Bengals-Browns featuring former Cleveland head coach Hue Jackson now serving on the Cincinnati staff is rare. It provided Jackson with an opportunity to defeat the franchise that booted him last month, or the Browns a chance to take down their old coach.


The latter happened, and it came with a bit of stinginess from Mayfield, who tossed four touchdowns and shook hands with Jackson after the game but appeared to resist the embrace Jackson reached for when he patted the back of Mayfield's head. It looked stiff, if not downright awkward. Mayfield's postgame words shed some light on the body language.

"Left Cleveland, goes down to Cincinnati," Mayfield said when asked about his feelings about his former coach. "I don't know. That's just somebody that's in our locker room asking for us to play for him and then goe

In [27]:
# gamma values of the futbol test documents (one row per document)
print(inferGamma_futbol)

[[  4.31462122  17.59714866  17.74029088  93.34793924]
 [  5.83839241   3.22944212   1.14848474  28.78368073]
 [ 20.37171269   1.26403052   4.81702225  26.54723454]
 [ 12.03526811   3.04533009   8.82856912  71.09083268]
 [ 22.44532194   5.56755376   7.96482791  40.02229639]
 [ 52.52303284   1.80285736   3.63862371  53.0354861 ]
 [ 76.97823803   9.48213709  27.61694426  77.92268062]
 [  3.24590531   5.98012137   5.2097013   36.56427203]
 [ 31.20858511   7.45769107  16.66868783 105.665036  ]
 [  5.02862095   2.72693569   6.26303037  75.98141299]]


In [28]:
# predicted topic index of every futbol test document
print(inferZ_futbol)

[3, 3, 3, 3, 3, 3, 3, 3, 3, 3]


In [29]:
# infer the topic of each new document
inferGamma_basketball = inferTopicOfNewDocument_basketball()
# the predicted topic of a document is the index of its largest gamma value
inferZ_basketball = [topicRow.argmax() for topicRow in inferGamma_basketball]
   

Number of test documents=  10
**
As a rookie in 1992-93, Shaquille O'Neal proved he belonged in the NBA by winning Rookie of the Year honors, making the All-Star team and dominating in the paint regularly.

Once 1993-94 got started, O'Neal didn't waste time picking up where he left off. On Nov. 20, 1993, Shaquille OÕNeal dominated the New Jersey Nets for 24 points, 28 rebounds and 15 blocks in the Orlando Magic's 87-85 win. That game was the only 20-point, 20-rebound, 10-block game in O'Neal's legendary career.

At only 21 years old, this game helped solidify ShaqÕs place as one of the most dominant players in the league. That season, O'Neal averaged 29.3 points, 13.2 rebounds and 2.9 blocks per game while leading the NBA in field goal percentage (59.9 percent). 

"I guess I played OK," O'Neal said after the game. "It's hard to play from city to city, where it's 40 degrees in one and 80 degrees in the next, but you keep playing. I knew I had about 7 or 8 blocks, but I didn't know I had

In [30]:
# gamma values of the basketball test documents (one row per document)
print(inferGamma_basketball)

[[ 61.3440971    5.17892968   6.71547847  11.76149475]
 [ 96.56144301   5.15229507  15.04386555  22.24239637]
 [ 33.80355391  16.42357248   3.2600032   15.51287041]
 [ 49.23688656  23.53509794  13.49913171  43.72888378]
 [ 43.2872193   13.21757028  16.09521952  16.3999909 ]
 [ 68.50959427   1.47446567   1.43327563 125.58266443]
 [ 31.49351015   2.0340144   25.56560917  26.90686628]
 [ 81.01980935   7.66759946  17.8510469   15.46154428]
 [ 50.12369629   1.42111536   1.38756611   9.06762224]
 [ 53.41509405  70.35989986  11.48165929  15.74334679]]


In [31]:
# predicted topic index of every basketball test document
print(inferZ_basketball)

[0, 0, 0, 0, 0, 3, 0, 0, 0, 1]


In [32]:
# infer the topic of each new document
inferGamma_cricket = inferTopicOfNewDocument_cricket()
# the predicted topic of a document is the index of its largest gamma value
inferZ_cricket = [topicRow.argmax() for topicRow in inferGamma_cricket]
   

Number of test documents=  10
**
England women will host West Indies next summer as part of their build-up to the Ashes, with the series against Australia once again contested on a points system across all three formats.

Taunton is set to host its first women's Test since 2006, while the Ashes will begin on July 2 at Leicestershire's Grace Road ground, venue for the first two of three ODIs. Canterbury will be the location for the third ODI, while the three T20Is will be held at Chelmsford, Hove and Bristol.

The West Indies visit will feature three ODIs - which will form part of the ICC Women's Championship and go towards qualification for the 2021 World Cup - and three T20Is, starting on June 6 at Grace Road. Worcestershire's New Road ground will host the second ODI, before the series moves to Chelmsford. The teams will then play two T20Is at Northampton, before concluding with a game at Derby.

The Kia Super League, the ECB's domestic women's T20 competition that is set to be replac

In [33]:
# gamma values of the cricket test documents (one row per document)
print(inferGamma_cricket)

[[ 15.07662523  23.92802458  66.0695049    4.92584529]
 [ 12.36484782  10.73126708 119.78960935  12.11427575]
 [  5.24198251  21.8471165   64.43725826   1.47364272]
 [ 22.17907487  22.90712119 239.59785328  26.31595066]
 [ 10.10325549   1.66766511 149.10857208  11.12050732]
 [ 34.88816734  17.98659483 245.92734316  17.19789467]
 [ 25.28043768  18.42920535 169.80459575  20.48576123]
 [  8.13217193  41.51175676  87.8639149    8.4921564 ]
 [  2.3721617   11.91260087 170.98917098  23.72606645]
 [ 16.65414697  88.80978239  53.97701859  28.55905206]]


In [34]:
# predicted topic index of every cricket test document
print(inferZ_cricket)

[2, 2, 2, 2, 2, 2, 2, 2, 2, 1]
