In [1]:
import wikipediaapi as wikiapi
import wikipedia as wiki
import numpy as np
import nltk
import os
import time

In [2]:
topics = ['computer', 'sport', 'politics', 'science']

In [3]:
def sentenceSplit(inputContent):
    res = nltk.sent_tokenize(inputContent)
    return res

In [4]:
text_dict = {}
for tp in topics:
    fileNames = os.listdir(os.path.join('data/', tp))
    for fileName in fileNames:
        with open(os.path.join(os.path.join('data', tp), fileName)) as f:
            text_dict[fileName] = sentenceSplit(f.read())

In [5]:
# get the stopwords
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from collections import defaultdict
from nltk.stem.porter import PorterStemmer
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

In [6]:
def sentenceRemoveStopwords(inputSentence):
    """
        input: the string of input
        output: list of stemmed tokens
    """
    # lowercase
    inputLower = inputSentence.lower()
    # get the word and remove the punctuation
    regTokenizer = RegexpTokenizer(r'\w+')
    tokens_filter = [t for t in regTokenizer.tokenize(inputLower) if t not in stop_words]
    # stem the word
    tokens_stemmer = [stemmer.stem(t) for t in tokens_filter]
    return tokens_stemmer

In [7]:
wordTokens = defaultdict(list)
for fileName, text in text_dict.items():
    for sen in text:
        wordTokens[fileName].append(sentenceRemoveStopwords(sen))

In [8]:
# generate word embedding
allSents = []
for s in wordTokens.values():
    allSents = allSents + s

In [26]:
from gensim.models import Word2Vec
model = Word2Vec(allSents, iter=50)

In [88]:
model.wv.similarity('0ad', 'comput')

KeyError: "word '0ad' not in vocabulary"

In [17]:
words = list(model.wv.vocab)

In [90]:
len(words)

2562

In [35]:
words[-20:]

['sf',
 'specul',
 'literari',
 'comic',
 'hugo',
 'amaz',
 'feminist',
 'horror',
 'stargat',
 'supernatur',
 'wonder',
 'fandom',
 'fanzin',
 'ptolemi',
 'bachelor',
 'politic',
 'comt',
 'durkheim',
 'weber',
 'marxist']

In [38]:
from gensim.models import TfidfModel

In [39]:
text_complete = [[word for sent in sents for word in sent] for sents in wordTokens.values()]

In [42]:
from gensim import corpora
dictionary = corpora.Dictionary(text_complete)

In [44]:
bag_of_word = [dictionary.doc2bow(text) for text in text_complete]

In [45]:
tfidfModel = TfidfModel(dictionary=dictionary, corpus=bag_of_word)

In [47]:
tfidfModel.

33

In [48]:
len(text_complete)

33

In [51]:
tfidfValue = tfidfModel[dictionary.doc2bow(text_complete[1])]

In [70]:
num_of_doc = len(text_complete)
num_of_topics = len(topics)
num_of_terms = len(tfidfModel.id2word)

In [71]:
tensor_res = np.zeros((num_of_doc, num_of_topics, num_of_terms))

In [84]:
topics_stem = ['comput', 'sport', 'politic', 'scienc']

In [91]:
topics_sim = np.zeros((num_of_terms, num_of_topics))
for idx_term in range(num_of_terms):
    for idx_topic in range(num_of_topics):
        try:
            topics_sim[idx_term, idx_topic] = model.similarity(topics_stem[idx_topic], tfidfModel.id2word[idx_term])
        except:
            topics_sim[idx_term, idx_topic] = 0

  """


In [93]:
num_of_topics * num_of_terms

35652

In [73]:
def fillTFIDF(idxList):
    res = np.zeros((num_of_terms))
    for idx, val in idxList:
        res[idx] = val
    return res

In [95]:
for idx_text in range(num_of_doc):
    tfidfValue = tfidfModel[dictionary.doc2bow(text_complete[idx_text])]
    tfidfList = fillTFIDF(tfidfValue)
    for idx_topic in range(num_of_topics):
        tensor_res[idx_text, idx_topic, :] = topics_sim[:,idx_topic] * tfidfList

In [99]:
tensor_res[tensor_res < 0] = 0

In [103]:
np.sum(tensor_res > 0)

54137

In [89]:
num_of_terms

8913

In [None]:
from scipy.io import savemat
savemat('tensor_0_7.mat', {'t_new': tensor_res})

In [104]:
np.max(tensor_res)

0.8068208195512172

In [111]:
dictionary.token2id['comput']

282

In [112]:
tfidfModel.id2word[282]

'comput'

In [115]:
from scipy.io import savemat
savemat('tensor_we.mat', {'t_new': tensor_res})

In [116]:
len(text_dict)

33

In [119]:
text_dict.keys()

dict_keys(['Analog computer.txt', 'Computer graphics (computer science).txt', 'Computer graphics.txt', 'Computer hardware.txt', 'Computer network.txt', 'Computer program.txt', 'Computer science.txt', 'Computer security.txt', 'Computer.txt', 'Computing.txt', 'Cap (sport).txt', 'Combat sport.txt', 'Rai Sport.txt', 'Sport (botany).txt', 'Sport (US magazine).txt', 'Sport Chek.txt', 'Sport Express.txt', 'Sport psychology.txt', 'Sport.txt', 'Squash (sport).txt', 'Gridlock (politics).txt', 'Political economy.txt', 'Political party.txt', 'Political science.txt', 'Political spectrum.txt', 'Politics.txt', 'PS – Political Science & Politics.txt', 'Branches of science.txt', 'Forensic science.txt', 'Natural science.txt', 'Science fiction.txt', 'Science.txt', 'Social science.txt'])

In [None]:
1:
    1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1

In [None]:
5:
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     2
     2
     2
     2
     2
     2
     2
     2
     2
     2
     4
     4
     4
     4
     4
     4
     4
     4
     4
     4
     4
     4
     4b

In [None]:
10:
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     2
     2
     2
     2
     2
     2
     2
     2
     2
     2
     3
     3
     3
     3
     3
     3
     3
     4
     3
     4
     4
     4
     4

In [None]:
20:
     1
     1
     1
     1
     1
     1
     1
     3
     1
     1
     3
     2
     2
     2
     2
     2
     2
     2
     2
     3
     3
     3
     3
     3
     3
     3
     3
     4
     3
     4
     4
     4
     3

In [None]:
50:
    1
     1
     1
     1
     1
     3
     1
     3
     1
     1
     3
     2
     2
     2
     2
     2
     2
     2
     2
     2
     3
     3
     3
     3
     3
     3
     3
     4
     3
     4
     4
     4
     3