In [1]:
from dataset_500 import DataReader
dataset = DataReader().fetch_data()
x, y = dataset.train_data

Data loading started...
data has been loaded!


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
import numpy as np

n_features = 2000
vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=n_features,
                             stop_words='english')

tfidf = vectorizer.fit_transform(x)
feature_names = vectorizer.get_feature_names()

In [3]:
n_topics = 5

In [8]:
nmf = NMF(n_components=n_topics, random_state=1)
W = nmf.fit_transform(tfidf)
H = nmf.components_

In [9]:
nmf_based = W@H
np.linalg.norm(nmf_based - tfidf)

20.16947013800678

In [10]:
def nmf_keywords(text_id, n_keywords = 10):
    indx = np.argsort(nmf_based[text_id,:])[-n_keywords:]
    return [feature_names[i] for i in reversed(indx)]

In [11]:
nmf_keywords(0)

['said',
 'police',
 'new',
 'people',
 'year',
 'man',
 'time',
 'friday',
 'game',
 'like']

In [12]:
s = 0
n = len(x)
for i in range(n):
    ans = nmf_keywords(i)
    s += len(set(map(lambda x : x.lower(), y[i])) & set(ans))
s/n


0.8111111111111111

In [14]:
import scipy

U, S, Vt = scipy.sparse.linalg.svds(tfidf, n_topics)
svd_based = U @ scipy.sparse.diags(S) @ Vt

def svd_keywords(text_id, n_keywords = 10):
    indx = np.argsort(svd_based[text_id,:])[-n_keywords:]
    return [feature_names[i] for i in reversed(indx)]

In [15]:
s = 0
n = len(x)
for i in range(n):
    ans = svd_keywords(i)
    s += len(set(map(lambda x : x.lower(), y[i])) & set(ans))
s/n

0.84

In [82]:
import nltk

text_ind = 0

n_topics=5

def nmf_keywords_separate(text, n_keywords = 10):
    vectorizer = TfidfVectorizer(max_features = n_features,
                                 stop_words = 'english')
    sentences = nltk.sent_tokenize(text)
    tfidf = vectorizer.fit_transform(sentences)
    feature_names = vectorizer.get_feature_names()
    nmf = NMF(n_components=min(n_topics, len(sentences)), random_state=1)
    W = nmf.fit_transform(tfidf)
    H = nmf.components_
    
    indx = np.argsort(H[0,:])[-n_keywords:]
    return [feature_names[i] for i in reversed(indx)]

In [84]:
nmf_keywords_separate(x[0])

['luisana',
 'star',
 'tv',
 'ap',
 'michael',
 'buble',
 'thursday',
 'argentina',
 'canadian',
 'lopilato']

In [85]:
s = 0
n = len(x)
for i in range(n):
    ans = nmf_keywords_separate(x[i])
    s += len(set(map(lambda x : x.lower(), y[i])) & set(ans))
s/n

  return np.sqrt(res * 2)


3.58

In [93]:
s = 0
n = len(x)
for i in range(n):
    ans = nmf_keywords_separate(x[i])
    ans = set(ans)
    real_ans = set(map(lambda x : x.lower(), y[i]))
    s += len(real_ans & ans)/len(real_ans | ans)  

print("Jaccard Index:", s/n)

  return np.sqrt(res * 2)


Jaccard Index: 0.07596959608603467


In [86]:
def svd_keywords_separate(text, n_keywords = 10):
    vectorizer = TfidfVectorizer(max_features = n_features,
                                 stop_words = 'english')
    sentences = nltk.sent_tokenize(text)
    tfidf = vectorizer.fit_transform(sentences)
    feature_names = vectorizer.get_feature_names()
    if len(sentences) < 2:
        U, S, Vt = scipy.linalg.svd(tfidf.toarray())
    else:
        U, S, Vt = scipy.sparse.linalg.svds(tfidf, 1)
    
    U = np.fliplr(U)
    Vt = np.flipud(Vt)
    S = S[::-1]
    
    indx = np.argsort(Vt[0,:])[-n_keywords:]
    return [feature_names[i] for i in reversed(indx)]

In [87]:
s = 0
n = len(x)
for i in range(n):
    ans = svd_keywords_separate(x[i])
    s += len(set(map(lambda x : x.lower(), y[i])) & set(ans))
s/n

3.2066666666666666

In [92]:
s = 0
n = len(x)
for i in range(n):
    ans = svd_keywords_separate(x[i])
    ans = set(ans)
    real_ans = set(map(lambda x : x.lower(), y[i]))
    s += len(real_ans & ans)/len(real_ans | ans)  

print("Jaccard Index:", s/n)

Jaccard Index: 0.06921680547600612


In [88]:
svd_keywords_separate(x[2])

['kutcher',
 'ashton',
 'bieber',
 'justin',
 'film',
 'playing',
 'worst',
 'version',
 'buddy',
 'look']

In [80]:
with open('reuters.txt', 'r') as file:
    t = file.read()

In [89]:
svd_keywords_separate(t)

['nakamoto',
 'use',
 'secure',
 'satoshi',
 'cryptography',
 'regulation',
 'traditional',
 'transactions',
 'successfully',
 'making']

In [90]:
nmf_keywords_separate(t)

['19',
 'contract',
 'price',
 'reference',
 'set',
 '18',
 '805',
 'january',
 'sponsored',
 'exchange']

In [91]:
t

'Bitcoin hits bigger stage as exchange giant CME launches futures\nGertrude Chavez-Dreyfuss, Swati Pandey\n\n4 Min Read\n\nNEW YORK/SYDNEY (Reuters) - Bitcoin futures received a lukewarm reception at its launch on the CME Group Inc on Sunday, although market experts believe a recent rally in the cryptocurrency has further to go.\nFILE PHOTO: A copy of bitcoin standing on PC motherboard is seen in this illustration picture, October 26, 2017. REUTERS/Dado Ruvic/File Photo\n\nThe CME bitcoin front-month futures contract did open higher at $20,650, but dropped 6 percent within the first half hour.\n\nSponsored\n\nThe contract was last at $18,805, below the $19,500 reference price set by the exchange for the January contract.\n\nThe reference price, from which price limits are set, is $19,600 for the February contract, $19,700 for March and $19,900 for June, according to CME.\n\nOn Dec. 10, Chicago-based derivatives exchange Cboe Global Markets launched bitcoin futures, which saw the price 