In [1]:
from dataset_500 import DataReader
dataset = DataReader().fetch_data()
x, y = dataset.train_data

Data loading started...
data has been loaded!


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
import numpy as np

n_features = 2000
vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=n_features,
                             stop_words='english')

tfidf = vectorizer.fit_transform(x)
feature_names = vectorizer.get_feature_names()

In [3]:
n_topics = 5

In [8]:
nmf = NMF(n_components=n_topics, random_state=1)
W = nmf.fit_transform(tfidf)
H = nmf.components_

In [9]:
nmf_based = W@H
np.linalg.norm(nmf_based - tfidf)

20.16947013800678

In [10]:
def nmf_keywords(text_id, n_keywords = 10):
    indx = np.argsort(nmf_based[text_id,:])[-n_keywords:]
    return [feature_names[i] for i in reversed(indx)]

In [11]:
nmf_keywords(0)

['said',
 'police',
 'new',
 'people',
 'year',
 'man',
 'time',
 'friday',
 'game',
 'like']

In [12]:
s = 0
n = len(x)
for i in range(n):
    ans = nmf_keywords(i)
    s += len(set(map(lambda x : x.lower(), y[i])) & set(ans))
s/n


0.8111111111111111

In [14]:
import scipy

U, S, Vt = scipy.sparse.linalg.svds(tfidf, n_topics)
svd_based = U @ scipy.sparse.diags(S) @ Vt

def svd_keywords(text_id, n_keywords = 10):
    indx = np.argsort(svd_based[text_id,:])[-n_keywords:]
    return [feature_names[i] for i in reversed(indx)]

In [15]:
s = 0
n = len(x)
for i in range(n):
    ans = svd_keywords(i)
    s += len(set(map(lambda x : x.lower(), y[i])) & set(ans))
s/n

0.84

In [58]:
import nltk

text_ind = 0

n_topics=5

def nmf_keywords_separate(text_id, n_keywords = 10):
    vectorizer = TfidfVectorizer(max_features = n_features,
                                 stop_words = 'english')
    sentences = nltk.sent_tokenize(x[text_id])
    tfidf = vectorizer.fit_transform(sentences)
    feature_names = vectorizer.get_feature_names()
    nmf = NMF(n_components=min(n_topics, len(sentences)), random_state=1)
    W = nmf.fit_transform(tfidf)
    H = nmf.components_
    
    indx = np.argsort(H[0,:])[-n_keywords:]
    return [feature_names[i] for i in reversed(indx)]

In [61]:
nmf_keywords_separate(0)

['luisana',
 'star',
 'tv',
 'ap',
 'michael',
 'buble',
 'thursday',
 'argentina',
 'canadian',
 'lopilato']

In [54]:
s = 0
n = len(x)
for i in range(n):
    ans = nmf_keywords_separate(i)
    s += len(set(map(lambda x : x.lower(), y[i])) & set(ans))
s/n

  return np.sqrt(res * 2)


3.58

In [76]:
def svd_keywords_separate(text_id, n_keywords = 10):
    vectorizer = TfidfVectorizer(max_features = n_features,
                                 stop_words = 'english')
    sentences = nltk.sent_tokenize(x[text_id])
    tfidf = vectorizer.fit_transform(sentences)
    feature_names = vectorizer.get_feature_names()
    if len(sentences) < 2:
        U, S, Vt = scipy.linalg.svd(tfidf.toarray())
    else:
        U, S, Vt = scipy.sparse.linalg.svds(tfidf, 1)
    
    U = np.fliplr(U)
    Vt = np.flipud(Vt)
    S = S[::-1]
    
    indx = np.argsort(Vt[0,:])[-n_keywords:]
    return [feature_names[i] for i in reversed(indx)]

In [77]:
s = 0
n = len(x)
for i in range(n):
    ans = svd_keywords_separate(i)
    s += len(set(map(lambda x : x.lower(), y[i])) & set(ans))
s/n

3.188888888888889

In [79]:
svd_keywords_separate(2)

['kenny',
 'little',
 'chance',
 'stars',
 'reported',
 'script',
 'nightmarish',
 'probably',
 'genuinely',
 'way']