In [1]:
import pandas as pd
import numpy as np
import nltk

from nltk import word_tokenize


In [2]:
# Fetch the 'punkt' tokenizer data through NLTK's downloader.
# NOTE(review): presumably this is the resource word_tokenize (imported above) relies on;
# confirm the resource name against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Sonia\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Load the news dataset from the working directory and preview the first rows.
# Later cells read the 'text' and 'labels' columns of this frame.
csv_path = 'bbc_text_cls.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [21]:
# Build the vocabulary (token -> integer id) and, in the same pass, encode every
# document as the list of ids of its lower-cased tokens.
index = 0  # next id to hand out; ids are assigned in order of first appearance
word2index = {}
tokenized_docs = []
for doc in df['text']:
    doc_as_int = []  # this document as a sequence of token ids
    for token in word_tokenize(doc.lower()):
        token_id = word2index.get(token)
        if token_id is None:
            # First time this token is seen anywhere: register it under the next free id
            token_id = index
            word2index[token] = token_id
            index += 1
        doc_as_int.append(token_id)
    tokenized_docs.append(doc_as_int)

In [54]:
# Reverse mapping: index2word[i] is the token whose id is i.
# Ids were handed out as 0, 1, 2, ... in the order tokens were first inserted into
# word2index, and dicts preserve insertion order, so the list of KEYS is already in
# id order. (Taking the values would only give back the ids themselves.)
# A dict {v: k for k, v in word2index.items()} would also work, but a list is lighter.
index2word = list(word2index)

In [55]:
# Number of documents = number of rows in the dataframe
N = df.shape[0]

In [56]:
# Vocabulary size: number of distinct tokens (keys) in word2index
V = len(word2index)

In [57]:
# Allocate the dense term-frequency matrix: one row per document, one column per
# vocabulary id, all counts starting at zero.
# (A count vectorizer could build the same matrix.)
tf = np.zeros(shape=(N, V), dtype=np.float64)

In [58]:
# Populate term-frequency counts: tf[d, t] = occurrences of token id t in document d.
# Reset the matrix first so that re-running this cell does not pile new counts on
# top of the ones from a previous run.
tf.fill(0)
# The loop variable is doc_index rather than index, so the vocabulary counter of
# that name is not overwritten.
for doc_index, doc_as_int in enumerate(tokenized_docs):
    for token_index in doc_as_int:
        tf[doc_index, token_index] += 1


In [59]:
# Document frequency: in how many documents each token occurs at least once.
# Reducing over axis 0 (documents) keeps one value per vocabulary column, shape (V,).
document_freq = (tf > 0).sum(axis=0)
# Inverse document frequency with a +1 in the denominator; the log squashes the ratio.
idf = np.log(np.divide(N, 1 + document_freq))
# TF-IDF: idf broadcasts across the rows of tf, scaling each column by its IDF.
tf_idf = np.multiply(tf, idf)

In [60]:
# Fix the seed of NumPy's global random generator so the random document picked
# later is the same on every run.
SEED = 8008
np.random.seed(SEED)

In [61]:
# Check a random doc, show the top 5 terms in terms of TF-IDF score
i = np.random.choice(N)
row = df.iloc[i]
print("Label:", row['labels'])
print("Text:", row['text'].split("\n", 1)[0])  # first line of the document only
print("Top 5 terms:")

scores = tf_idf[i]
indices = (-scores).argsort()  # token ids ordered from highest to lowest score

# Ids were assigned 0, 1, 2, ... in the order tokens were first inserted into
# word2index, and dicts preserve insertion order, so the list of keys maps
# id -> token by position. Build it once instead of scanning the whole
# dictionary for every term.
id_to_token = list(word2index)
for j in indices[:5]:
    print(id_to_token[j])

Label: sport
Text: IAAF awaits Greek pair's response
Top 5 terms:
iaaf
kenteris
thanou
greek
tests
