In [None]:
import pandas as pd
import numpy as np
import nltk

from nltk import word_tokenize

In [None]:
# download the 'punkt_tab' tokenizer data package into the local nltk_data
# directory (presumably the data word_tokenize relies on - confirm)
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# read the dataset from a CSV file located relative to the working directory
df = pd.read_csv('bbc_text_cls.csv')

In [None]:
# preview the first rows to check which columns were loaded
df.head()

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [None]:
# Build the vocabulary (token -> integer id) and, in the same pass,
# encode every document as the list of ids of its lower-cased tokens.
word2idx = {}
tokenized_docs = []
for text in df['text']:
  # ids are handed out in first-seen order: an unseen token receives the
  # current vocabulary size, a known token keeps the id it already has
  encoded = [
    word2idx.setdefault(token, len(word2idx))
    for token in word_tokenize(text.lower())
  ]
  tokenized_docs.append(encoded)

# next unused id, i.e. the number of distinct tokens seen
idx = len(word2idx)

In [None]:
# peek at the first 10 (word, id) pairs of the vocabulary
list(word2idx.items())[:10]

[('ad', 0),
 ('sales', 1),
 ('boost', 2),
 ('time', 3),
 ('warner', 4),
 ('profit', 5),
 ('quarterly', 6),
 ('profits', 7),
 ('at', 8),
 ('us', 9)]

In [None]:
# view the first document's id sequence as a single-row 2-D array
np.reshape(tokenized_docs[0], (1, -1))

array([[  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
         13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
         16,  26,  27,  28,  17,  29,  30,  31,  23,  32,  27,  33,  34,
         35,  36,  37,  23,  38,  39,  40,  41,  27,  42,  28,   1,  37,
         43,  44,  45,  46,  47,  48,   1,  31,  12,  49,  50,  51,   1,
         52,  53,  15,  16,  17,  54,  28,  17,  55,  31,  56,   7,  57,
         58,  59,  60,  61,  33,  62,  63,   5,  64,   8,   4,  65,  27,
         46,  66,  67,  22,  68,  31,   3,   4,  49,  69,  70,  71,  72,
         35,  73,  74,  15,  37,  75,  41,  31,  76,  56,  77,  44,  78,
         27,  68,  27,  79,  80,  81,  82,  31,  72,  83,  84,  85,  40,
         23,  50,  51,   7,  57,  86,  87,  40,  23,  88,  24,  89,  31,
         90,  27,  23,  91,  49,  68,  92,  93,   5,  94,  95,  96,  52,
         74,  15,  69,  23,  97,  37,  98,  44,  99, 100,  31,  72, 101,
         16, 102,  85,  59, 103,  23, 104, 105, 106

In [None]:
# reverse mapping (id -> word)
# ids are consecutive integers starting at 0, so a plain list indexed by id
# would serve as well; a dict is kept here
idx2word = dict(zip(word2idx.values(), word2idx.keys()))

In [None]:
# peek at the first 10 (id, word) pairs of the reverse mapping
list(idx2word.items())[:10]

[(0, 'ad'),
 (1, 'sales'),
 (2, 'boost'),
 (3, 'time'),
 (4, 'warner'),
 (5, 'profit'),
 (6, 'quarterly'),
 (7, 'profits'),
 (8, 'at'),
 (9, 'us')]

In [None]:
# number of documents (one per entry of the 'text' column)
N = df['text'].shape[0]

In [None]:
# number of words
# (vocabulary size: how many distinct tokens were assigned an id)
V = len(word2idx)

In [None]:
# instantiate term-frequency matrix: one row per document, one column per
# vocabulary id, all counts starting at zero
# note: could have also used count vectorizer
tf = np.zeros(shape=(N, V), dtype=np.float64)

In [None]:
# populate term-frequency counts
# Each row is assigned rather than incremented, so re-running this cell on an
# already filled matrix cannot double the counts. np.bincount tallies all ids
# of one document in a single vectorized call instead of one Python-level
# increment per token.
for i, doc_as_int in enumerate(tokenized_docs):
  # the explicit integer dtype keeps an empty document (no tokens) valid;
  # minlength pads the tally to the full vocabulary width of the matrix
  tf[i] = np.bincount(np.asarray(doc_as_int, dtype=np.intp), minlength=tf.shape[1])

In [None]:
# compute IDF
# document frequency: in how many documents each term occurs at least once
document_freq = (tf > 0).sum(axis=0)  # shape = (V,)
# inverse document frequency: log of (number of documents / document frequency)
idf = np.log(N / document_freq)

In [None]:
# compute TF-IDF
# element-wise product; idf is broadcast across the rows (documents) of tf
tf_idf = np.multiply(tf, idf)

In [None]:
# seed NumPy's global random generator so later random draws are repeatable
np.random.seed(123)

In [None]:
# pick a random document, show the top 5 terms (in terms of tf_idf score)
i = np.random.choice(N)
row = df.iloc[i]
headline = row['text'].split("\n", 1)[0]  # only the text before the first newline
print("Label:", row['labels'])
print("Text:", headline)
print("Top 5 terms:")

# order the term ids of this document from highest to lowest tf-idf score
scores = tf_idf[i]
indices = np.argsort(-scores)

for term_id in indices[:5]:
  print(idx2word[term_id])

Label: sport
Text: Athens memories soar above lows
Top 5 terms:
paula
athens
1500m
her
kelly
