In [1]:
import pandas as pd
import numpy as np
import nltk

from nltk import word_tokenize

In [2]:
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\te20312262\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
df = pd.read_csv("bbc_text_cls.csv")

In [5]:
df.head()

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [7]:
# populate word2idx
# convert documents into sequence of ints/ ids / indices
idx =0
word2idx = {}
tokenized_docs=[]
for row in df["text"]:
    doc_as_int = []
    words =word_tokenize(row.lower())
    for word in words:
        if word not in word2idx:
            word2idx[word] = idx
            idx+=1

        #save for later use
        doc_as_int.append(word2idx[word])
    tokenized_docs.append(doc_as_int)

In [8]:
# reverse mapping
idx2word = {v:k for k,v in word2idx.items()}

In [9]:
# number of documents
N= len(df["text"])

In [10]:
# number of words
V = len(word2idx)

In [11]:
# instantiate term frequency count
# note: could have also used count vectorizer
tf = np.zeros((N,V))

In [13]:
# populate term frequency count
for i, doc_as_int in enumerate(tokenized_docs):
    for j in doc_as_int:
        tf[i,j]+=1

In [15]:
print(N,V)

2225 34762


In [18]:
len(np.sum(tf,axis=1))

2225

In [25]:
len(np.sum(tf>0,axis=0))

34762

In [28]:
# compute IDF
document_freq = np.sum(tf>0,axis =0) # document frequency (shape= (V,))
idf = np.log(N/document_freq)

In [29]:
tf_idf = tf*idf

In [41]:
tf

array([[1., 4., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 1., 1.]])

In [43]:
tf.shape

(2225, 34762)

In [42]:
idf

array([5.22260554, 2.3893922 , 2.86332511, ..., 7.70751219, 7.70751219,
       7.70751219])

In [44]:
idf.shape

(34762,)

In [40]:
tf_idf

array([[5.22260554, 9.5575688 , 2.86332511, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 2.86332511, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 7.70751219, 7.70751219,
        7.70751219]])

In [46]:
tf_idf.shape

(2225, 34762)

In [30]:
np.random.seed(123)

In [47]:
# pick a random document, show the top 5 terms (in terms of tf_idf score)
i = np.random.choice(N)
i=20
row = df.iloc[i]
print("Label:", row["labels"])
print("Text:", row["text"].split("\n")[0])
print("Top 5 terms:")

scores = tf_idf[i]
indices = (-scores).argsort()

for j in indices[:5]:
    print(idx2word[j])

Label: business
Text: Rank 'set to sell off film unit'
Top 5 terms:
rank
deluxe
demerger
casinos
leisure
