In [1]:
import requests
import pandas as pd
from io import BytesIO

def get_csv_as_df(url):
    res = requests.get(url)
    with BytesIO(res.content) as f:
        df = pd.read_csv(f)
    return df

In [2]:
df = get_csv_as_df('https://lazyprogrammer.me/course_files/nlp/bbc_text_cls.csv')

In [3]:
df.head()

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [4]:
import matplotlib.pyplot as plt
import json

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
import numpy as np
import nltk
from nltk import word_tokenize

In [5]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/user/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [6]:
num_docs = len(df['text'])

In [8]:
from collections import Counter
def tokens2idx(tokenized_docs):
    unique_tokens = {
        tok for token_doc in tokenized_docs
        for tok in token_doc 
    }
    return {token:i for i, token in enumerate(unique_tokens)}

def count_tokens(tokenized_docs):
    return [
        Counter(doc) for doc in tokenized_docs
    ]

def create_word2idx(series):
    make_tokenized_docs = lambda: (word_tokenize(doc.lower()) for doc in series)
    mapping = tokens2idx(make_tokenized_docs())
    counted_docs = count_tokens(make_tokenized_docs())
    return counted_docs, mapping

In [9]:
counted_docs, word2idx = create_word2idx(df['text'])

In [10]:
num_words = len(word2idx)
idx2word = [None]*num_words
for word, idx in word2idx.items():
    idx2word[idx] = word

In [13]:
def count2array(counted_docs, mapping):
    freq_array = np.zeros((num_docs, len(mapping)))
    for row_idx, count in enumerate(counted_docs):
        for tok_idx, word in enumerate(idx2word):
            freq_array[row_idx][tok_idx] = count.get(word, 0)
    return freq_array

In [18]:
tf = count2array(counted_docs, word2idx)

In [20]:
document_freq = np.sum(tf > 0, axis=0) # document frequency (shape = (V,))
idf = np.log(num_docs / document_freq)

In [21]:
tfidf = tf*idf

In [23]:
tfidf[0]

array([0., 0., 0., ..., 0., 0., 0.])

In [25]:
i = np.random.choice(num_docs)
row = df.iloc[i]
print("Label:", row['labels'])
print("Text:", row['text'].split("\n", 1)[0])
print("Top 5 terms:")

scores = tfidf[i]
indices = (-scores).argsort()

for j in indices[:5]:
  print(idx2word[j])

Label: politics
Text: Report attacks defence spending
Top 5 terms:
defence
projects
procurement
nao
delays
