In [1]:
import pandas as pd

df = pd.read_csv(r"C:\Users\lilit\PycharmProjects\nlp-course-2025.1\Lilit Mnatsakanyan\data\bbc-text.csv")

In [2]:
df

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...
...,...,...
2220,business,cars pull down us retail figures us retail sal...
2221,politics,kilroy unveils immigration policy ex-chatshow ...
2222,entertainment,rem announce new glasgow concert us band rem h...
2223,politics,how political squabbles snowball it s become c...


In [9]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [10]:
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lilit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\lilit\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lilit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    return tokens

In [12]:
df['tokens'] = df['text'].apply(preprocess_text)


In [14]:
df

Unnamed: 0,category,text,tokens
0,tech,tv future in the hands of viewers with home th...,"[tv, future, hands, viewers, home, theatre, sy..."
1,business,worldcom boss left books alone former worldc...,"[worldcom, boss, left, books, alone, former, w..."
2,sport,tigers wary of farrell gamble leicester say ...,"[tigers, wary, farrell, gamble, leicester, say..."
3,sport,yeading face newcastle in fa cup premiership s...,"[yeading, face, newcastle, fa, cup, premiershi..."
4,entertainment,ocean s twelve raids box office ocean s twelve...,"[ocean, twelve, raids, box, office, ocean, twe..."
...,...,...,...
2220,business,cars pull down us retail figures us retail sal...,"[cars, pull, us, retail, figures, us, retail, ..."
2221,politics,kilroy unveils immigration policy ex-chatshow ...,"[kilroy, unveils, immigration, policy, exchats..."
2222,entertainment,rem announce new glasgow concert us band rem h...,"[rem, announce, new, glasgow, concert, us, ban..."
2223,politics,how political squabbles snowball it s become c...,"[political, squabbles, snowball, become, commo..."


In [15]:
from collections import Counter


In [16]:
def build_vocabulary(tokenized_texts):
    vocab = set()
    for tokens in tokenized_texts:
        vocab.update(tokens)
    return sorted(vocab)

vocab = build_vocabulary(df['tokens'])
vocab_index = {word: i for i, word in enumerate(vocab)}

In [17]:
len(vocab)

30190

In [18]:
def text_to_bow(tokens, vocab_index):
    vec = [0] * len(vocab_index)
    token_counts = Counter(tokens)
    for token, count in token_counts.items():
        if token in vocab_index:
            vec[vocab_index[token]] = count
    return vec

In [19]:
sample_vec = text_to_bow(df['tokens'][0], vocab_index)


In [20]:
sample_vec[:20]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [21]:
bow_vectors = [text_to_bow(tokens, vocab_index) for tokens in df['tokens']]


In [22]:
import math


In [23]:
def compute_tf(tokens):
    tf = Counter(tokens)
    total_terms = len(tokens)
    return {word: count / total_terms for word, count in tf.items()}


In [26]:
from tqdm import tqdm
from math import log

def compute_idf(corpus):
    N = len(corpus)
    idf_dict = {}
    all_tokens = set([word for tokens in corpus for word in tokens])

    for word in tqdm(all_tokens, desc="Computing IDF"):
        df = sum(1 for tokens in corpus if word in tokens)
        idf_dict[word] = log(N / (1 + df))
    return idf_dict

In [27]:
idf_scores = compute_idf(df['tokens'])


Computing IDF: 100%|██████████| 30190/30190 [02:27<00:00, 204.62it/s]


In [28]:
def compute_tfidf(tokens, idf_scores):
    tf = compute_tf(tokens)
    tfidf = {word: tf[word] * idf_scores.get(word, 0) for word in tf}
    return tfidf

In [29]:
sample_tfidf = compute_tfidf(df['tokens'][0], idf_scores)


In [30]:
sample_tfidf

{'tv': 0.07044891009600367,
 'future': 0.010327813898301442,
 'hands': 0.016465864215777636,
 'viewers': 0.027274046752097318,
 'home': 0.008802506791119928,
 'theatre': 0.009425826004158352,
 'systems': 0.00826463580527434,
 'plasma': 0.013828183422409102,
 'highdefinition': 0.04265568908711416,
 'tvs': 0.02456738080343062,
 'digital': 0.012844039281540926,
 'video': 0.013028336065120553,
 'recorders': 0.02425023819341798,
 'moving': 0.008429597237307827,
 'living': 0.008606436302725735,
 'room': 0.009003589500653048,
 'way': 0.0069944866512508036,
 'people': 0.0176359341033261,
 'watch': 0.0335828280437538,
 'radically': 0.013045741822609263,
 'different': 0.006137656122257788,
 'five': 0.00434883535508087,
 'years': 0.0058833113222466395,
 'time': 0.005363375967899377,
 'according': 0.004659388701777986,
 'expert': 0.01066392227177854,
 'panel': 0.018364718825789772,
 'gathered': 0.010065342216108393,
 'annual': 0.006977832295195966,
 'consumer': 0.014390116469232488,
 'electronics'