In [1]:
import pandas as pd
import re
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import defaultdict, Counter
import math

In [None]:
def preprocess_all(df):
    df['text'] = df['text'].apply(lambda x: x.encode("ascii", "ignore").decode())
    df['text'] = df['text'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
    df['text'] = df['text'].apply(lambda x: x.lower())
    df['text'] = df['text'].apply(lambda x: re.sub(r'\d+', '<NUM>', x))
    df['text'] = df['text'].apply(lambda x: word_tokenize(x))
    stop_words = set(stopwords.words('english'))
    df['text'] = df['text'].apply(lambda x: [word for word in x if word not in stop_words])
    lemmatizer = WordNetLemmatizer()
    df['text'] = df['text'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
    return df['text']

In [3]:
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

validation_df = train_df.sample(n=100, random_state=71)
new_train_df = train_df.drop(validation_df.index)

In [None]:
new_train_tokens = preprocess_all(new_train_df)
validation_tokens = preprocess_all(validation_df)
test_tokens = preprocess_all(test_df)

In [6]:
print(len(new_train_tokens))
print(len(validation_tokens))
print(len(test_tokens))

13779
100
100


In [15]:
min_count = max(1, int(len(new_train_df) * 0.01))

article_counts_uni = defaultdict(int)
for tokens in new_train_tokens:
    unique_tokens = set(tokens)
    for token in unique_tokens:
        article_counts_uni[token] += 1

unigram_vocab = {token for token, cnt in article_counts_uni.items() if cnt >= min_count}

def replace_oov(tokens, vocab):
    return [t if t in vocab else '<UNK>' for t in tokens]

new_train_tokens = [replace_oov(t, unigram_vocab) for t in new_train_tokens]
validation_tokens = [replace_oov(t, unigram_vocab) for t in validation_tokens]
test_tokens = [replace_oov(t, unigram_vocab) for t in test_tokens]

def build_ngram_vocab(tokenized_texts, n, min_count):
    article_counts = defaultdict(int)
    
    for tokens in tokenized_texts:
        ngrams = set(zip(*[tokens[i:] for i in range(n)]))  # Generate unique n-grams
        for ngram in ngrams:
            article_counts[ngram] += 1
    
    ngram_vocab = {ngram for ngram, count in article_counts.items() if count >= min_count}
    return ngram_vocab

bigram_vocab = build_ngram_vocab(new_train_tokens, n=2, min_count=min_count)
trigram_vocab = build_ngram_vocab(new_train_tokens, n=3, min_count=min_count)

In [14]:
print(len(unigram_vocab))
print(len(bigram_vocab))
print(len(trigram_vocab))

8590
6789
3782
