In [None]:
import re
import os
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import pandas as pd
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

In [None]:
import nltk

# NLTK data packages requested for the tokenizer, stopword list, WordNet
# lemmatizer and POS tagger used by the cells below.
# Recent NLTK releases (the ones that ship 'punkt_tab') load the English tagger
# from 'averaged_perceptron_tagger_eng', so that id is requested as well.
# NOTE(review): on older releases an unknown id should only log an error and
# return False from nltk.download — confirm against the installed version.
NLTK_RESOURCES = (
    'punkt',
    'stopwords',
    'wordnet',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'punkt_tab',
)
for resource_id in NLTK_RESOURCES:
    nltk.download(resource_id)

# Shared, module-level resources reused by every preprocess() call.
lemmatizer = WordNetLemmatizer()
# A set gives O(1) membership tests when filtering tokens.
stopwords_set = set(stopwords.words('english'))



In [None]:
def get_wordnet_pos(tag):
    """Translate an NLTK POS tag into the corresponding WordNet POS constant.

    The tag's leading letter selects the result: 'J' -> ``wordnet.ADJ``,
    'V' -> ``wordnet.VERB``, 'N' -> ``wordnet.NOUN``, 'R' -> ``wordnet.ADV``.
    Any other tag falls back to ``wordnet.NOUN``.
    """
    # (tag prefix, attribute name on ``wordnet``), checked in this order.
    prefix_table = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attr_name in prefix_table:
        if tag.startswith(prefix):
            return getattr(wordnet, attr_name)
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

In [None]:
# Read the provided splits from the working directory.
full_train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Hold out a fixed 500-row validation set from the training data; the seed
# keeps the split reproducible across runs.
train_df, val_df = train_test_split(
    full_train_df,
    random_state=42,
    test_size=500,
)

# Written with to_csv defaults, i.e. the row index is included in the file.
val_df.to_csv('validation.csv')

In [None]:
def normalize_numbers(text):
    """Replace standalone integers with ``'NUM'``, keeping years 1900-2100.

    A "standalone integer" is a run of digits delimited by word boundaries,
    so digits glued to letters (e.g. ``abc123``) are left untouched.

    Only exactly four-digit tokens can qualify as a year. Checking the length
    before calling ``int`` means zero-padded tokens such as ``01950`` are not
    mistaken for years, and very long digit runs never reach ``int`` (which
    raises ``ValueError`` beyond the interpreter's digit limit on
    Python >= 3.11).

    Args:
        text: Input string.

    Returns:
        The string with every non-year integer replaced by ``'NUM'``.
    """
    def replace_func(match):
        num = match.group()
        # Length check first: cheap, and guards the int() conversion.
        if len(num) == 4 and 1900 <= int(num) <= 2100:
            return num  # Keep years
        return 'NUM'

    return re.sub(r'\b\d+\b', replace_func, text)

In [None]:
def preprocess(text):
    """Clean one raw document and return it as a space-joined string of lemmas.

    Steps, in order: strip non-ASCII characters, strip punctuation, replace
    numbers via ``normalize_numbers``, lower-case, tokenize, drop tokens found
    in ``stopwords_set``, then POS-tag and lemmatize what remains.

    Lower-casing runs after number normalization, so any upper-case
    placeholder inserted by ``normalize_numbers`` appears lower-cased in the
    output.

    Args:
        text: Raw document. ``None`` and NaN (what pandas produces for empty
            CSV cells) are treated as an empty document; any other non-string
            value is converted with ``str``.

    Returns:
        The cleaned document as a single string; ``''`` for missing input.
    """
    # Missing cells reach .apply() as None/NaN and would crash re.sub.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = re.sub(r'[^\x00-\x7F]+', '', text)  # remove non-ASCII
    text = re.sub(r'[^\w\s]', '', text)        # remove punctuation
    text = normalize_numbers(text)             # replace numbers except years
    text = text.lower()
    tokens = word_tokenize(text)
    # Membership test against the prebuilt set, not the stopwords corpus module.
    tokens = [word for word in tokens if word not in stopwords_set]

    # Lemmatize each token using its POS tag mapped to a WordNet POS.
    tagged = pos_tag(tokens)
    lemmatized_tokens = [lemmatizer.lemmatize(w, get_wordnet_pos(pos)) for w, pos in tagged]

    return ' '.join(lemmatized_tokens)

In [None]:
print("Preprocessing started..")

# Run preprocess() over both free-text columns of every split, overwriting
# the raw values in place ('text' first, then 'title', for each frame).
list1 = [train_df, test_df, val_df]
for frame in list1:
    for column in ('text', 'title'):
        frame[column] = frame[column].apply(preprocess)

In [None]:
# Persist the three preprocessed splits (without the index column) under
# ./part_A_outputs, creating the directory on first run.
output_dir = './part_A_outputs'
os.makedirs(output_dir, exist_ok=True)

splits_to_save = (
    ('train', train_df),
    ('validation', val_df),
    ('test', test_df),
)
for stem, frame in splits_to_save:
    frame.to_csv(f'{output_dir}/{stem}.csv', index=False)

In [None]:
# Report the shortest and longest training document, measured in tokens.
print(type(train_df))

token_counts = [len(word_tokenize(document)) for document in train_df['text']]

# The infinities are the fallbacks reported when there are no documents.
min_tokens = min(token_counts, default=float('inf'))
max_tokens = max(token_counts, default=float('-inf'))

print(f"minimum length: {min_tokens}\nmaximum length: {max_tokens}")


In [None]:
import numpy as np
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
import os
import pandas as pd

# Step 1: Reload the preprocessed training set and count tokens per document
train_df = pd.read_csv(os.path.join('.','part_A_outputs','train.csv'))
# Documents that preprocessing reduced to an empty string are read back from
# the CSV as NaN; restore them to '' so word_tokenize receives a string.
train_df['text'] = train_df['text'].fillna('')
train_df['tokens'] = train_df['text'].apply(word_tokenize)
train_df['token_len'] = train_df['tokens'].apply(len)

# Step 2: Plot distribution (optional but useful)
plt.figure(figsize=(10, 6))
train_df['token_len'].hist(bins=50)
plt.xlabel("Token Length")
plt.ylabel("Frequency")
plt.title("Distribution of Token Lengths in Training Data")
plt.grid(True)
plt.show()

# Step 3: Calculate 85th percentile threshold (truncated to a whole token count)
threshold_85 = int(np.percentile(train_df['token_len'], 85))
print(f"85th Percentile Threshold: {threshold_85:.2f} tokens")

# Step 4: Truncate every document to at most threshold_85 tokens.
# No rows are dropped; longer documents are cut, shorter ones are unchanged.
train_df['text'] = train_df['tokens'].apply(lambda tokens : ' '.join(tokens[:threshold_85]))

# Drop the helper columns so the saved file keeps its original schema.
train_df = train_df.drop(columns=['token_len', 'tokens'])
# NOTE: this overwrites the file read in Step 1, so re-running the cell
# recomputes the percentile on already-truncated text.
train_df.to_csv('./part_A_outputs/train.csv', index = False)
