In [1]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shali\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shali\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\shali\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [2]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
import os

# Download NLTK resources (run once)
nltk.download('punkt')
nltk.download('stopwords')

# Set random seed for reproducibility
np.random.seed(42)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shali\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shali\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Define file paths
train_path = '../data/raw/AG_news/train.csv'
test_path = '../data/raw/AG_news/test.csv'

# Load datasets
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# Display basic info
print("Training Data Shape:", train_df.shape)
print("Test Data Shape:", test_df.shape)
print("\nTraining Data Sample:")
print(train_df.head())
print("\nTest Data Columns:", train_df.columns.tolist())

Training Data Shape: (120000, 3)
Test Data Shape: (7600, 3)

Training Data Sample:
   Class Index                                              Title  \
0            3  Wall St. Bears Claw Back Into the Black (Reuters)   
1            3  Carlyle Looks Toward Commercial Aerospace (Reu...   
2            3    Oil and Economy Cloud Stocks' Outlook (Reuters)   
3            3  Iraq Halts Oil Exports from Main Southern Pipe...   
4            3  Oil prices soar to all-time record, posing new...   

                                         Description  
0  Reuters - Short-sellers, Wall Street's dwindli...  
1  Reuters - Private investment firm Carlyle Grou...  
2  Reuters - Soaring crude prices plus worries\ab...  
3  Reuters - Authorities have halted oil export\f...  
4  AFP - Tearaway world oil prices, toppling reco...  

Test Data Columns: ['Class Index', 'Title', 'Description']


In [4]:
# Define preprocessing function
stop_words = set(stopwords.words('english'))
punctuation = set(string.punctuation)

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords and punctuation
    tokens = [word for word in tokens if word not in stop_words and word not in punctuation]
    # Join tokens back to text
    return ' '.join(tokens)

# Test the function
sample_text = train_df['Description'][0]  # Adjust column name if different
print("Original:", sample_text)
print("Preprocessed:", preprocess_text(sample_text))

Original: Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.
Preprocessed: reuters short-sellers wall street 's dwindling\band ultra-cynics seeing green


In [5]:
# Apply preprocessing to training and test data
train_df['processed_text'] = train_df['Description'].apply(preprocess_text)  # Adjust column name
test_df['processed_text'] = test_df['Description'].apply(preprocess_text)    # Adjust column name

# Combine label and processed text for saving
train_processed = train_df[['Class Index', 'processed_text']]
test_processed = test_df[['Class Index', 'processed_text']]

# Display sample of processed data
print("\nProcessed Training Data Sample:")
print(train_processed.head())


Processed Training Data Sample:
   Class Index                                     processed_text
0            3  reuters short-sellers wall street 's dwindling...
1            3  reuters private investment firm carlyle group ...
2            3  reuters soaring crude prices plus worries\abou...
3            3  reuters authorities halted oil export\flows ma...
4            3  afp tearaway world oil prices toppling records...


In [6]:
# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=5000)  # Limit to 5000 features for efficiency

# Fit and transform training data
train_tfidf = tfidf.fit_transform(train_processed['processed_text']).toarray()
test_tfidf = tfidf.transform(test_processed['processed_text']).toarray()

# Get feature names
feature_names = tfidf.get_feature_names_out()

print("TF-IDF Training Shape:", train_tfidf.shape)
print("TF-IDF Test Shape:", test_tfidf.shape)
print("Sample Features:", feature_names[:10])

TF-IDF Training Shape: (120000, 5000)
TF-IDF Test Shape: (7600, 5000)
Sample Features: ['00' '000' '01' '04' '05' '10' '100' '101' '10th' '11']


In [7]:
# Define output paths
processed_dir = '../data/processed/'
os.makedirs(processed_dir, exist_ok=True)

# Save preprocessed text
train_processed.to_csv(os.path.join(processed_dir, 'train_preprocessed.csv'), index=False)
test_processed.to_csv(os.path.join(processed_dir, 'test_preprocessed.csv'), index=False)

# Save TF-IDF vectors as pickle files
with open(os.path.join(processed_dir, 'train_tfidf.pkl'), 'wb') as f:
    pickle.dump(train_tfidf, f)
with open(os.path.join(processed_dir, 'test_tfidf.pkl'), 'wb') as f:
    pickle.dump(test_tfidf, f)

# Save TF-IDF vectorizer for later use
with open(os.path.join(processed_dir, 'tfidf_vectorizer.pkl'), 'wb') as f:
    pickle.dump(tfidf, f)

print("Preprocessed data saved to", processed_dir)

Preprocessed data saved to ../data/processed/
