In [2]:
import torch
# Report the installed PyTorch version and whether the MPS backend is usable.
torch_version = torch.__version__
mps_ready = torch.backends.mps.is_available()
print(f"PyTorch version: {torch_version}")
print(f"Is MPS (GPU) available? {mps_ready}")

PyTorch version: 2.8.0
Is MPS (GPU) available? True


In [3]:
# Topic names in label order; the keys of class_idx start at 1, not 0.
class_names = ("World", "Sports", "Business", "Sci/Tech")
class_idx = dict(enumerate(class_names, start=1))
print(class_idx)

{1: 'World', 2: 'Sports', 3: 'Business', 4: 'Sci/Tech'}


# Data Preprocessing Pipeline

### Choose and Load Tools (The Model and its Tokenizer)

In [None]:
from transformers import AutoModel , AutoTokenizer
from transformers import DistilBertTokenizer, DistilBertModel
import torch

# Keep the checkpoint name in one place so the tokenizer and the model
# are always loaded from the same pre-trained weights.
model_checkpoint = "distilbert-base-uncased"

# The tokenizer carries the vocabulary and text-splitting rules of the checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained(model_checkpoint)
# The pre-trained DistilBERT encoder itself.
model = DistilBertModel.from_pretrained(model_checkpoint)

# Smoke test: push a single sentence through the tokenizer and the model.
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='pt')
# This is inference only, so disable gradient tracking to avoid building an
# autograd graph that nothing will ever back-propagate through.
with torch.no_grad():
    output = model(**encoded_input)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [8]:
# Loading data
import os
from pathlib import Path

from datasets import load_dataset, DatasetDict

# Folder that holds train.csv and test.csv. Set the NEWS_DATA_DIR environment
# variable to run the notebook on another machine; when it is unset the
# original location is used, so existing runs behave exactly as before.
data_dir = Path(
    os.environ.get(
        "NEWS_DATA_DIR",
        "/Users/tushar04master/Documents/news-classifier/data",
    )
)
train_csv = str(data_dir / "train.csv")
test_csv = str(data_dir / "test.csv")

# Fail early with an actionable message instead of an error from inside the loader.
missing_files = [path for path in (train_csv, test_csv) if not Path(path).is_file()]
if missing_files:
    raise FileNotFoundError(
        f"CSV file(s) not found: {missing_files}. "
        "Set NEWS_DATA_DIR to the folder containing train.csv and test.csv."
    )

# Using Dataset library: each CSV becomes one named split of the returned DatasetDict.
ag_news_dataset = load_dataset('csv', data_files={'train': train_csv, 'test': test_csv})
print("Successfully loaded local CSV files:")
print(ag_news_dataset)


Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Successfully loaded local CSV files:
DatasetDict({
    train: Dataset({
        features: ['Class Index', 'Title', 'Description'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['Class Index', 'Title', 'Description'],
        num_rows: 7600
    })
})


### Create a Tokenization "Worker" Function

In [11]:
# Name of the pre-trained checkpoint whose tokenizer is loaded below.
model_checkpoint = "distilbert-base-uncased"
# AutoTokenizer picks the tokenizer class that matches the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_checkpoint
)

# Worker Function
def tokenize_fxn(examples, text_column="Description", max_length=256):
    """
    Tokenize one batch of dataset rows with the module-level ``tokenizer``.

    Args:
        examples: Batch that maps column names to lists of values, as handed
            over by ``map(..., batched=True)``.
        text_column: Name of the column whose text is tokenized.
            Defaults to ``"Description"``.
        max_length: Upper bound on the token sequence length; longer inputs
            are truncated. Defaults to 256.

    Returns:
        The tokenizer's output for the batch, unchanged.
    """
    return tokenizer(
        examples[text_column],
        truncation=True,
        max_length=max_length,
    )
    

In [12]:
# Apply the worker function to the entire dataset.
print("\nTokenizing the dataset...")
# map() runs tokenize_fxn over every example of each split.
# batched=True hands it chunks of rows at a time, which is much faster
# than calling it once per row.
tokenized_datasets = ag_news_dataset.map(
    function=tokenize_fxn,
    batched=True,
)
print("Tokenization complete!")


Tokenizing the dataset...


Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

Tokenization complete!


In [13]:
# Show the first training row, now carrying the tokenizer's added fields.
print("\nExample of a tokenized data point:")
first_train_example = tokenized_datasets['train'][0]
print(first_train_example)


Example of a tokenized data point:
{'Class Index': 3, 'Title': 'Wall St. Bears Claw Back Into the Black (Reuters)', 'Description': "Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.", 'input_ids': [101, 26665, 1011, 2460, 1011, 19041, 1010, 2813, 2395, 1005, 1055, 1040, 11101, 2989, 1032, 2316, 1997, 11087, 1011, 22330, 8713, 2015, 1010, 2024, 3773, 2665, 2153, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


# That's Cool 😙