## Project 3
Text Classification Transformer

In [None]:
# Project 3 new
import pandas as pd
from transformers import AutoTokenizer, TrainingArguments, Trainer, BertForSequenceClassification
from datasets import Dataset

# Load the CSV
df = pd.read_csv(r"C:\Users\timha\Project_3\redsox_commentary.csv")

# Encode sentiment labels
label_map = {'negative': 0, 'positive': 1}
df['label'] = df['sentiment'].map(label_map)

# Display the first few rows
print("First five rows:")
print(df.head())


First five rows:
             timestamp sentiment                             comment  label
0  2025-04-14 23:35:41  negative      Bullpen gave up the lead late.      0
1  2025-04-14 23:40:41  negative    Poor communication in the field.      0
2  2025-04-14 23:45:41  negative        Couldn't get the bats going.      0
3  2025-04-14 23:50:41  positive   Outstanding play by the outfield.      1
4  2025-04-14 23:55:41  negative  Pitching struggled from the start.      0


In [43]:
# Encoded sentiment labels
print("Encoding sentiment labels...")
label_map = {'negative': 0, 'positive': 1}
df['label'] = df['sentiment'].map(label_map)
print(f"Encoded labels:\n{df['label'].value_counts()}")

Encoding sentiment labels...
Encoded labels:
label
0    55
1    45
Name: count, dtype: int64


In [44]:
# 2. Initialized Tokenizer
print("Initializing tokenizer...")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
print("Tokenizer loaded.")

# 3. Tokenization Function
print("Defining tokenization function...")
def tokenize(batch):
    return tokenizer(batch['comment'], padding=True, truncation=True)

# 4. Converted to Hugging Face Dataset
print("Converting to Hugging Face dataset...")
dataset = Dataset.from_pandas(df)
print("Dataset created. Tokenizing the dataset...")
dataset = dataset.map(tokenize, batched=True)
print("Tokenization complete.")

Initializing tokenizer...
Tokenizer loaded.
Defining tokenization function...
Converting to Hugging Face dataset...
Dataset created. Tokenizing the dataset...


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Tokenization complete.


In [45]:
# 5. Removed Unnecessary Columns and Set Format for PyTorch
print("Cleaning up dataset by removing unnecessary columns...")
dataset = dataset.remove_columns(['timestamp', 'sentiment'])
dataset.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'])
print("Dataset ready for training.")

Cleaning up dataset by removing unnecessary columns...
Dataset ready for training.


In [46]:
# 6. Split the Dataset
print("Splitting dataset into train and test sets...")
split_dataset = dataset.train_test_split(test_size=0.2)
train_dataset = split_dataset['train']
test_dataset = split_dataset['test']
print(f"Split complete. Training samples: {len(train_dataset)}, Testing samples: {len(test_dataset)}")


Splitting dataset into train and test sets...
Split complete. Training samples: 80, Testing samples: 20


In [47]:
# 7. Loading the Pre-trained BERT Model
print("Loading pre-trained BERT model for sequence classification...")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
print("Model loaded.")

Loading pre-trained BERT model for sequence classification...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded.


In [48]:
# 8. Set Training Arguments
print("Setting up training arguments...")
training_args = TrainingArguments(
    output_dir="./results",             
    do_train=True,                      
    do_eval=True,                       
    learning_rate=2e-5,                 
    per_device_train_batch_size=8,      
    per_device_eval_batch_size=8,       
    num_train_epochs=3,                 
    weight_decay=0.01,                  
    logging_dir='./logs',               
)
print("Training arguments set.")

Setting up training arguments...
Training arguments set.


In [49]:
# 9. Initialized the Trainer
print("\nInitializing Trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)
print("Trainer initialized.")


Initializing Trainer...
Trainer initialized.


In [50]:
# 10. Training the Model
print("\nStarting model training...")
trainer.train()
print("Training complete!")


Starting model training...


Step,Training Loss


Training complete!


In [51]:
# 11. Save the Model
print("\nSaving the trained model...")
trainer.save_model("./sentiment_model")
print("Model saved.")


Saving the trained model...
Model saved.


In [52]:
# 12. Evaluate the Model (Optional)
print("\nEvaluating the model on the test dataset...")
results = trainer.evaluate(test_dataset)
print("Evaluation results:", results)


Evaluating the model on the test dataset...


Evaluation results: {'eval_loss': 0.24868842959403992, 'eval_runtime': 0.5142, 'eval_samples_per_second': 38.898, 'eval_steps_per_second': 5.835, 'epoch': 3.0}
