In [1]:
from make_dataset import KaggleMovieReviewsDataset
MODEL_NAME = "nlptown/bert-base-multilingual-uncased-sentiment" 

  from .autonotebook import tqdm as notebook_tqdm


In [16]:
# --- 2. Load Dataset ---
# We use the same dataset class because your context logic (adding full sentence) is excellent
print(f"Loading Data for {MODEL_NAME}...")
dataset_tool = KaggleMovieReviewsDataset(
    train_tsv_path='../resources/sentiment-analysis-on-movie-reviews/train.tsv',
    test_tsv_path='../resources/sentiment-analysis-on-movie-reviews/test.tsv',
    train_pct=0.85, 
    seed=42, 
    add_context_columns=True, 
    context_dropout_p=0.2
)
ds_splits = dataset_tool.get_train_datasetdict()
test_dataset = dataset_tool.get_kaggle_test_dataset()

# Helper to swap context column
def use_context(ds):
    if "text_with_context" in ds.column_names:
        return ds.remove_columns([c for c in ["text"] if c in ds.column_names]).rename_column("text_with_context", "text")
    return ds

train_ds = use_context(ds_splits["train"]).shuffle(seed=42)
eval_ds  = use_context(ds_splits["test"])
test_ds = use_context(test_dataset)

# rename label column to labels for Trainer compatibility
train_ds = train_ds.rename_column("label", "labels")
eval_ds = eval_ds.rename_column("label", "labels")

Loading Data for nlptown/bert-base-multilingual-uncased-sentiment...


In [3]:
print(train_ds)
# print(f"Unique full_sentence values: {train_ds.unique('full_sentence')}")
print(f"Count of unique full_sentence values: {len(train_ds.unique('SentenceId'))} vs total dataset size: "
      f"{len(train_ds)} vs number of unique phrase {len(train_ds.unique('PhraseId'))}")

Dataset({
    features: ['PhraseId', 'SentenceId', 'labels', 'full_sentence', 'text'],
    num_rows: 132608
})
Count of unique full_sentence values: 7249 vs total dataset size: 132608 vs number of unique phrase 132608


In [4]:
# for test split now
print(eval_ds)
print(f"Count of unique full_sentence values: {len(eval_ds.unique('SentenceId'))} vs total dataset size: "
	  f"{len(eval_ds)} vs number of unique phrase {len(eval_ds.unique('PhraseId'))}")

Dataset({
    features: ['PhraseId', 'SentenceId', 'labels', 'full_sentence', 'text'],
    num_rows: 23452
})
Count of unique full_sentence values: 1280 vs total dataset size: 23452 vs number of unique phrase 23452


In [5]:
# check for any sentence id that appears in both train and eval
train_sentence_ids = set(train_ds.unique("SentenceId"))
eval_sentence_ids = set(eval_ds.unique("SentenceId"))
overlap_sentence_ids = train_sentence_ids.intersection(eval_sentence_ids)
print(f"Number of overlapping SentenceIds between train and eval: {len(overlap_sentence_ids)}")

Number of overlapping SentenceIds between train and eval: 0


In [6]:
# print an example of train
print("Example from train:")
print(train_ds[0])

Example from train:
{'PhraseId': 90253, 'SentenceId': 4697, 'labels': 3, 'full_sentence': 'Has all the values of a straight-to-video movie , but because it has a bigger-name cast , it gets a full theatrical release .', 'text': 'a bigger-name cast </s> Has all the values of a straight-to-video movie , but because it has a bigger-name cast , it gets a full theatrical release .'}


In [7]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [8]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [17]:
tokenized_train_ds = train_ds.map(preprocess_function, batched=True)
tokenized_eval_ds = eval_ds.map(preprocess_function, batched=True)
tokenized_test_ds = test_ds.map(preprocess_function, batched=True)

Map: 100%|██████████| 132608/132608 [00:11<00:00, 11249.99 examples/s]
Map: 100%|██████████| 23452/23452 [00:01<00:00, 15168.08 examples/s]
Map: 100%|██████████| 66292/66292 [00:02<00:00, 22886.95 examples/s]


In [10]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [11]:
import evaluate

accuracy = evaluate.load("accuracy")

In [12]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [13]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=5
)

Loading weights: 100%|██████████| 201/201 [00:00<00:00, 783.02it/s, Materializing param=classifier.weight]                                      


In [14]:
import torch

print("Reseting Model Weights...")
del model
torch.cuda.empty_cache()

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=5
)

training_args = TrainingArguments(
    output_dir="temp",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    warmup_steps=100,
    fp16=False,
	bf16=False,
    weight_decay=0.01,
    max_grad_norm=1.0,
    eval_strategy="steps",
    eval_steps=1200,
    save_strategy="epoch",
    logging_steps=200,
    load_best_model_at_end=False,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_eval_ds,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Reseting Model Weights...


Loading weights: 100%|██████████| 201/201 [00:00<00:00, 738.97it/s, Materializing param=classifier.weight]                                      


Step,Training Loss,Validation Loss,Accuracy
1200,0.866025,0.889229,0.639732
2400,0.845551,0.869759,0.649369
3600,0.813005,0.859392,0.648559
4800,0.794825,0.876205,0.64067
6000,0.782774,0.830289,0.660455
7200,0.772245,0.82089,0.662161
8400,0.698782,0.857987,0.66425
9600,0.647109,0.869405,0.662459
10800,0.634813,0.863416,0.662204
12000,0.654909,0.855126,0.664293


Writing model shards: 100%|██████████| 1/1 [00:02<00:00,  2.22s/it]
Writing model shards: 100%|██████████| 1/1 [00:06<00:00,  6.75s/it]


TrainOutput(global_step=16576, training_loss=0.737772612037806, metrics={'train_runtime': 4210.6324, 'train_samples_per_second': 62.987, 'train_steps_per_second': 3.937, 'total_flos': 1.0620163582296864e+16, 'train_loss': 0.737772612037806, 'epoch': 2.0})

In [15]:
# Check a random training example
import random
idx = random.randint(0, 1000)
sample_input_ids = tokenized_train_ds[idx]['input_ids']
decoded = tokenizer.decode(sample_input_ids)

print(f"Original Label: {tokenized_train_ds[idx]['labels']}")
print(f"Decoded Input:\n{decoded}")

# Check: Do we see the full sentence?
if tokenizer.sep_token in decoded:
    print("\n[PASS] Context separator found! The model is seeing the context.")
else:
    print("\n[FAIL] No separator found. The model might only be seeing the phrase!")

Original Label: 2
Decoded Input:
[CLS] of achronological vignettes < / s > unfolds in a series of achronological vignettes whose cumulative effect is chilling. [SEP]

[PASS] Context separator found! The model is seeing the context.


In [18]:
import numpy as np
import pandas as pd

# 1. Predict on the Test Set
print("Running predictions on test set...")
predictions = trainer.predict(tokenized_test_ds)

# 2. Convert Logits to Class IDs (0,1,2,3,4)
preds = np.argmax(predictions.predictions, axis=1)

# 3. Create Submission DataFrame
submission = pd.DataFrame({
    "PhraseId": test_dataset["PhraseId"],
    "Sentiment": preds
})

# 4. Save
submission.to_csv("submission_bert_668.csv", index=False)
print("Saved submission_bert_668.csv")

Running predictions on test set...


Saved submission_bert_668.csv
