In [49]:
# Mount Google Drive (Colab runtime) at /content/drive; the dataset and
# checkpoint paths used later in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [50]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

def load_data(file_path: str) -> pd.DataFrame:
    """Read the CSV file at ``file_path`` into a DataFrame.

    Args:
        file_path: Location of the CSV file to read.

    Returns:
        The parsed table, read with pandas' ``"python"`` parser engine.
    """
    frame = pd.read_csv(file_path, engine="python")
    return frame

def basic_cleaning(data: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``data`` whose ``"review"`` column has been normalised.

    Steps applied to every review, in order:

    1. lower-case the text;
    2. replace HTML-like tags (``<...>``) with a single space;
    3. remove URLs (any run of non-whitespace starting with ``http``);
    4. collapse whitespace runs to one space and strip both ends.

    Tags are replaced with a space rather than removed outright so that the
    words on either side of a tag such as ``<br />`` are not fused together
    (``"good<br />movie"`` -> ``"good movie"``, not ``"goodmovie"``). This also
    stops a URL that is immediately followed by a tag from swallowing the word
    after it.

    The input frame is left untouched; a new frame is returned, so re-running
    a notebook cell that calls this function always starts from the same data.

    Args:
        data: Frame containing a text column named ``"review"``.

    Returns:
        A new DataFrame with the same columns and the cleaned ``"review"``.

    Raises:
        KeyError: If ``data`` has no ``"review"`` column.
    """
    cleaned_reviews = (
        data["review"]
        .str.lower()
        # A space (not "") keeps neighbouring words apart; the whitespace
        # normalisation below removes any doubled spaces this introduces.
        .str.replace(r"<.*?>", " ", regex=True)
        .str.replace(r"http\S+", "", regex=True)
        .str.replace(r"\s+", " ", regex=True)
        .str.strip()
    )
    # ``assign`` builds a new frame instead of writing into the caller's one.
    return data.assign(review=cleaned_reviews)

def preprocess_data(data: pd.DataFrame) -> pd.DataFrame:
    """Run the preprocessing pipeline on ``data`` and return the result.

    The pipeline currently consists of a single stage, ``basic_cleaning``.

    Args:
        data: Frame to preprocess; forwarded unchanged to ``basic_cleaning``.

    Returns:
        Whatever ``basic_cleaning`` returns for ``data``.
    """
    cleaned = basic_cleaning(data)
    return cleaned

def encode_data(y_train, y_test):
    """Encode train and test labels with one shared ``LabelEncoder``.

    The encoder is fitted on ``y_train`` only; the mapping learned there is
    then applied to ``y_test``.

    Args:
        y_train: Training labels used to fit the encoder.
        y_test: Test labels, transformed with the already-fitted encoder.

    Returns:
        A ``(encoded_train, encoded_test)`` tuple.
    """
    label_encoder = LabelEncoder()
    encoded_train = label_encoder.fit_transform(y_train)
    return encoded_train, label_encoder.transform(y_test)




**Train**

In [51]:

from transformers import AutoTokenizer,AutoModelForSequenceClassification,Trainer,TrainingArguments,DataCollatorWithPadding
import pandas as pd
from datasets import Dataset
import os
# Single source of truth for the checkpoint name, so the tokenizer and the
# model are always loaded from the same pretrained weights/vocabulary.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Freeze every parameter of the pretrained base model ...
for param in model.base_model.parameters():
    param.requires_grad = False

# ... and leave only the classification head trainable.
for param in model.classifier.parameters():
    param.requires_grad = True

print(f"Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad)}")

def compute_metrics(pred):
    """Compute accuracy, precision, recall and F1 for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (reference labels) and
            ``predictions`` (an array whose last axis is arg-maxed to obtain
            the predicted class of each example).

    Returns:
        dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"``; precision/recall/F1 use ``average="binary"``.
    """
    # Imported locally so the function also works on a fresh kernel where no
    # earlier cell has brought these names into scope (otherwise the first
    # evaluation fails with NameError).
    from sklearn.metrics import accuracy_score, precision_recall_fscore_support

    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")
    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

# Collator handed to the Trainer; it is built around the shared tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def train(train, test):
    """Fine-tune the module-level ``model`` and save it to Google Drive.

    Training runs for 3 epochs with evaluation and checkpointing after every
    epoch; the checkpoint with the best ``accuracy`` is reloaded at the end.
    If the checkpoint directory already holds a ``checkpoint-*`` folder from a
    previous run, training resumes from the most recent one; otherwise it
    starts from scratch.

    Args:
        train: Dataset passed to the ``Trainer`` as ``train_dataset``.
        test: Dataset passed to the ``Trainer`` as ``eval_dataset``.

    Returns:
        The ``Trainer`` instance after training, so callers can run further
        evaluation or prediction with it.
    """
    # Local import keeps this block self-contained; the helper returns the
    # newest ``checkpoint-*`` sub-folder of a directory, or None if none exist.
    from transformers.trainer_utils import get_last_checkpoint

    checkpoint_dir = "/content/drive/MyDrive/bert_checkpoints"
    final_model_dir = "/content/drive/MyDrive/bert_final_model"

    training_args = TrainingArguments(
        output_dir=checkpoint_dir,
        learning_rate=5e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=3,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        logging_strategy="epoch",
        logging_steps=50,
        num_train_epochs=3,
        report_to="none",
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train,
        eval_dataset=test,
        processing_class=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # Resume only from an actual checkpoint folder. Passing the parent output
    # directory itself (or resuming merely because the directory contains some
    # unrelated file) makes the Trainer fail to find a valid checkpoint.
    last_checkpoint = None
    if os.path.isdir(checkpoint_dir):
        last_checkpoint = get_last_checkpoint(checkpoint_dir)

    # ``resume_from_checkpoint=None`` means "start a fresh run".
    trainer.train(resume_from_checkpoint=last_checkpoint)

    trainer.save_model(final_model_dir)
    # ``processing_class`` is the accessor matching the constructor argument
    # used above; ``Trainer.tokenizer`` is its deprecated alias.
    trainer.processing_class.save_pretrained(final_model_dir)

    return trainer




def build_dataset(texts, labels, max_length: int = 128):
    """Build a tokenized ``Dataset`` in torch format from texts and labels.

    Args:
        texts: The input texts. Anything with a ``tolist()`` method (e.g. a
            pandas Series) or any other iterable of strings.
        labels: One label per text, stored in the ``"labels"`` column.
        max_length: Maximum number of tokens kept per text; longer inputs are
            truncated. Defaults to 128.

    Returns:
        A ``Dataset`` holding the tokenizer outputs and ``"labels"``; the raw
        ``"text"`` column is dropped and the format is set to ``"torch"``.
    """
    text_list = texts.tolist() if hasattr(texts, "tolist") else list(texts)
    dataset = Dataset.from_dict({
        "text": text_list,
        "labels": labels
    })

    def _tokenize_batch(batch):
        # No padding here: examples keep their own length at this stage.
        return tokenizer(
            batch["text"],
            truncation=True,
            padding=False,
            max_length=max_length
        )

    dataset = dataset.map(_tokenize_batch, batched=True)
    dataset = dataset.remove_columns(["text"])
    dataset.set_format("torch")

    return dataset









Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trainable parameters: 1538


In [None]:
# from src.preprocessing import (
#     load_data,
#     preprocess_data,
#     vectorize_data,
#     encode_data
# )
# from src.train import tokenize_text
from sklearn.model_selection import train_test_split

def main(file_path: str = "/content/drive/MyDrive/nlp-sentiment-analysis/data/IMDB_Dataset.csv"):
    """Run the whole pipeline: load, clean, split, encode, tokenize, train.

    Args:
        file_path: CSV file with a ``"review"`` text column and a
            ``"sentiment"`` label column. Defaults to the dataset location on
            the mounted Google Drive.
    """
    data = load_data(file_path)
    data = preprocess_data(data)

    X = data["review"]
    y = data["sentiment"]

    # Fixed seed and stratification: the 80/20 split is reproducible and keeps
    # the label proportions of ``y`` in both parts.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.20,
        random_state=42,
        shuffle=True,
        stratify=y
    )

    y_train_enc, y_test_enc = encode_data(y_train, y_test)
    train_ds = build_dataset(X_train, y_train_enc)
    test_ds = build_dataset(X_test, y_test_enc)

    train(train_ds, test_ds)


if __name__ == "__main__":
    main()


Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
