In [1]:
import pandas as pd
from transformers import BertTokenizer

def load_data(file_path):
    df = pd.read_csv(file_path)
    return df["text"].tolist(), df["label"].tolist()

def tokenize_texts(texts, tokenizer, max_len=128):
    return tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=max_len,
        return_tensors="pt"
    )


# New Section

In [2]:
import warnings
import numpy as np
warnings.filterwarnings("ignore")

In [3]:
import os
from sklearn.model_selection import train_test_split
import pandas as pd

In [6]:
def load_data(file_path):
    """
    Load text and labels from a CSV file.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"The file {file_path} does not exist.")
    df = pd.read_csv(file_path)
    if "text" not in df.columns or "label" not in df.columns:
        raise ValueError("CSV file must contain 'text' and 'label' columns.")
    return df["text"].tolist(), df["label"].tolist()

def split_data(texts, labels, test_size=0.2, random_state=42):
    """
    Split the dataset into training and validation sets.
    """
    return train_test_split(
        texts, labels, test_size=test_size, random_state=random_state
    )

def save_data(file_path, texts, labels):
    """
    Save processed data to a CSV file.
    """
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    data = {"text": [text.strip() for text in texts], "label": labels}
    df = pd.DataFrame(data)
    df.to_csv(file_path, index=False)
    print(f"Data saved to {file_path}")

if __name__ == "__main__":
    # Input and output paths
    INPUT_PATH = "/content/train.csv"  # Input raw data path
    OUTPUT_TRAIN_PATH = "/content/processed_train.csv"
    OUTPUT_VAL_PATH = "/content/processed_val.csv"

    # Load and preprocess the data
    texts, labels = load_data(INPUT_PATH)
    train_texts, val_texts, train_labels, val_labels = split_data(texts, labels)

    # Save processed train and validation data
    save_data(OUTPUT_TRAIN_PATH, train_texts, train_labels)
    save_data(OUTPUT_VAL_PATH, val_texts, val_labels)

Data saved to /content/processed_train.csv
Data saved to /content/processed_val.csv


In [7]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer

class SarcasmDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.encodings = tokenizer(
            texts,
            truncation=True,
            padding=True,
            max_length=max_length,
            return_tensors="pt"
        )
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            "input_ids": self.encodings["input_ids"][idx],
            "attention_mask": self.encodings["attention_mask"][idx],
            "labels": torch.tensor(self.labels[idx], dtype=torch.long),
        }

In [8]:
def train_model(train_data, val_data, model, tokenizer, epochs=3, batch_size=16):
    from torch.utils.data import DataLoader
    from torch.optim import AdamW

    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_data, batch_size=batch_size)
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

    for epoch in range(epochs):
        model.train()
        for batch in train_loader:
            optimizer.zero_grad()
            inputs = {key: val.to("cuda") for key, val in batch.items() if key != "labels"}
            labels = batch["labels"].to("cuda")
            outputs = model(**inputs, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
        print(f"Epoch {epoch + 1}: Loss = {loss.item()}")

    torch.save(model.state_dict(), "/content/sarcasm_model.pt")

def main():
    import torch
    from transformers import BertTokenizer, BertForSequenceClassification
    from torch.utils.data import DataLoader
    from torch.optim import AdamW
    from sklearn.model_selection import train_test_split

    # Load data
    texts, labels = load_data("/content/train.csv")
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        texts, labels, test_size=0.2
    )

    train_data = SarcasmDataset(train_texts, train_labels, tokenizer)
    val_data = SarcasmDataset(val_texts, val_labels, tokenizer)

    # Load model
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2).to("cuda")

    # Train
    train_model(train_data, val_data, model, tokenizer)

if __name__ == "__main__":
    main()
import torch
from transformers import BertTokenizer, BertForSequenceClassification

def load_model():
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
    model.load_state_dict(torch.load("../content/sarcasm_model.pt"))
    model.eval()
    return model

def predict(texts, model, tokenizer):
    # Ensure texts is a list, even for single input
    if isinstance(texts, str):
        texts = [texts]

    # Tokenize the input texts
    encodings = tokenizer(
        texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=128
    )

    # Move the tensors to the same device as the model
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    inputs = {key: val.to(device) for key, val in encodings.items()}

    # Perform inference
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=1)

    # Map predictions to human-readable labels
    label_mapping = {0: "not sarcasm", 1: "sarcasm"}  # Adjust based on your dataset
    labeled_predictions = [label_mapping[pred.item()] for pred in predictions]

    return labeled_predictions

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1: Loss = 0.21168480813503265
Epoch 2: Loss = 0.13522295653820038
Epoch 3: Loss = 0.05028877779841423


In [10]:
if __name__ == "__main__":
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = load_model()
    texts = input("Enter the text to predict: ")
    predictions = predict(texts, model, tokenizer)
    for ouput in predictions:
        print("Output:", ouput)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Enter the text to predict: "Wow, this is the best charger ever! It broke in two days.
Output: sarcasm
