In [None]:
# Instalasi library yang diperlukan dari Hugging Face
!pip install -q transformers ftfy regex accelerate
!pip install --upgrade -q transformers

# Impor library standar dan dari Hugging Face
import os
import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from transformers import BertTokenizer, ViTImageProcessor, ViTModel, BertModel
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
import torch.nn as nn
import torch.nn.functional as F
import ast

# Prefer the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

In [None]:
# Path to the dataset directory on Kaggle; the text splits and the image
# folder used below are resolved relative to this directory
data_dir = "/kaggle/input/data-of-multimodal-sarcasm-detection"

# Load one dataset split from a .txt file
def load_data_from_txt(filepath):
    """Parse a split file into a DataFrame with columns ``id``, ``text``, ``sarcasm``.

    Every non-blank line is expected to be a Python list literal of the form
    ``[tweet_id, text, label]``. A line is skipped when it is not a valid
    literal, is not indexable, has fewer than three fields, or carries a
    label that cannot be converted to ``int``. The number of skipped lines
    is printed so that dropped data does not go unnoticed.

    Args:
        filepath: Path to the UTF-8 encoded text file.

    Returns:
        pandas.DataFrame with one row per valid line. The three columns are
        always present, even when no line could be parsed.
    """
    columns = ['id', 'text', 'sarcasm']
    records = []
    skipped = 0
    with open(filepath, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                # Blank lines carry no record and are not counted as malformed
                continue
            try:
                # ast.literal_eval turns the textual list into a real list
                # without executing arbitrary code
                data_list = ast.literal_eval(line)
                records.append({
                    'id': data_list[0],
                    'text': data_list[1],
                    'sarcasm': int(data_list[2]),
                })
            except (ValueError, SyntaxError, TypeError, LookupError):
                # TypeError: the literal is not indexable or the label is None
                # LookupError: fewer than three fields (IndexError/KeyError)
                skipped += 1
    if skipped:
        print(f"load_data_from_txt: skipped {skipped} malformed line(s) in {filepath}")
    # Passing the column list keeps the schema stable for an empty result
    return pd.DataFrame(records, columns=columns)

# Few-shot configuration: number of training examples drawn per class and
# the seed that makes the draw reproducible
N_SHOT = 16
SEED = 42

# Folder that holds one "<tweet id>.jpg" image per example
image_folder = os.path.join(data_dir, 'dataset_image')


def _attach_existing_images(df, image_folder):
    """Return a new frame with an ``image_path`` column, keeping only usable rows.

    A row is kept when its image file exists on disk and it has no missing
    values. The ``sarcasm`` column is cast to ``int``. The input frame is not
    modified.
    """
    with_paths = df.assign(
        image_path=df['id'].apply(lambda x: os.path.join(image_folder, f"{x}.jpg"))
    )
    # astype(bool) keeps the mask a boolean indexer even for an empty frame
    has_image = with_paths['image_path'].apply(os.path.exists).astype(bool)
    return (
        with_paths[has_image]
        .dropna()
        .assign(sarcasm=lambda frame: frame['sarcasm'].astype(int))
    )


# Load each split and keep only the rows whose image is available
train_df = _attach_existing_images(
    load_data_from_txt(os.path.join(data_dir, 'text', 'train.txt')), image_folder
)
val_df = _attach_existing_images(
    load_data_from_txt(os.path.join(data_dir, 'text', 'valid2.txt')), image_folder
)
test_df = _attach_existing_images(
    load_data_from_txt(os.path.join(data_dir, 'text', 'test2.txt')), image_folder
)

# Fail early with a readable message when a class cannot supply N_SHOT rows
# (typically a wrong data_dir or a missing image folder)
class_counts = train_df['sarcasm'].value_counts()
for class_label in (0, 1):
    available = int(class_counts.get(class_label, 0))
    if available < N_SHOT:
        raise ValueError(
            f"Need at least {N_SHOT} training rows with sarcasm == {class_label}, "
            f"found {available}. Check data_dir and the image folder."
        )

# Build the balanced 16-shot training set: N_SHOT examples per class
sarcastic_samples = train_df[train_df['sarcasm'] == 1].sample(n=N_SHOT, random_state=SEED)
non_sarcastic_samples = train_df[train_df['sarcasm'] == 0].sample(n=N_SHOT, random_state=SEED)
train_df_16shot = pd.concat([sarcastic_samples, non_sarcastic_samples])

print(f"Ukuran data latih (16-shot): {len(train_df_16shot)}")
print(f"Ukuran data validasi: {len(val_df)}")
print(f"Ukuran data uji: {len(test_df)}")

train_df_16shot.head()

In [None]:
class SarcasmViTBertDataset(Dataset):
    """Dataset that pairs a tokenized tweet with its processed image and label.

    Args:
        dataframe: Frame with the columns ``text``, ``image_path`` and
            ``sarcasm``; rows are addressed by position, not by index label.
        tokenizer: Callable invoked as ``tokenizer(text, padding=...,
            truncation=..., max_length=..., return_tensors="pt")`` whose
            result exposes ``input_ids`` and ``attention_mask``.
        image_processor: Callable invoked as ``image_processor(images=...,
            return_tensors="pt")`` whose result exposes ``pixel_values``.
        max_length: Number of tokens every text is padded or truncated to.
            Defaults to 77, the value that was previously hard-coded.
    """

    def __init__(self, dataframe, tokenizer, image_processor, max_length=77):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.image_processor = image_processor
        # Plain lists give positional access that is independent of the
        # frame's index labels
        self.texts = dataframe['text'].tolist()
        self.image_paths = dataframe['image_path'].tolist()
        self.labels = dataframe['sarcasm'].tolist()
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one example as a dict of tensors.

        Keys: ``input_ids``, ``attention_mask``, ``pixel_values`` (each with
        the leading batch dimension of size one removed) and ``labels``
        (a scalar ``torch.long`` tensor).
        """
        text = self.texts[idx]
        image_path = self.image_paths[idx]
        label = torch.tensor(self.labels[idx], dtype=torch.long)

        # Tokenize the text to a fixed length
        tokenized_text = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        # Open the image inside a context manager so the file handle is
        # closed even if decoding fails; convert() hands back a separate
        # image object that stays usable after the handle is released
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")

        processed_image = self.image_processor(
            images=image,
            return_tensors="pt"
        )

        return {
            'input_ids': tokenized_text['input_ids'].squeeze(0),
            'attention_mask': tokenized_text['attention_mask'].squeeze(0),
            'pixel_values': processed_image['pixel_values'].squeeze(0),
            'labels': label
        }

In [None]:
class ViTBertSarcasmModel(nn.Module):
    """Late-fusion sarcasm classifier built on a ViT image encoder and a BERT text encoder.

    The image feature (first token of ViT's last hidden state) and the text
    feature (BERT's pooler output) are concatenated and passed through a
    small MLP that outputs two logits: non-sarcastic and sarcastic.

    Args:
        vit_model_name: Checkpoint name passed to ``ViTModel.from_pretrained``.
        bert_model_name: Checkpoint name passed to ``BertModel.from_pretrained``.
        fine_tune_pretrained: When ``False`` (default) both encoders are
            frozen and only the classifier head is trained.
    """

    def __init__(self, vit_model_name="google/vit-base-patch16-224-in21k", bert_model_name="bert-base-uncased", fine_tune_pretrained=False):
        super().__init__()
        # Remembered so that train() knows whether the encoders are frozen
        self.fine_tune_pretrained = fine_tune_pretrained

        # 1. Vision encoder (ViT)
        self.vit = ViTModel.from_pretrained(vit_model_name)
        # 2. Text encoder (BERT)
        self.bert = BertModel.from_pretrained(bert_model_name)

        # Freeze the pre-trained encoders unless fine-tuning was requested
        if not fine_tune_pretrained:
            for param in self.vit.parameters():
                param.requires_grad = False
            for param in self.bert.parameters():
                param.requires_grad = False

        # 3. Classification head over the concatenated features; the input
        # width is the sum of the two encoders' hidden sizes
        fusion_dim = self.vit.config.hidden_size + self.bert.config.hidden_size
        self.classifier = nn.Sequential(
            nn.Linear(fusion_dim, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 2)  # two output classes: non-sarcastic, sarcastic
        )

    def train(self, mode=True):
        """Set the training mode while keeping frozen encoders in eval mode.

        ``nn.Module.train`` switches every submodule recursively, which
        would enable dropout inside the frozen encoders and feed the
        classifier noisier features during training than it receives at
        evaluation time. When the encoders are frozen they are therefore
        put back into eval mode.

        Returns:
            The module itself, as ``nn.Module.train`` does.
        """
        super().train(mode)
        if not self.fine_tune_pretrained:
            self.vit.eval()
            self.bert.eval()
        return self

    def forward(self, input_ids, attention_mask, pixel_values):
        """Return the class logits for a batch.

        Args:
            input_ids: Token ids passed to BERT.
            attention_mask: Attention mask passed to BERT.
            pixel_values: Processed images passed to ViT.

        Returns:
            Tensor with two logits per example.
        """
        # Image branch: take the first token of the last hidden state
        vision_outputs = self.vit(pixel_values=pixel_values)
        image_features = vision_outputs.last_hidden_state[:, 0, :]

        # Text branch: use BERT's pooler output
        text_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        text_features = text_outputs.pooler_output

        # Concatenate the two modalities along the feature dimension
        combined_features = torch.cat((image_features, text_features), dim=1)

        # Classify the fused representation
        logits = self.classifier(combined_features)
        return logits

In [None]:
# Names of the pre-trained checkpoints to use
VIT_MODEL = 'google/vit-base-patch16-224-in21k'
BERT_MODEL = 'bert-base-uncased'

# Seed PyTorch so that the classifier initialisation and the shuffling of
# the training loader are the same on every run
torch.manual_seed(42)

# Tokenizer and image processor that match the checkpoints
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL)
image_processor = ViTImageProcessor.from_pretrained(VIT_MODEL)

# Build the model and move it to the selected device
model = ViTBertSarcasmModel(vit_model_name=VIT_MODEL, bert_model_name=BERT_MODEL).to(device)

# Dataset instances for each split
train_dataset = SarcasmViTBertDataset(train_df_16shot, tokenizer, image_processor)
val_dataset = SarcasmViTBertDataset(val_df, tokenizer, image_processor)
test_dataset = SarcasmViTBertDataset(test_df, tokenizer, image_processor)

# Data loaders; only the training loader shuffles
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)
test_loader = DataLoader(test_dataset, batch_size=32)

# Optimizer over the trainable parameters only: with frozen encoders this is
# just the classifier head, so no optimizer bookkeeping is kept for weights
# that never receive a gradient
trainable_params = [param for param in model.parameters() if param.requires_grad]
optimizer = AdamW(trainable_params, lr=5e-5)
criterion = nn.CrossEntropyLoss()

print("Inisialisasi selesai. Siap untuk melatih model.")

In [None]:
# Cell 6: training and validation loop
from sklearn.metrics import accuracy_score, f1_score

num_epochs = 100  # number of passes over the training loader; adjust as needed

# Keys of a batch dict, in the order they are unpacked below
BATCH_KEYS = ('input_ids', 'attention_mask', 'pixel_values', 'labels')

for epoch in range(num_epochs):
    # --- Training pass ---
    model.train()
    total_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")

    for batch in progress_bar:
        # Move every tensor of the batch to the target device
        input_ids, attention_mask, pixel_values, labels = (
            batch[key].to(device) for key in BATCH_KEYS
        )

        # Clear stale gradients, then run forward, backward and the update
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask, pixel_values)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        total_loss += batch_loss
        progress_bar.set_postfix({'loss': batch_loss})

    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} | Average Training Loss: {avg_train_loss:.4f}")

    # --- Validation pass ---
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, pixel_values, labels = (
                batch[key].to(device) for key in BATCH_KEYS
            )

            outputs = model(input_ids, attention_mask, pixel_values)
            # The predicted class is the index of the larger logit
            preds = outputs.argmax(dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    acc = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    print(f"Validation Accuracy: {acc:.4f} | Validation F1-Score: {f1:.4f}\n")

In [None]:
# Cell 7: final evaluation on the test set, then save the results and weights
print("\n--- EVALUASI AKHIR PADA TEST SET ---")
model.eval()
all_test_preds, all_test_labels = [], []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Testing on Test Set"):
        # Move every tensor of the batch to the target device
        input_ids, attention_mask, pixel_values, labels = (
            batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'pixel_values', 'labels')
        )

        outputs = model(input_ids, attention_mask, pixel_values)
        # The predicted class is the index of the larger logit
        preds = outputs.argmax(dim=1)

        all_test_preds.extend(preds.cpu().numpy())
        all_test_labels.extend(labels.cpu().numpy())

final_acc = accuracy_score(all_test_labels, all_test_preds)
final_f1 = f1_score(all_test_labels, all_test_preds)

print(f"\nFinal Test Accuracy (ACC): {final_acc:.4f}")
print(f"Final Test F1-Score: {final_f1:.4f}")

# Write per-example predictions to CSV; the two aggregate metrics are
# repeated on every row
csv_filename = 'test_results_vit_bert.csv'
results_df = test_df.copy().assign(
    predicted_sarcasm=all_test_preds,
    true_sarcasm=all_test_labels,
    final_accuracy=final_acc,
    final_f1_score=final_f1,
)
results_df.to_csv(csv_filename, index=False)
print(f"\nHasil tes berhasil disimpan ke file: {csv_filename}")

# Save the model weights (state dict only)
model_filename = 'vit_bert_sarcasm_model.pth'
torch.save(model.state_dict(), model_filename)
print(f"Model berhasil disimpan ke file: {model_filename}")