In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
root = "/content/drive/MyDrive/processed_acl"

In [3]:
import os
for domain in ["books", "dvd", "electronics", "kitchen"]:
    print(domain, "→", os.listdir(os.path.join(root, domain)))

books → ['negative.review', 'unlabeled.review', 'positive.review']
dvd → ['negative.review', 'unlabeled.review', 'positive.review']
electronics → ['negative.review', 'unlabeled.review', 'positive.review']
kitchen → ['negative.review', 'unlabeled.review', 'positive.review']


In [6]:
import pandas as pd
def load(domain):
    texts, labels = [], []
    domain_path = os.path.join(root, domain)
    with open(os.path.join(domain_path, "positive.review"), encoding="latin-1") as f:
        for line in f:
            texts.append(line.strip())
            labels.append(1)
    with open(os.path.join(domain_path, "negative.review"), encoding="latin-1") as f:
        for line in f:
            texts.append(line.strip())
            labels.append(0)
    return pd.DataFrame({
        "text": texts,
        "label": labels
    })

In [7]:
books_df = load("books")
print(books_df.shape)
books_df.head()

(2000, 2)


Unnamed: 0,text,label
0,holes:1 must:1 top_secret:1 he:1 center:1 othe...,1
1,i_think:1 dr_dean:1 reason:1 oz:2 medicine_whi...,1
2,woman_the:1 contains_the:1 fan_i:1 alex_ross(s...,1
3,hurricane:1 these_pages:1 lost_innocence:1 bot...,1
4,while:1 commented:1 the_rise:2 if:2 strong_emp...,1


In [8]:
!pip install transformers torch scikit-learn pandas tqdm



In [9]:
import torch
import pandas as pd
import numpy as np
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import (
    BertTokenizer, BertModel,
    XLNetTokenizer, XLNetModel
)
from sklearn.metrics import accuracy_score

In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

Device: cuda


In [11]:
def preprocess(df):
    df = df.copy()
    df["text"] = df["text"].astype(str).str.strip()
    df = df[df["text"] != ""]
    return df

In [12]:
books_df = preprocess(load("books"))
dvd_df = preprocess(load("dvd"))
electronics_df = preprocess(load("electronics"))
kitchen_df = preprocess(load("kitchen"))
print(books_df.shape)
print(dvd_df.shape)
print(electronics_df.shape)
print(kitchen_df.shape)

(2000, 2)
(2000, 2)
(2000, 2)
(2000, 2)


In [13]:
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
xlnet_tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
MAX_LEN = 256

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

In [14]:
class Dataset(Dataset):
    def __init__(self, texts, labels, tokenizer):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
    def __len__(self):
        return len(self.texts)
    def __getitem__(self, index):
        enc = self.tokenizer(
            self.texts[index],
            truncation=True,
            padding="max_length",
            max_length=MAX_LEN,
            return_tensors="pt"
        )
        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "label": torch.tensor(self.labels[index], dtype=torch.long)
        }


In [15]:
class BertClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-cased")
        self.classifier = nn.Linear(self.bert.config.hidden_size, 2)
    def forward(self, input_ids, attention_mask):
        out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls = out.last_hidden_state[:, 0]
        return self.classifier(cls)

In [16]:
class XLNetClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.xlnet = XLNetModel.from_pretrained("xlnet-base-cased")
        self.classifier = nn.Linear(self.xlnet.config.hidden_size, 2)
    def forward(self, input_ids, attention_mask):
        out = self.xlnet(input_ids=input_ids, attention_mask=attention_mask)
        cls = out.last_hidden_state[:, -1]
        return self.classifier(cls)

In [17]:
def train(model, loader, optimizer, loss_fn, device):
    model.train()
    total_loss = 0
    for batch in loader:
        optimizer.zero_grad()
        outputs = model(
            batch["input_ids"].to(device),
            batch["attention_mask"].to(device)
        )
        loss = loss_fn(outputs, batch["label"].to(device))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

In [18]:
from sklearn.metrics import accuracy_score
def evaluate(model, loader, device):
    model.eval()
    preds, labels = [], []
    with torch.no_grad():
        for batch in loader:
            outputs = model(
                batch["input_ids"].to(device),
                batch["attention_mask"].to(device)
            )
            preds.extend(outputs.argmax(1).cpu().numpy())
            labels.extend(batch["label"].numpy())
    return accuracy_score(labels, preds)

In [19]:
from torch.utils.data import DataLoader
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
domains = {
    "books": books_df,
    "dvd": dvd_df,
    "electronics": electronics_df,
    "kitchen": kitchen_df
}
EPOCHS = 3
BATCH_SIZE = 16
results = []

In [20]:
for src_name, src_df in domains.items():
    for tgt_name, tgt_df in domains.items():
        if src_name == tgt_name:
            continue
        print(f"\nSOURCE: {src_name} → TARGET: {tgt_name}")
        train_ds = Dataset(
            src_df["text"].tolist(),
            src_df["label"].tolist(),
            bert_tokenizer
        )
        test_ds = Dataset(
            tgt_df["text"].tolist(),
            tgt_df["label"].tolist(),
            bert_tokenizer
        )
        train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
        test_dl = DataLoader(test_ds, batch_size=BATCH_SIZE)
        bert_model = BertClassifier().to(device)
        optimizer = torch.optim.AdamW(bert_model.parameters(), lr=2e-5)
        loss_fn = nn.CrossEntropyLoss()
        for _ in range(EPOCHS):
            train(bert_model, train_dl, optimizer, loss_fn, device)
        bert_acc = evaluate(bert_model, test_dl, device)
        train_ds = Dataset(
            src_df["text"].tolist(),
            src_df["label"].tolist(),
            xlnet_tokenizer
        )
        test_ds = Dataset(
            tgt_df["text"].tolist(),
            tgt_df["label"].tolist(),
            xlnet_tokenizer
        )
        train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
        test_dl = DataLoader(test_ds, batch_size=BATCH_SIZE)
        xlnet_model = XLNetClassifier().to(device)
        optimizer = torch.optim.AdamW(xlnet_model.parameters(), lr=2e-5)
        for _ in range(EPOCHS):
            train(xlnet_model, train_dl, optimizer, loss_fn, device)
        xlnet_acc = evaluate(xlnet_model, test_dl, device)
        results.append([
            src_name, tgt_name,
            round(bert_acc * 100, 2),
            round(xlnet_acc * 100, 2)
        ])


SOURCE: books → TARGET: dvd


model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/467M [00:00<?, ?B/s]


SOURCE: books → TARGET: electronics

SOURCE: books → TARGET: kitchen

SOURCE: dvd → TARGET: books

SOURCE: dvd → TARGET: electronics

SOURCE: dvd → TARGET: kitchen

SOURCE: electronics → TARGET: books

SOURCE: electronics → TARGET: dvd

SOURCE: electronics → TARGET: kitchen

SOURCE: kitchen → TARGET: books

SOURCE: kitchen → TARGET: dvd

SOURCE: kitchen → TARGET: electronics


In [21]:
results_df = pd.DataFrame(
    results,
    columns=["Source", "Target", "BERT (%)", "XLNet (%)"]
)
results_df

Unnamed: 0,Source,Target,BERT (%),XLNet (%)
0,books,dvd,80.45,50.0
1,books,electronics,78.35,79.85
2,books,kitchen,80.8,78.85
3,dvd,books,76.6,79.7
4,dvd,electronics,72.55,50.0
5,dvd,kitchen,74.35,82.45
6,electronics,books,75.0,75.85
7,electronics,dvd,76.65,50.0
8,electronics,kitchen,87.35,87.0
9,kitchen,books,73.95,71.3


In [22]:
results_df[["BERT (%)", "XLNet (%)"]].mean()

Unnamed: 0,0
BERT (%),78.033333
XLNet (%),67.083333
