In [1]:
import pandas as pd
import re
import torch
from transformers import BertTokenizer, AlbertForSequenceClassification, AdamW, get_scheduler
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tqdm.auto import tqdm

In [2]:
# -----------------------------------
# 1. Data Loading and Preparation
# -----------------------------------

# Paths to the source and target domain datasets
source_train_path = '/kaggle/input/sentiment-analysis/DVD11.csv'  # Source domain (e.g., DVD reviews)
target_train_path = '/kaggle/input/sentiment-analysis/Books11.csv'  # Target domain (e.g., Book reviews)
target_test_path = '/kaggle/input/sentiment-analysis/Books11.csv'  # Assuming same file for simplicity; adjust as needed

# Load Source Domain Data
source_train_data = pd.read_csv(source_train_path, nrows=20000)
print(f"Source Training Data Shape: {source_train_data.shape}")

# Load Target Domain Training Data
target_train_data = pd.read_csv(target_train_path, nrows=20000)
print(f"Target Training Data Shape: {target_train_data.shape}")

# Load Target Domain Test Data
target_test_data = pd.read_csv(target_test_path, nrows=20000)
print(f"Target Test Data Shape: {target_test_data.shape}")

Source Training Data Shape: (12450, 2)
Target Training Data Shape: (20000, 2)
Target Test Data Shape: (20000, 2)


In [3]:

# -----------------------------------
# 2. Text Cleaning and Preprocessing
# -----------------------------------

def clean_text(text, domain):
    if isinstance(text, str):
        text = text.lower()
        text = re.sub(r'\s+', ' ', text)  # Replace multiple whitespace with single space
        text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
        
        # Domain-specific cleaning rules
        if domain == 'books':
            text = re.sub(r'\bthriller\b', 'genre_thriller', text)
            # Add more book-specific cleaning rules if necessary
        elif domain == 'dvd':
            text = re.sub(r'\baction\b', 'genre_action', text)
            # Add more DVD-specific cleaning rules if necessary
        
        return text
    return None

In [4]:
# Apply text cleaning to Source Domain
source_train_data['cleaned_review'] = source_train_data['review_body'].apply(lambda x: clean_text(x, 'dvd'))
source_train_data = source_train_data.dropna(subset=['cleaned_review'])
print(f"Source Training Data after Cleaning: {source_train_data.shape}")

# Apply text cleaning to Target Domain Training Data
target_train_data['cleaned_review'] = target_train_data['review_body'].apply(lambda x: clean_text(x, 'books'))
target_train_data = target_train_data.dropna(subset=['cleaned_review'])
print(f"Target Training Data after Cleaning: {target_train_data.shape}")

# Apply text cleaning to Target Domain Test Data
target_test_data['cleaned_review'] = target_test_data['review_body'].apply(lambda x: clean_text(x, 'books'))
target_test_data = target_test_data.dropna(subset=['cleaned_review'])
print(f"Target Test Data after Cleaning: {target_test_data.shape}")

Source Training Data after Cleaning: (12446, 3)
Target Training Data after Cleaning: (20000, 3)
Target Test Data after Cleaning: (20000, 3)


In [5]:
# -----------------------------------
# 3. Tokenization
# -----------------------------------

# Initialize BERT tokenizer
bert_tokenizer = BertTokenizer.from_pretrained('bert-large-cased')

# Define domain-specific tokens to better capture unique terms
special_tokens = ['genre_thriller', 'genre_action']
num_added_tokens = bert_tokenizer.add_tokens(special_tokens)
print(f"Added {num_added_tokens} special tokens to the tokenizer.")

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

Added 2 special tokens to the tokenizer.




In [6]:
# Define tokenization function
max_length = 128  # Maximum sequence length

def encode_review_bert(text):
    return bert_tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

In [7]:
# -----------------------------------
# 4. Model Initialization
# -----------------------------------

print("Initializing ALBERT Model for Sequence Classification...")
# Load pre-trained ALBERT configuration
config = AlbertForSequenceClassification.from_pretrained('albert-base-v2').config
config.num_labels = 2  # Adjust based on your classification task

# Initialize ALBERT model with the updated configuration
albert_model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', config=config)

# Resize model embeddings to accommodate new tokens
albert_model.resize_token_embeddings(len(bert_tokenizer))
print(f"Resized token embeddings to {len(bert_tokenizer)} tokens.")

Initializing ALBERT Model for Sequence Classification...


config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Resized token embeddings to 28998 tokens.


In [8]:
# Move the model to GPU if available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
albert_model.to(device)

AlbertForSequenceClassification(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(28998, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768,

In [9]:
# -----------------------------------
# 5. Creating Datasets and DataLoaders
# -----------------------------------

def prepare_dataset(data, tokenizer):
    input_ids = []
    attention_masks = []
    
    for review in data['cleaned_review']:
        encoded_review = encode_review_bert(review)
        input_ids.append(encoded_review['input_ids'])
        attention_masks.append(encoded_review['attention_mask'])
    
    # Convert lists to tensors
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(data['star_rating'].values)
    
    return input_ids, attention_masks, labels

In [10]:
# Prepare Source Domain Tensors
source_input_ids, source_attention_masks, source_labels = prepare_dataset(source_train_data, bert_tokenizer)

# Prepare Target Domain Training Tensors
target_train_input_ids, target_train_attention_masks, target_train_labels = prepare_dataset(target_train_data, bert_tokenizer)

# Prepare Target Domain Test Tensors
target_test_input_ids, target_test_attention_masks, target_test_labels = prepare_dataset(target_test_data, bert_tokenizer)

In [11]:
# Split Source Data into Training and Validation
source_train_inputs, source_val_inputs, source_train_masks, source_val_masks, source_train_labels, source_val_labels = train_test_split(
    source_input_ids, source_attention_masks, source_labels, test_size=0.1, random_state=42
)

# Split Target Data into Training and Validation
target_train_inputs, target_val_inputs, target_train_masks, target_val_masks, target_train_labels, target_val_labels = train_test_split(
    target_train_input_ids, target_train_attention_masks, target_train_labels, test_size=0.1, random_state=42
)

In [12]:
# Create DataLoaders for Source Domain
batch_size = 16

source_train_dataset = TensorDataset(source_train_inputs, source_train_masks, source_train_labels)
source_train_dataloader = DataLoader(source_train_dataset, batch_size=batch_size, shuffle=True)

source_val_dataset = TensorDataset(source_val_inputs, source_val_masks, source_val_labels)
source_val_dataloader = DataLoader(source_val_dataset, batch_size=batch_size)

In [13]:
# Create DataLoaders for Target Domain
target_train_dataset = TensorDataset(target_train_inputs, target_train_masks, target_train_labels)
target_train_dataloader = DataLoader(target_train_dataset, batch_size=batch_size, shuffle=True)

target_val_dataset = TensorDataset(target_val_inputs, target_val_masks, target_val_labels)
target_val_dataloader = DataLoader(target_val_dataset, batch_size=batch_size)

In [14]:
# Create DataLoader for Target Test Set
target_test_dataset = TensorDataset(target_test_input_ids, target_test_attention_masks, target_test_labels)
target_test_dataloader = DataLoader(target_test_dataset, batch_size=batch_size)

In [15]:
# -----------------------------------
# 6. Training Loop
# -----------------------------------

# Define optimizer
optimizer = AdamW(albert_model.parameters(), lr=2e-5)

# Calculate total training steps for both source and target domains
epochs = 5
total_steps = epochs * (len(source_train_dataloader) + len(target_train_dataloader))



In [16]:
# Define scheduler
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

In [17]:
def evaluate(dataloader, domain_name):
    albert_model.eval()
    all_preds = []
    all_labels = []
    
    for batch in dataloader:
        batch_input_ids, batch_masks, batch_labels = [b.to(device) for b in batch]
        with torch.no_grad():
            outputs = albert_model(batch_input_ids, attention_mask=batch_masks)
        
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1).cpu().numpy()
        labels = batch_labels.cpu().numpy()
        
        all_preds.extend(preds)
        all_labels.extend(labels)
    
    # Calculate metrics
    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, average='binary')
    recall = recall_score(all_labels, all_preds, average='binary')
    f1 = f1_score(all_labels, all_preds, average='binary')
    
    print(f"{domain_name} Domain - Accuracy: {accuracy:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}, F1-score: {f1:.2f}")


In [18]:
# Initialize progress bar
progress_bar = tqdm(range(total_steps))

# Training loop with Domain Adaptation
for epoch in range(epochs):
    albert_model.train()
    
    # Create iterators for source and target DataLoaders
    source_iter = iter(source_train_dataloader)
    target_iter = iter(target_train_dataloader)
    
    # Determine the number of steps based on the larger DataLoader
    steps = max(len(source_train_dataloader), len(target_train_dataloader))
    
    for _ in range(steps):
        loss_src, loss_tgt = None, None
        
        # Training on Source Domain
        try:
            src_batch = next(source_iter)
            src_input_ids, src_masks, src_labels = [b.to(device) for b in src_batch]
            
            # Forward pass
            outputs = albert_model(src_input_ids, attention_mask=src_masks, labels=src_labels)
            loss_src = outputs.loss
            loss_src.backward()
        except StopIteration:
            pass  # No more batches in source DataLoader
        
        # Training on Target Domain
        try:
            tgt_batch = next(target_iter)
            tgt_input_ids, tgt_masks, tgt_labels = [b.to(device) for b in tgt_batch]
            
            # Forward pass
            outputs = albert_model(tgt_input_ids, attention_mask=tgt_masks, labels=tgt_labels)
            loss_tgt = outputs.loss
            loss_tgt.backward()
        except StopIteration:
            pass  # No more batches in target DataLoader
        
        # Optional: Gradient clipping to prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(albert_model.parameters(), max_norm=1.0)
        
        # Optimize
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
    
    # Print the loss from the last step of the epoch
    print(f"\nEpoch {epoch + 1}/{epochs} completed.")
    if loss_src is not None:
        print(f"Source Loss: {loss_src.item()}")
    if loss_tgt is not None:
        print(f"Target Loss: {loss_tgt.item()}")
    
    # Optional: Evaluate after each epoch
    evaluate(source_val_dataloader, "Source")
    evaluate(target_val_dataloader, "Target")

  0%|          | 0/9130 [00:00<?, ?it/s]


Epoch 1/5 completed.
Target Loss: 0.4113863706588745
Source Domain - Accuracy: 0.88, Precision: 0.88, Recall: 1.00, F1-score: 0.94
Target Domain - Accuracy: 0.88, Precision: 0.88, Recall: 1.00, F1-score: 0.94

Epoch 2/5 completed.
Target Loss: 0.44094905257225037
Source Domain - Accuracy: 0.89, Precision: 0.90, Recall: 0.98, F1-score: 0.94
Target Domain - Accuracy: 0.88, Precision: 0.89, Recall: 0.99, F1-score: 0.94

Epoch 3/5 completed.
Target Loss: 0.3363320827484131
Source Domain - Accuracy: 0.91, Precision: 0.93, Recall: 0.97, F1-score: 0.95
Target Domain - Accuracy: 0.90, Precision: 0.91, Recall: 0.99, F1-score: 0.95

Epoch 4/5 completed.
Target Loss: 0.17345066368579865
Source Domain - Accuracy: 0.91, Precision: 0.92, Recall: 0.98, F1-score: 0.95
Target Domain - Accuracy: 0.90, Precision: 0.90, Recall: 0.99, F1-score: 0.94

Epoch 5/5 completed.
Target Loss: 0.21406511962413788
Source Domain - Accuracy: 0.91, Precision: 0.94, Recall: 0.95, F1-score: 0.95
Target Domain - Accuracy:

In [19]:
# -----------------------------------
# 7. Evaluation on Target Test Set
# -----------------------------------

evaluate(target_test_dataloader, "Target Test")

# -----------------------------------
# 8. Saving the Model
# -----------------------------------

# Define save directory
save_directory = './fine_tuned_albert_sentiment_model_domain_adapted'

# Save the model
albert_model.save_pretrained(save_directory)

# Save the tokenizer
bert_tokenizer.save_pretrained(save_directory)

print(f"Model and tokenizer saved to {save_directory}")

Target Test Domain - Accuracy: 0.94, Precision: 0.97, Recall: 0.96, F1-score: 0.97
Model and tokenizer saved to ./fine_tuned_albert_sentiment_model_domain_adapted
