# Transformer Baseline with Context and Target Grouping

This notebook implements a transformer-based baseline modelled on the winning solutions:
- Model: BERT-for-patents (domain-specific)
- Input: anchor[SEP]target[SEP]context[SEP]targets
- Validation: GroupKFold grouping by anchor
- Loss: negative Pearson correlation (minimizing it maximizes the correlation)
- Features: Target grouping (the "magic" from winning solutions)

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')

# Report which accelerator (if any) is visible to PyTorch
print(f"GPU available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    gpu_properties = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {gpu_properties.total_memory / 1e9:.1f} GB")

# Read the train and test CSVs
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('/home/data/train.csv', '/home/data/test.csv')
)

# Quick summary of what was loaded
print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
print(f"\nTrain columns: {train_df.columns.tolist()}")
score_counts = train_df['score'].value_counts().sort_index()
print(f"Score distribution:\n{score_counts}")

In [None]:
# Create target grouping - the "magic" feature from winning solutions
# Group targets by anchor and context

def create_target_groups(df, train_df_full, max_targets=10, sep='; '):
    """Build, for every row of `df`, a string of the other targets sharing its anchor and context.

    Args:
        df: Frame with 'anchor', 'target' and 'context' columns; one output
            string is produced per row.
        train_df_full: Frame with the same three columns that supplies the
            pool of targets for each (anchor, context) pair.
        max_targets: Maximum number of sibling targets kept per row, to avoid
            overly long sequences.
        sep: Separator placed between sibling targets.

    Returns:
        A Series aligned with `df.index`. Rows whose (anchor, context) pair is
        absent from `train_df_full` get an empty string.
    """
    # (anchor, context) -> list of targets, in the row order of train_df_full
    sibling_lookup = (
        train_df_full.groupby(['anchor', 'context'])['target'].apply(list).to_dict()
    )

    def get_targets(row):
        pool = sibling_lookup.get((row['anchor'], row['context']))
        if pool is None:
            return ''
        # Work on a copy so the shared lookup table is never mutated
        siblings = list(pool)
        # Drop one occurrence of the row's own target so it is not listed as its own sibling
        if row['target'] in siblings:
            siblings.remove(row['target'])
        return sep.join(siblings[:max_targets])

    if len(df) == 0:
        # Row-wise apply is not guaranteed to return a Series for an empty
        # frame, so return an explicitly empty one.
        return pd.Series([], index=df.index, dtype=object)

    return df.apply(get_targets, axis=1)

# Training rows: sibling targets are looked up within the training set itself
print("Creating target groups...")
train_df['targets'] = create_target_groups(df=train_df, train_df_full=train_df)

# Test rows: sibling targets are looked up in train and test stacked together
# (as mentioned in 2nd place solution: "During inference, the group is performed after concatenating train-set and test-set")
print("Creating target groups for test data...")
key_columns = ['anchor', 'target', 'context']
combined_df = pd.concat([train_df[key_columns], test_df], ignore_index=True)
test_df['targets'] = create_target_groups(df=test_df, train_df_full=combined_df)

print(f"\nExample of target grouping:")
preview_columns = ['anchor', 'target', 'context', 'targets']
print(train_df[preview_columns].head(3))

In [None]:
# CPC text descriptions for context
# Hard-coded mapping from context codes to text descriptions (nothing is read from disk here).
# In practice, you'd load this from the CPC documentation, but we'll create a simple version

# Sample CPC descriptions (simplified - in real solution you'd use full CPC hierarchy)
# Keys are three-character codes; get_cpc_text matches them against the first
# three characters of a context value and falls back to a generic string for
# codes that have no entry here.
cpc_descriptions = {
    'A01': 'Agriculture; forestry; animal husbandry; hunting; trapping; fishing',
    'A61': 'Medical or veterinary science; hygiene',
    'B01': 'Physical or chemical processes or apparatus in general',
    'B02': 'Crushing, pulverising, or disintegrating; preparing grain for milling',
    'B23': 'Machine tools; metal-working not otherwise provided for',
    'B29': 'Working of plastics; working of substances in a plastic state',
    'B60': 'Vehicles in general',
    'B65': 'Conveying; packing; storing; handling thin or filamentary material',
    'C01': 'Inorganic chemistry',
    'C07': 'Organic chemistry',
    'C08': 'Organic macromolecular compounds',
    'C12': 'Biochemistry; beer; spirits; wine; vinegar; microbiology; enzymology',
    'D01': 'Natural or artificial threads or fibres; spinning',
    'D06': 'Treatment of textiles or the like; laundering; flexible materials',
    'E02': 'Hydraulic engineering; foundations; soil-shifting',
    'E04': 'Building',
    'E05': 'Locks; keys; window or door fittings; safes',
    'E21': 'Earth drilling; mining',
    'F01': 'Machines or engines in general; engine plants in general',
    'F16': 'Engineering elements or units; general measures for producing',
    'F21': 'Lighting',
    'F23': 'Combustion apparatus; combustion processes',
    'F24': 'Heating; ranges; ventilating',
    'G01': 'Measuring; testing',
    'G02': 'Optics',
    'G03': 'Photography; cinematography; electrography; holography',
    'G04': 'Horology',
    'G05': 'Controlling; regulating',
    'G06': 'Computing; calculating; counting',
    'G07': 'Checking-devices',
    'G08': 'Signalling',
    'G09': 'Educating; cryptography; display; advertising; seals',
    'G10': 'Musical instruments; acoustics',
    'G11': 'Information storage',
    'G12': 'Instrument details',
    'G16': 'Information and communication technology [ICT] specially adapted',
    'G21': 'Nuclear physics; nuclear engineering',
    'H01': 'Basic electric elements',
    'H02': 'Generation, conversion, or distribution of electric power',
    'H03': 'Basic electronic circuitry',
    'H04': 'Electric communication technique',
    'H05': 'Electric techniques not otherwise provided for'
}

def get_cpc_text(context, descriptions=None):
    """Return the text description for a CPC context code.

    Args:
        context: CPC code string such as 'G06'. Only its first three
            characters (section letter + two-digit class) are used for lookup.
        descriptions: Optional mapping from three-character code to text.
            Defaults to the module-level `cpc_descriptions` table.

    Returns:
        The matching description, or 'Patent classification <context>' when
        the code has no entry or `context` is not a string (e.g. None or the
        NaN that pandas uses for a missing value).
    """
    table = cpc_descriptions if descriptions is None else descriptions
    fallback = f'Patent classification {context}'
    # Missing values arrive as None/NaN and cannot be sliced
    if not isinstance(context, str):
        return fallback
    # Extract first 3 characters (section + class)
    return table.get(context[:3], fallback)

# Attach the text description of each context code to both splits
for frame in (train_df, test_df):
    frame['context_text'] = frame['context'].apply(get_cpc_text)

print(f"\nContext text examples:")
context_preview = train_df[['context', 'context_text']].drop_duplicates()
print(context_preview.head(5))

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from torch.optim import AdamW
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')

# Report which accelerator (if any) is visible to PyTorch
print(f"GPU available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    gpu_properties = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {gpu_properties.total_memory / 1e9:.1f} GB")

# Read the train and test CSVs
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('/home/data/train.csv', '/home/data/test.csv')
)

# Quick summary of what was loaded
print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
print(f"\nTrain columns: {train_df.columns.tolist()}")
score_counts = train_df['score'].value_counts().sort_index()
print(f"Score distribution:\n{score_counts}")

GPU available: False
Train shape: (36473, 5)
Test shape: (36, 4)

Train columns: ['id', 'anchor', 'target', 'context', 'score']
Score distribution:
score
0.00     7471
0.25    11519
0.50    12300
0.75     4029
1.00     1154
Name: count, dtype: int64


In [None]:
# Define Dataset class
class PatentDataset(Dataset):
    """Torch dataset that tokenizes one text per item and pairs it with its score.

    `texts` and `scores` are indexed with the integer passed to `__getitem__`.
    `scores` may be None, in which case every item carries a placeholder
    score of 0.0.
    """

    def __init__(self, texts, scores, tokenizer, max_length=128):
        self.texts, self.scores = texts, scores
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        raw_text = str(self.texts[idx])
        if self.scores is None:
            label = 0.0
        else:
            label = float(self.scores[idx])

        # Ask the tokenizer to pad/truncate to max_length and return torch tensors
        encoded = self.tokenizer(
            raw_text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # Flatten each tensor to 1-D (presumably dropping a leading batch axis of size 1)
        item = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        item['score'] = torch.tensor(label, dtype=torch.float)
        return item

# Define Model class
class PatentModel(nn.Module):
    """Pretrained transformer backbone followed by dropout and a one-output linear head."""

    def __init__(self, model_name, dropout=0.1):
        super().__init__()
        self.backbone = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        # Single output per sequence, sized from the backbone's hidden width
        self.classifier = nn.Linear(self.backbone.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        encoded = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        # Representation at the first sequence position (the [CLS] token)
        cls_embedding = encoded.last_hidden_state[:, 0]
        head_output = self.classifier(self.dropout(cls_embedding))
        # Drop the trailing size-1 dimension so there is one value per sequence
        return head_output.squeeze(-1)

# Pearson correlation loss
def pearson_corr_loss(pred, target, eps=1e-8):
    """Negative Pearson correlation between `pred` and `target`.

    Minimizing the returned value maximizes the correlation.

    Args:
        pred: Tensor of predictions.
        target: Tensor of reference scores, same shape as `pred`.
        eps: Small constant added under each square root. Keeping it inside
            the root (rather than adding it to the denominator) keeps the
            gradient finite when all predictions in a batch are identical,
            which includes a batch of a single sample.

    Returns:
        A scalar tensor equal to minus the correlation. It is approximately 0
        when either input has no variance.
    """
    pred_centered = pred - pred.mean()
    target_centered = target - target.mean()

    covariance = (pred_centered * target_centered).sum()

    # sqrt has an infinite derivative at 0, so eps is added before taking the root
    pred_norm = torch.sqrt((pred_centered ** 2).sum() + eps)
    target_norm = torch.sqrt((target_centered ** 2).sum() + eps)

    correlation = covariance / (pred_norm * target_norm)

    # Return negative correlation (since we want to maximize it)
    return -correlation

# Progress message printed once the dataset, model and loss definitions above have been executed
print("Model components defined successfully!")

In [None]:
# Initialize tokenizer and model
MODEL_NAME = "anferico/bert-for-patents"  # Domain-specific model, smaller than DeBERTa-v3-large

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print(f"Tokenizer loaded: {MODEL_NAME}")


def build_input_text(df, sep_token):
    """Return one model-input string per row: anchor, target, context text and sibling targets.

    The four fields are joined with `sep_token`. `df` must already contain the
    'context_text' and 'targets' columns.
    """
    return (
        df['anchor'] + sep_token + df['target'] + sep_token
        + df['context_text'] + sep_token + df['targets']
    )


# The training loop reads train_df['input_text'], which nothing earlier in the
# notebook creates. The engineered columns may also be missing if the CSVs
# were reloaded after feature engineering, so each piece is (re)built only
# when it is absent.
if 'targets' not in train_df.columns:
    train_df['targets'] = create_target_groups(train_df, train_df)
if 'targets' not in test_df.columns:
    key_columns = ['anchor', 'target', 'context']
    pooled_rows = pd.concat([train_df[key_columns], test_df[key_columns]], ignore_index=True)
    test_df['targets'] = create_target_groups(test_df, pooled_rows)
for frame in (train_df, test_df):
    if 'context_text' not in frame.columns:
        frame['context_text'] = frame['context'].apply(get_cpc_text)
    if 'input_text' not in frame.columns:
        frame['input_text'] = build_input_text(frame, tokenizer.sep_token)

# Check max token length
# Count input_ids directly: the `length` field requested via return_length is
# not a plain int for every tokenizer implementation (NOTE(review): fast
# tokenizers appear to return a one-element list - confirm).
sample_texts = train_df['input_text'].head(1000)
max_tokens = max((len(tokenizer(text)['input_ids']) for text in sample_texts), default=0)
print(f"Max tokens in sample: {max_tokens}")
MAX_LENGTH = min(128, max_tokens + 10)  # Use 128 as in winning solutions
print(f"Using MAX_LENGTH: {MAX_LENGTH}")

In [None]:
# Prepare GroupKFold validation
gkf = GroupKFold(n_splits=5)
groups = train_df['anchor']  # Group by anchor to prevent leakage

# Resolve the device once instead of re-checking CUDA for every tensor
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

fold_scores = []  # one entry per fold: the best validation Pearson reached in that fold
all_predictions = np.zeros(len(train_df))  # out-of-fold predictions from each fold's best epoch


def _train_one_epoch(model, loader, optimizer, scheduler, device):
    """Run one optimisation pass over `loader` and return the mean batch loss."""
    model.train()
    batch_losses = []
    for batch in loader:
        scores = batch['score'].to(device)
        # Pearson correlation is undefined for fewer than two samples
        if scores.numel() < 2:
            continue
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = pearson_corr_loss(outputs, scores)

        loss.backward()
        optimizer.step()
        scheduler.step()

        batch_losses.append(loss.item())
    return np.mean(batch_losses) if batch_losses else float('nan')


def _predict(model, loader, device):
    """Return the model outputs for every row of `loader`, clipped to the valid range [0, 1]."""
    model.eval()
    predictions = []
    with torch.no_grad():
        for batch in loader:
            outputs = model(batch['input_ids'].to(device), batch['attention_mask'].to(device))
            predictions.extend(outputs.cpu().numpy())
    return np.clip(np.array(predictions), 0, 1)


print("Starting 5-fold GroupKFold validation...")

for fold, (train_idx, val_idx) in enumerate(gkf.split(train_df, groups=groups)):
    print(f"\n{'='*50}")
    print(f"Fold {fold + 1}/5")
    print(f"{'='*50}")

    # Split data
    train_fold = train_df.iloc[train_idx]
    val_fold = train_df.iloc[val_idx]

    print(f"Train size: {len(train_fold)}, Val size: {len(val_fold)}")

    # Create datasets
    train_dataset = PatentDataset(
        train_fold['input_text'].values,
        train_fold['score'].values,
        tokenizer,
        MAX_LENGTH
    )

    val_dataset = PatentDataset(
        val_fold['input_text'].values,
        val_fold['score'].values,
        tokenizer,
        MAX_LENGTH
    )

    # Create dataloaders
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=0)
    val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False, num_workers=0)

    # The validation loader is not shuffled, so its rows line up with val_fold
    val_targets = val_fold['score'].values

    # Initialize model
    model = PatentModel(MODEL_NAME).to(device)

    # Optimizer and scheduler
    optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
    num_epochs = 3  # Start with 3 epochs for baseline
    total_steps = len(train_loader) * num_epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=total_steps)

    # Training loop
    best_val_loss = float('inf')
    best_correlation = float('nan')
    best_predictions = None
    patience_counter = 0
    patience = 2

    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch + 1}/{num_epochs}")

        # Training
        avg_train_loss = _train_one_epoch(model, train_loader, optimizer, scheduler, device)
        print(f"Train Loss: {avg_train_loss:.4f}")

        # Validation
        val_predictions = _predict(model, val_loader, device)
        correlation = np.corrcoef(val_targets, val_predictions)[0, 1]
        print(f"Val Pearson Correlation: {correlation:.4f}")

        # Early stopping check
        val_loss = -correlation  # Our loss is negative correlation
        if val_loss < best_val_loss:
            # Keep only the best epoch's score and predictions for this fold
            best_val_loss = val_loss
            best_correlation = correlation
            best_predictions = val_predictions
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"Early stopping at epoch {epoch + 1}")
                break

    if best_predictions is None:
        # No epoch produced a comparable (non-NaN) correlation; keep the last
        # epoch's output so this fold is not left at its zero initialisation.
        best_predictions = val_predictions

    # Record exactly one score and one set of out-of-fold predictions per fold
    all_predictions[val_idx] = best_predictions
    fold_scores.append(best_correlation)
    print(f"Best Val Pearson for fold {fold + 1}: {best_correlation:.4f}")

    # Clear memory
    del model, optimizer, scheduler
    torch.cuda.empty_cache()

print(f"\n{'='*50}")
print(f"CV Results: {np.mean(fold_scores):.4f} ± {np.std(fold_scores):.4f}")
print(f"{'='*50}")

In [None]:
# Pearson correlation between the labels and the pooled fold predictions
score_vs_prediction = np.corrcoef(train_df['score'].values, all_predictions)
overall_correlation = score_vs_prediction[0, 1]
print(f"Overall Pearson Correlation: {overall_correlation:.4f}")

# Keep the predictions next to the labels for the per-context breakdown
train_df['predictions'] = all_predictions

print(f"\nPrediction distribution:")
print(pd.Series(all_predictions).describe())


def _context_correlation(group):
    """Pearson correlation of score vs prediction within one context group (NaN for a single row)."""
    if len(group) > 1:
        return np.corrcoef(group['score'], group['predictions'])[0, 1]
    return np.nan


print(f"\nScore vs Prediction correlation by context:")
correlations_by_context = train_df.groupby('context').apply(_context_correlation)
print(correlations_by_context.describe())