In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file available under the Kaggle input mount.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Integer class id for each author code found in the training CSV.
author_map = dict(EAP=0, HPL=1, MWS=2)

# Read the training set and replace the author codes with their integer ids.
# Author values that are not keys of `author_map` become NaN.
dataset_path = '/kaggle/input/identify-the-author/train/train.csv'
df = pd.read_csv(dataset_path).assign(
    author=lambda frame: frame['author'].map(author_map)
)
df

/kaggle/input/identify-the-author/sample_submission/sample_submission.csv
/kaggle/input/identify-the-author/test/test.csv
/kaggle/input/identify-the-author/train/train.csv
/kaggle/input/roberta-tf-1000/pytorch/default/1/best_model-r-1000-lower-reg.pt
/kaggle/input/model1000/best_model.pt


Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",0
1,id17569,It never once occurred to me that the fumbling...,1
2,id11008,"In his left hand was a gold snuff box, from wh...",0
3,id27763,How lovely is spring As we looked from Windsor...,2
4,id12958,"Finding nothing else, not even gold, the Super...",1
...,...,...,...
19574,id17718,"I could have fancied, while I looked at it, th...",0
19575,id08973,The lids clenched themselves together as if in...,0
19576,id05267,"Mais il faut agir that is to say, a Frenchman ...",0
19577,id17513,"For an item of news like this, it strikes us i...",0


In [4]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, log_loss
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from transformers import RobertaTokenizer, RobertaModel, RobertaConfig

2025-07-21 16:01:59.814797: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753113719.838813     281 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753113719.845633     281 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [5]:
# `df['author']` already holds the integer class ids assigned through `author_map`,
# so the label ids are taken from that mapping directly. Numbering the classes by
# order of first appearance (`df['author'].unique()`) only matches `author_map`
# when the CSV rows happen to introduce the authors in that order; any other order
# would silently misalign the labels with the EAP/HPL/MWS probability columns
# written to the submission.
label_map = {code: code for code in sorted(author_map.values())}
df['label'] = df['author'].map(label_map)

# Fail fast if any row carries an author that `author_map` does not cover.
assert df['label'].notna().all(), "found rows whose author is missing from author_map"

texts = df['text'].tolist()
labels = df['label'].tolist()

In [6]:
# Hold out 20% of the samples for evaluation, stratified by label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=22,
)

# Character n-gram TF-IDF ('char_wb' analyzer, 3- to 5-grams, at most 1000 features).
# The vectorizer is fitted on the training split only and then applied to the
# held-out split.
vectorizer = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(3, 5),
    max_features=1000,
)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# No dimensionality reduction is applied here: a TruncatedSVD step (100 components)
# was sketched for this cell but is not executed.


In [7]:
# Load the tokenizer published with the 'roberta-base' checkpoint, the same
# checkpoint name that HybridClassifier loads its transformer from.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

def tokenize(texts, tokenizer, max_length=256):
    """Tokenize `texts` with `tokenizer` and return whatever the tokenizer returns.

    The tokenizer is called with truncation and padding enabled, the given
    `max_length`, and `return_tensors='pt'`.

    Args:
        texts: the input passed to the tokenizer as its first positional argument.
        tokenizer: callable accepting the texts plus the keyword options above.
        max_length: value forwarded as the tokenizer's `max_length` option.

    Returns:
        The tokenizer's return value, unchanged.
    """
    tokenizer_options = dict(
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors='pt',
    )
    return tokenizer(texts, **tokenizer_options)

# Tokenize the training and held-out splits with the helper's default settings.
train_encodings, test_encodings = (
    tokenize(split_texts, tokenizer) for split_texts in (X_train, X_test)
)


In [8]:
class HybridDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenizer output with TF-IDF features, labels and raw text."""

    def __init__(self, encodings, tfidf_dense, labels, texts):
        """Keep the inputs, converting the features and labels to tensors.

        Args:
            encodings: mapping of field name to an indexable value; only
                `.items()` and per-field indexing are used.
            tfidf_dense: feature rows, converted to a float32 tensor.
            labels: class ids, converted to an int64 tensor.
            texts: raw samples, indexed by position.
        """
        self.encodings = encodings
        self.tfidf_dense = torch.tensor(tfidf_dense, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.long)
        self.texts = texts

    def __len__(self):
        """Return the number of samples, taken from the label tensor."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample `idx` as a dict.

        The dict holds every field of `encodings` at `idx`, followed by
        'tfidf_dense', 'labels' and 'texts'.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample.update(
            tfidf_dense=self.tfidf_dense[idx],
            labels=self.labels[idx],
            texts=self.texts[idx],
        )
        return sample
        
# Convert the TF-IDF matrices to dense arrays so HybridDataset can turn them into tensors.
X_train_tfidf_dense, X_test_tfidf_dense = X_train_tfidf.toarray(), X_test_tfidf.toarray()

# One dataset per split: tokenizer output, dense TF-IDF rows, labels and raw texts.
train_dataset = HybridDataset(
    encodings=train_encodings,
    tfidf_dense=X_train_tfidf_dense,
    labels=y_train,
    texts=X_train,
)
test_dataset = HybridDataset(
    encodings=test_encodings,
    tfidf_dense=X_test_tfidf_dense,
    labels=y_test,
    texts=X_test,
)


In [9]:
class HybridClassifier(nn.Module):
    """RoBERTa encoder whose first-token embedding is concatenated with TF-IDF features.

    The concatenated vector is passed through a small MLP head that produces
    one logit per class.
    """

    def __init__(self, tfidf_dim, num_labels):
        """Build the encoder and the classification head.

        Args:
            tfidf_dim: width of the TF-IDF feature vector given to `forward`.
            num_labels: number of output classes (size of the final linear layer).
        """
        super().__init__()
        self.config = RobertaConfig.from_pretrained('roberta-base',
                                                    hidden_dropout_prob=0.15,
                                                    attention_probs_dropout_prob=0.05)
        self.transformer = RobertaModel.from_pretrained('roberta-base', config=self.config)
        self.hidden_size = self.transformer.config.hidden_size
        self.num_labels = num_labels
        self.classifier = nn.Sequential(
            nn.Linear(self.hidden_size + tfidf_dim, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 64),  # buffer layer
            nn.ReLU(),
            nn.Dropout(0.2),
            # The output width follows `num_labels`; it was previously fixed at 3,
            # which silently ignored the constructor argument.
            nn.Linear(64, num_labels)
        )

    def forward(self, input_ids, attention_mask, tfidf_dense):
        """Return the class logits for a batch.

        Args:
            input_ids: token ids forwarded to the transformer.
            attention_mask: attention mask forwarded to the transformer.
            tfidf_dense: TF-IDF features, concatenated to the embedding along dim 1.

        Returns:
            The output of the classification head (one logit per class and sample).
        """
        outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        # Use the hidden state at sequence position 0 as the sentence embedding.
        cls_emb = outputs.last_hidden_state[:, 0, :]
        combined = torch.cat((cls_emb, tfidf_dense), dim=1)
        logits = self.classifier(combined)
        return logits


In [None]:
from torch.utils.data import DataLoader
from tqdm import tqdm

# Seed PyTorch's global RNG (same value as the train/test split seed) so head
# initialisation, dropout and batch shuffling repeat across runs, as far as the
# backend kernels are deterministic.
torch.manual_seed(22)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Size the TF-IDF input from the actual feature matrix instead of repeating the
# vectorizer's max_features, so the two cannot drift apart.
model = HybridClassifier(tfidf_dim=X_train_tfidf_dense.shape[1], num_labels=3).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.02)
loss_fn = nn.CrossEntropyLoss(label_smoothing=0.05)

# Shuffle only the training batches; evaluation keeps the dataset order.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

def freeze_roberta(model):
    """Disable gradient tracking for every parameter of `model.transformer`."""
    model.transformer.requires_grad_(False)

def unfreeze_roberta(model):
    """Enable gradient tracking for every parameter of `model.transformer`."""
    model.transformer.requires_grad_(True)
        
patience = 2  # Number of epochs with no improvement before stopping
best_eval_loss = float('inf')
epochs_no_improve = 0

# Number of initial epochs during which RoBERTa stays frozen. With 0 the freeze
# applied at epoch 0 is undone immediately, so the backbone trains from the start.
num_frozen_epochs = 0

for epoch in range(20):  # upper bound on epochs; early stopping below can end the run sooner
    # ---- Training ----
    if epoch == 0:
        freeze_roberta(model)
    if epoch == num_frozen_epochs:
        unfreeze_roberta(model)

    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1} - Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        tfidf_dense = batch['tfidf_dense'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask, tfidf_dense)
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_train_loss = total_loss / len(train_loader)

    # ---- Evaluation ----
    model.eval()
    eval_loss = 0
    all_preds = []
    all_labels = []
    all_probs = []
    with torch.no_grad():
        for batch in tqdm(test_loader, desc=f"Epoch {epoch+1} - Eval"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            tfidf_dense = batch['tfidf_dense'].to(device)
            labels = batch['labels'].to(device)

            logits = model(input_ids, attention_mask, tfidf_dense)
            # Same (label-smoothed) criterion as training; this value drives early stopping.
            loss = loss_fn(logits, labels)
            eval_loss += loss.item()

            # For metrics:
            probs = torch.softmax(logits, dim=-1).cpu().numpy()   # shape [batch, 3]
            preds = np.argmax(probs, axis=1)                      # shape [batch]
            all_probs.append(probs)
            all_preds.append(preds)
            all_labels.append(labels.cpu().numpy())

    avg_eval_loss = eval_loss / len(test_loader)

    # Concatenate all results
    all_preds = np.concatenate(all_preds)
    all_labels = np.concatenate(all_labels)
    all_probs = np.concatenate(all_probs)

    # f1_score and log_loss come from sklearn.metrics (imported in the notebook's import cell).
    macro_f1 = f1_score(all_labels, all_preds, average='macro')
    # Name every class explicitly so the probability columns are matched to class
    # ids even if some class is missing from the evaluation labels.
    logloss = log_loss(all_labels, all_probs, labels=np.arange(all_probs.shape[1]))

    # One summary line per epoch (the shorter duplicate of this line was removed).
    print(f"Epoch {epoch+1} - Training loss: {avg_train_loss:.4f} | Eval loss: {avg_eval_loss:.4f} | Macro F1: {macro_f1:.4f} | Logloss: {logloss:.4f}")

    # ---- Early Stopping ----
    if avg_eval_loss < best_eval_loss:
        best_eval_loss = avg_eval_loss
        epochs_no_improve = 0
        # Checkpoint the weights of the best epoch so far (relative to the working directory).
        torch.save(model.state_dict(), "best_model-r-1000-lower-reg.pt")
    else:
        epochs_no_improve += 1
        print(f"No improvement for {epochs_no_improve} epochs.")
        if epochs_no_improve >= patience:
            print("Early stopping!")
            break


In [None]:
# Stage two: restart from the best stage-one checkpoint and train only the head.
# map_location keeps the load working when the checkpoint was written on a GPU
# and this session has none.
model.load_state_dict(
    torch.load("/kaggle/working/best_model-r-1000-lower-reg.pt", map_location=device)
)

freeze_roberta(model)
optimizer = torch.optim.AdamW(model.classifier.parameters(), lr=2e-6)  # You can try a higher LR here
epochs_no_improve = 0
# `best_eval_loss` and `patience` are carried over from the stage-one cell, so a
# stage-two checkpoint is only written when it beats the stage-one best.
for epoch in range(20):
    # NOTE(review): model.train() also switches the frozen transformer to training
    # mode (its dropout stays active) - confirm this is intended.
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1} - Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        tfidf_dense = batch['tfidf_dense'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask, tfidf_dense)
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_train_loss = total_loss / len(train_loader)

    # ---- Evaluation ----
    model.eval()
    eval_loss = 0
    with torch.no_grad():
        for batch in tqdm(test_loader, desc=f"Epoch {epoch+1} - Eval"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            tfidf_dense = batch['tfidf_dense'].to(device)
            labels = batch['labels'].to(device)

            logits = model(input_ids, attention_mask, tfidf_dense)
            loss = loss_fn(logits, labels)
            eval_loss += loss.item()
    avg_eval_loss = eval_loss / len(test_loader)

    print(f"Epoch {epoch+1} - Training loss: {avg_train_loss:.4f} | Eval loss: {avg_eval_loss:.4f}")

    # ---- Early Stopping ----
    if avg_eval_loss < best_eval_loss:
        best_eval_loss = avg_eval_loss
        epochs_no_improve = 0
        # NOTE(review): the later cells visible in this notebook reload the
        # stage-one checkpoint, not this file - confirm which one should be used.
        torch.save(model.state_dict(), "best_model-header-o.pt")
    else:
        epochs_no_improve += 1
        print(f"No improvement for {epochs_no_improve} epochs.")
        if epochs_no_improve >= patience:
            print("Early stopping!")
            break

In [None]:
from sklearn.metrics import accuracy_score, f1_score
# Evaluate the best stage-one checkpoint on the held-out split.
# map_location keeps the load working when the checkpoint was written on a GPU
# and this session has none.
model.load_state_dict(
    torch.load("/kaggle/working/best_model-r-1000-lower-reg.pt", map_location=device)
)
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        tfidf_dense = batch['tfidf_dense'].to(device)
        labels = batch['labels'].to(device)
        logits = model(input_ids, attention_mask, tfidf_dense)
        # Predicted class = index of the largest logit.
        preds = torch.argmax(logits, dim=-1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

acc = accuracy_score(all_labels, all_preds)
f1 = f1_score(all_labels, all_preds, average='macro')
print(f"Test Accuracy: {acc:.4f}")
print(f"Test Macro F1: {f1:.4f}")


In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Reload the best stage-one weights before collecting per-sample predictions.
# map_location keeps the load working when the checkpoint was written on a GPU
# and this session has none.
model.load_state_dict(
    torch.load("/kaggle/working/best_model-r-1000-lower-reg.pt", map_location=device)
)
model.eval()

from torch.utils.data import DataLoader

def custom_collate(batch):
    """Merge a list of sample dicts into a single batch dict.

    The keys are taken from the first sample. Values under 'texts' are gathered
    into a plain list; values under every other key are combined with
    `torch.stack`.
    """
    def _merge(field):
        column = [sample[field] for sample in batch]
        return column if field == 'texts' else torch.stack(column)

    return {field: _merge(field) for field in batch[0].keys()}

# Rebuild the held-out loader with the custom collate function, which gathers
# each batch's raw strings into a list under 'texts'.
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=custom_collate)

# Per-sample results gathered across all batches, in dataset order.
all_preds, all_labels, all_texts, all_lengths = [], [], [], []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        tfidf_dense = batch['tfidf_dense'].to(device)
        labels = batch['labels'].to(device)
        texts = batch['texts']  # raw strings supplied by HybridDataset.__getitem__

        logits = model(input_ids, attention_mask, tfidf_dense)
        preds = logits.argmax(dim=-1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())
        all_texts.extend(texts)
        # Text length is measured in characters.
        all_lengths.extend(map(len, texts))

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Evaluate model and collect predictions, labels, texts, lengths as before...

# Build the DataFrame for analysis: one row per held-out sample.
results_df = pd.DataFrame({
    "true_label": all_labels,
    "pred_label": all_preds,
    "text": all_texts,
    "text_length": all_lengths
})

# Add the length_bucket column BEFORE filtering.
# Buckets are over character counts; lengths outside (0, 4096] get no bucket.
results_df['length_bucket'] = pd.cut(
    results_df['text_length'],
    bins=[0, 50, 100, 256, 512, 4096]
)

# Misclassifications
is_error = results_df['true_label'] != results_df['pred_label']
missed = results_df[is_error]
print(f"\nNumber of misclassified samples: {len(missed)}/{len(results_df)} ({len(missed)/len(results_df)*100:.2f}%)")
print("True labels of misclassified samples:\n", missed['true_label'].value_counts())
print("Predicted labels of misclassified samples:\n", missed['pred_label'].value_counts())

# Length analysis
# The rate is the fraction of samples in each bucket that were misclassified.
# Counting only the misclassified rows per bucket (normalised) would instead give
# each bucket's share of all errors, which mostly mirrors how many samples fall
# into the bucket.
print("\nMisclassification rate by length bucket:")
error_rate_by_bucket = is_error.groupby(results_df['length_bucket'], observed=False).mean()
print(error_rate_by_bucket)

# Confusion matrix
cm = confusion_matrix(all_labels, all_preds)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.show()

# Show random misclassified examples
print("\nSample misclassified examples:")
# Cap the sample size so this does not raise when fewer than 5 samples were missed.
sampled = missed.sample(min(5, len(missed)), random_state=42)
for idx, row in sampled.iterrows():
    print(f"True: {row['true_label']}, Pred: {row['pred_label']}, Length: {row['text_length']}")
    print(row['text'][:250])  # Print first 250 chars
    print("-" * 50)


In [None]:
import pandas as pd

# Read the competition test set and pull out its ids and texts as plain lists.
# Note that `texts` is rebound here and no longer refers to the training texts.
test_df = pd.read_csv("/kaggle/input/identify-the-author/test/test.csv")
ids, texts = (test_df[column].tolist() for column in ("id", "text"))

In [None]:
# TF-IDF features for the competition test set, produced by the vectorizer that
# was fitted on the training split (no SVD step is involved).
X_test_tfidf = vectorizer.transform(texts)
X_test_tfidf_dense = X_test_tfidf.toarray()

# Tokenize with the same helper and settings used for the training data.
test_encodings = tokenize(texts, tokenizer, max_length=256)

In [None]:
from torch.utils.data import DataLoader

class HybridTestDataset(torch.utils.data.Dataset):
    """Label-free dataset pairing tokenizer output with TF-IDF features."""

    def __init__(self, encodings, tfidf_dense):
        """Keep the encodings and convert the features to a float32 tensor.

        Args:
            encodings: mapping of field name to an indexable value; only
                `.items()` and per-field indexing are used.
            tfidf_dense: feature rows, one per sample.
        """
        self.encodings = encodings
        self.tfidf_dense = torch.tensor(tfidf_dense, dtype=torch.float32)

    def __len__(self):
        """Return the number of samples, i.e. the first dimension of the feature tensor."""
        return self.tfidf_dense.size(0)

    def __getitem__(self, idx):
        """Return sample `idx`: every encodings field at `idx`, then 'tfidf_dense'."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample['tfidf_dense'] = self.tfidf_dense[idx]
        return sample

# Wrap the competition test features in a dataset and an order-preserving loader.
# `test_dataset` and `test_loader` are rebound here and no longer refer to the
# held-out validation split.
test_dataset = HybridTestDataset(
    encodings=test_encodings,
    tfidf_dense=X_test_tfidf_dense,
)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [None]:
import torch
import numpy as np

# Run inference over `test_loader` and collect per-class probabilities.
# NOTE(review): this uses whatever weights `model` currently holds; the earlier
# evaluation cells reload best_model-r-1000-lower-reg.pt - confirm that is the
# checkpoint intended for the submission.
model.eval()
all_probs = []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        tfidf_dense = batch['tfidf_dense'].to(device)

        logits = model(input_ids, attention_mask, tfidf_dense)
        # Convert logits to probabilities over the last dimension.
        probs = torch.softmax(logits, dim=-1)
        all_probs.append(probs.cpu().numpy())

# Stack the per-batch arrays into one array, keeping loader order.
all_probs = np.concatenate(all_probs, axis=0)  # shape: (num_samples, 3)

In [None]:
# Name the probability columns by class id, using `author_map` as the single
# source of truth: column k is the author encoded as k (here EAP, HPL, MWS).
author_labels = sorted(author_map, key=author_map.get)

# Cast to float64 before clipping. In float32 the upper bound 1 - 1e-15 rounds to
# exactly 1.0, so a float32 frame would not actually be clipped from above.
submission_df = pd.DataFrame(all_probs.astype(np.float64), columns=author_labels)
submission_df.insert(0, 'id', ids)

# Keep every probability strictly inside (0, 1).
eps = 1e-15
submission_df[author_labels] = submission_df[author_labels].clip(eps, 1 - eps)

In [None]:
# Write the submission without the index column, using 15 decimal places per probability.
submission_df.to_csv(
    "submission-best_model-r-1000-lower-reg.csv",
    index=False,
    float_format='%.15f',
)