In [None]:
# Install required libraries with compatible versions
!pip install torch==2.5.1 transformers==4.44.2 tokenizers==0.19.1 peft==0.12.0 datasets==2.21.0 sentencepiece==0.2.0 accelerate==0.34.2 nltk==3.9.1 rouge_score==0.1.2 bert-score==0.3.13 sacrebleu==2.4.3 numpy>=1.26.4 fsspec>=2024.6.1 spacy==3.7.5 --force-reinstall --no-cache-dir --quiet

# Set environment variables
# Both variables are written before `torch` and `transformers` are imported
# further down, so the libraries can pick them up when they initialise.
import os
# Option string for PyTorch's CUDA allocator.
# NOTE(review): `expandable_segments` is presumably meant to reduce memory
# fragmentation — confirm against the PyTorch docs for the pinned version.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
# NOTE(review): presumably disables the tokenizers library's own parallelism to
# avoid fork-related warnings during `datasets.map` — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Import libraries
import json
import numpy as np
import pandas as pd
import torch
import nltk
import spacy
from datasets import load_dataset
from rouge_score import rouge_scorer
from sacrebleu.metrics import BLEU
from bert_score import score as bert_score
from transformers import (
    AutoConfig,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    set_seed,
    GenerationConfig,
    __version__ as transformers_version
)
from peft import LoraConfig, get_peft_model, TaskType
nltk.download("punkt", quiet=True)
# NLTK 3.9 (the version pinned in the install line) loads sentence-tokenizer
# data from the "punkt_tab" resource instead of the pickled "punkt" package, so
# `nltk.sent_tokenize` needs this one as well. The legacy download is kept for
# older NLTK releases.
nltk.download("punkt_tab", quiet=True)
# spaCy pipeline used by `parse_question` for POS tagging.
nlp = spacy.load("en_core_web_sm")

# Verify library versions
print(f"Installed transformers version: {transformers_version}")

# Set random seed for reproducibility
def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Integer seed handed to NumPy, PyTorch and ``transformers.set_seed``.

    When CUDA is available the current device is seeded too and cuDNN is
    switched to deterministic mode.
    """
    for seeder in (np.random.seed, torch.manual_seed, set_seed):
        seeder(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything()

# Initialize model and tokenizer
# Checkpoint name shared by the config, tokenizer and model loads below.
model_name = "t5-base"
config = AutoConfig.from_pretrained(model_name)
# model_max_length=512 matches the max_length used when tokenizing inputs later.
tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=512)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, config=config)
# NOTE(review): no tokens are added to the tokenizer in this notebook, so this
# resize only has an effect if len(tokenizer) differs from the checkpoint's
# embedding size — confirm the resize is intended.
model.resize_token_embeddings(len(tokenizer))

# Enable gradient checkpointing
# The same use_reentrant=False setting is repeated in the training arguments.
model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})

# Apply LoRA
# Adapter configuration: rank 8, alpha 32, dropout 0.1.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    # NOTE(review): "q" and "v" are presumably the attention query/value
    # projection module names of the loaded T5 model — verify against the model.
    target_modules=["q", "v"]
)
# `model` is rebound to the PEFT-wrapped model; everything below uses the wrapper.
model = get_peft_model(model, lora_config)
# Prints how many parameters remain trainable after wrapping.
model.print_trainable_parameters()

# Debug dataset
# Print the directory listings so a wrong or missing Kaggle input mount is
# obvious before the load below fails.
for banner, folder in (
    ("Available datasets in /kaggle/input/:", '/kaggle/input/'),
    ("Files in /kaggle/input/feta-dataset/:", '/kaggle/input/feta-dataset/'),
):
    print(banner)
    print(os.listdir(folder))

# Load FeTaQA dataset
# One JSON-lines file per split; the split names are reused when encoding.
split_files = {
    'train': '/kaggle/input/feta-dataset/fetaQA-v1_train.jsonl',
    'valid': '/kaggle/input/feta-dataset/fetaQA-v1_dev.jsonl',
    'test': '/kaggle/input/feta-dataset/fetaQA-v1_test.jsonl'
}
dataset = load_dataset('json', data_files=split_files)
print("Sample train example:", dataset['train'][0])

# Question Parsing Module
def parse_question(question):
    """Extract content keywords and a coarse intent from a question.

    Args:
        question: The natural-language question text.

    Returns:
        dict with two keys:
          * "keywords": lower-cased text of every token the module-level spaCy
            pipeline ``nlp`` tags as NOUN, PROPN or NUM.
          * "intent": "aggregate" if the question contains one of the cue words
            total / sum / average / count, otherwise "lookup".
    """
    doc = nlp(question)
    keywords = [token.text.lower() for token in doc if token.pos_ in ("NOUN", "PROPN", "NUM")]

    aggregate_cues = {"total", "sum", "average", "count"}
    # Compare whole tokens (surface form or lemma) rather than substrings of the
    # question: a substring test also fires on "country", "county", "account",
    # "summer", "summit", ... and mislabels plain lookups as aggregations.
    is_aggregate = any(
        token.text.lower() in aggregate_cues or token.lemma_.lower() in aggregate_cues
        for token in doc
    )
    return {"keywords": keywords, "intent": "aggregate" if is_aggregate else "lookup"}

# Information Retrieval Module
def retrieve_relevant_cells(example):
    """Select the table cells that look relevant to the example's question.

    Columns are relevant when their header contains a question keyword (all
    columns if none match). Rows are relevant when they hold a highlighted cell
    or any of their cells contains a keyword.

    Args:
        example: Mapping with 'table_array' (list of rows, first row is the
            header), 'question' and 'highlighted_cell_ids' (pairs of
            [row, col] indices into 'table_array').

    Returns:
        (relevant_cells, intent): ``relevant_cells`` is a list of [row, col]
        indices into 'table_array', in ascending row order and containing only
        positions that exist in the table; ``intent`` is the value produced by
        ``parse_question``.
    """
    table_array = example['table_array']
    question = example['question']
    highlighted_cells = example['highlighted_cell_ids']
    parsed = parse_question(question)
    intent = parsed["intent"]
    # Lower-case once instead of inside every cell comparison.
    keywords = [kw.lower() for kw in parsed["keywords"]]

    # An empty table has no header row to index.
    if not table_array:
        return [], intent

    header = table_array[0]
    rows = table_array[1:]

    def mentions_keyword(value):
        text = str(value).lower()
        return any(kw in text for kw in keywords)

    relevant_cols = [i for i, col in enumerate(header) if mentions_keyword(col)]
    if not relevant_cols:
        relevant_cols = list(range(len(header)))

    relevant_rows = set()
    for cell_id in highlighted_cells:
        row_idx = cell_id[0] - 1  # highlighted ids count the header as row 0
        # Ignore header highlights and ids that point past the last row.
        if 0 <= row_idx < len(rows):
            relevant_rows.add(row_idx)
    for i, row in enumerate(rows):
        if any(mentions_keyword(cell) for cell in row):
            relevant_rows.add(i)

    # sorted() gives a stable, top-to-bottom order; the length check drops
    # columns that a short (ragged) row does not have.
    relevant_cells = [
        [row_idx + 1, col_idx]
        for row_idx in sorted(relevant_rows)
        for col_idx in relevant_cols
        if col_idx < len(rows[row_idx])
    ]
    return relevant_cells, intent

# Reasoning Module
def reason_over_cells(table_array, relevant_cells, intent):
    """Aggregate the numeric values found in the given cells.

    Args:
        table_array: Table as a list of rows (header row included).
        relevant_cells: Iterable of [row, col] indices into ``table_array``.
        intent: Intent string. Plain "aggregate" sums the values; an aggregate
            intent that also contains "average" (e.g. "aggregate:average")
            returns their mean. Any other intent yields no reasoning output.

    Returns:
        The string "Aggregated value: <number>", or None when the intent is not
        an aggregate intent or no cell holds a finite number.
    """
    import math

    # The original compared with == "aggregate", which made the "average"
    # branch below unreachable; a prefix test keeps "aggregate" behaving exactly
    # as before while letting qualified intents reach it.
    if not (isinstance(intent, str) and intent.startswith("aggregate")):
        return None

    values = []
    for row, col in relevant_cells:
        try:
            value = float(table_array[row][col])
        except (ValueError, TypeError, IndexError):
            # Non-numeric cell, or an index that a ragged table does not have.
            continue
        # float() accepts "nan"/"inf"; one such cell would poison the result.
        if math.isfinite(value):
            values.append(value)

    if not values:
        return None

    total = sum(values)
    result = total / len(values) if "average" in intent else total
    return f"Aggregated value: {result}"

# Enhanced Table Linearization
def linearize_table_context(example):
    """Flatten a question and its table into one tagged input string.

    The result has the layout::

        [QUESTION] <q> [INTENT] <intent> [HIGHLIGHTED_CELLS] row R column C: v, ...
         [TABLE] [HEADER] h1 | h2 [ROWS] [ROW] h1: v [SEP] h2: v [ROW] ...

    and, for aggregate questions with numeric relevant cells, a trailing
    " [REASONING] Aggregated value: <number>".

    Args:
        example: Mapping with 'table_array' (list of rows, first row is the
            header), 'question' and 'highlighted_cell_ids' ([row, col] pairs
            indexing 'table_array').

    Returns:
        The linearized string.
    """
    table_array = example['table_array']
    question = example['question']
    highlighted_cells = example['highlighted_cell_ids']
    intent = parse_question(question)["intent"]
    header = table_array[0]
    rows = table_array[1:]

    highlighted_parts = []
    for row_idx, col_idx in highlighted_cells:
        try:
            cell_value = header[col_idx] if row_idx == 0 else rows[row_idx - 1][col_idx]
        except IndexError:
            # A highlight that points outside the table is skipped rather than
            # aborting preprocessing of the whole batch.
            continue
        highlighted_parts.append(f"row {row_idx} column {col_idx}: {cell_value}")
    # Separators are placed with join(). The original appended ", " after every
    # item and removed the last one with rstrip(", "), which strips *characters*
    # and therefore also ate trailing commas/spaces of the final cell value.
    highlighted_str = " [HIGHLIGHTED_CELLS]"
    if highlighted_parts:
        highlighted_str += " " + ", ".join(highlighted_parts)

    row_strs = []
    for row in rows:
        cell_strs = []
        for j, cell in enumerate(row):
            # Rows longer than the header get a positional column name.
            col_name = header[j] if j < len(header) else f"column {j}"
            cell_strs.append(f"{col_name}: {cell}")
        # rstrip(" [SEP] ") removed any trailing run of the characters
        # ' ', '[', ']', 'S', 'E', 'P', silently truncating values such as
        # "UPS" or "GP"; joining never touches the cell text.
        row_strs.append("[ROW] " + " [SEP] ".join(cell_strs) if cell_strs else "[ROW]")

    table_str = (
        " [TABLE] [HEADER] "
        + " | ".join(str(col) for col in header)
        + " [ROWS] "
        + "".join(row_str + " " for row_str in row_strs)
    )

    linearized = f"[QUESTION] {question} [INTENT] {intent}{highlighted_str}{table_str}"

    if intent == "aggregate":
        relevant_cells, _ = retrieve_relevant_cells(example)
        reasoning_output = reason_over_cells(table_array, relevant_cells, intent)
        if reasoning_output:
            linearized += f" [REASONING] {reasoning_output}"

    return linearized

# Preprocess dataset
def preprocess_examples(examples):
    """Tokenize a list of examples into model inputs and masked labels.

    Args:
        examples: List of example mappings, each with the fields read by
            ``linearize_table_context`` plus an 'answer' string.

    Returns:
        The tokenizer output for the linearized inputs (padded/truncated to 512
        tokens) with an added "labels" entry: answer token ids padded/truncated
        to 64 tokens, with padding positions replaced by -100.
    """
    prefix = 'answer: '
    inputs = [prefix + linearize_table_context(example) for example in examples]
    answers = [example['answer'] for example in examples]

    # Plain Python lists are returned on purpose. With return_tensors="pt" the
    # label masking below iterated 0-d tensors and produced lists mixing ints
    # and tensors, which is slow and has to be converted back for storage.
    model_inputs = tokenizer(
        inputs,
        max_length=512,
        padding="max_length",
        truncation=True
    )

    label_ids = tokenizer(
        answers,
        max_length=64,
        padding="max_length",
        truncation=True
    )["input_ids"]

    # -100 marks positions that must not contribute to the loss; the data
    # collator is configured with the same label padding value.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token == pad_id else token for token in label]
        for label in label_ids
    ]
    return model_inputs

# Apply preprocessing
def process_batch(batch):
    """Adapt a column-oriented batch to the row-oriented ``preprocess_examples``.

    Args:
        batch: Mapping of column name -> list of values, as passed by
            ``Dataset.map(..., batched=True)``.

    Returns:
        Whatever ``preprocess_examples`` returns for the list of row dicts.
    """
    keys = list(batch.keys())
    columns = [batch[key] for key in keys]
    # zip(*columns) walks the columns in lockstep, yielding one tuple per row.
    rows = [dict(zip(keys, values)) for values in zip(*columns)]
    return preprocess_examples(rows)

def _encode_split(split_name):
    """Tokenize one dataset split, dropping all of its original columns."""
    split = dataset[split_name]
    return split.map(process_batch, batched=True, remove_columns=split.column_names)

encoded_train_ds = _encode_split('train')
encoded_val_ds = _encode_split('valid')
encoded_test_ds = _encode_split('test')

# Post-process text for evaluation
def postprocess_text(preds, labels, metric_name):
    """Normalise decoded predictions and references for one metric.

    Args:
        preds: List of predicted strings.
        labels: List of reference strings.
        metric_name: "rouge" puts each sentence on its own line, "sacrebleu"
            wraps every reference in a single-element list; any other value
            only strips surrounding whitespace.

    Returns:
        (preds, labels) after the transformation described above.
    """
    clean_preds = [text.strip() for text in preds]
    clean_labels = [text.strip() for text in labels]

    if metric_name == "rouge":
        def one_sentence_per_line(text):
            return "\n".join(nltk.sent_tokenize(text))

        return (
            [one_sentence_per_line(text) for text in clean_preds],
            [one_sentence_per_line(text) for text in clean_labels],
        )

    if metric_name == "sacrebleu":
        return clean_preds, [[text] for text in clean_labels]

    return clean_preds, clean_labels

# Compute evaluation metrics manually
def compute_metrics(eval_preds):
    """Decode generated ids and score them with sacreBLEU, ROUGE and BERTScore.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays supplied
            by the trainer. ``predictions`` may be a tuple, in which case its
            first element is used. Label positions equal to -100 are padding.

    Returns:
        dict with the keys "sacrebleu", "rouge1", "rouge2", "rougeL",
        "bertscore_precision", "bertscore_recall" and "bertscore_f1". A metric
        whose computation raises is logged and reported as 0.0, so one failing
        metric does not abort evaluation.

    Side effects:
        Overwrites ``predictions.json`` in ``training_args.output_dir`` with the
        decoded predictions and labels of this call.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # Predictions may carry -100 padding just like the labels. Map it to the
    # pad token explicitly; the clip alone would turn -100 into id 0, which is
    # only right when the pad token id happens to be 0.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    preds = np.clip(preds, 0, tokenizer.vocab_size - 1)

    decoded_preds = []
    for pred in preds:
        try:
            decoded = tokenizer.decode(pred, skip_special_tokens=True)
            decoded_preds.append(decoded if decoded else "<empty>")
        except Exception as e:
            print(f"Error decoding prediction: {e}")
            decoded_preds.append("<error>")

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = [tokenizer.decode(label, skip_special_tokens=True) for label in labels]

    save_path = os.path.join(training_args.output_dir, "predictions.json")
    with open(save_path, "w") as f:
        json.dump({"predictions": decoded_preds, "labels": decoded_labels}, f, indent=4)

    result = {}

    try:
        bleu = BLEU()
        preds_bleu, labels_bleu = postprocess_text(decoded_preds, decoded_labels, "sacrebleu")
        # postprocess_text returns one reference list per prediction, while
        # BLEU.corpus_score expects one list per reference *stream*, each as
        # long as the predictions. Passing the untransposed form makes
        # sacreBLEU score only the first prediction, so transpose first.
        reference_streams = [list(stream) for stream in zip(*labels_bleu)]
        bleu_score = bleu.corpus_score(preds_bleu, reference_streams).score
        result["sacrebleu"] = round(bleu_score, 4)
    except Exception as e:
        print(f"Error computing sacrebleu: {e}")
        result["sacrebleu"] = 0.0

    try:
        scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        preds_rouge, labels_rouge = postprocess_text(decoded_preds, decoded_labels, "rouge")
        rouge_scores = {"rouge1": 0.0, "rouge2": 0.0, "rougeL": 0.0}
        for pred, label in zip(preds_rouge, labels_rouge):
            scores = scorer.score(label, pred)
            for key in rouge_scores:
                rouge_scores[key] += scores[key].fmeasure
        # Report the mean F-measure over all examples.
        for key in rouge_scores:
            rouge_scores[key] /= len(preds_rouge)
            result[key] = round(rouge_scores[key], 4)
    except Exception as e:
        print(f"Error computing rouge: {e}")
        result.update({"rouge1": 0.0, "rouge2": 0.0, "rougeL": 0.0})

    try:
        P, R, F1 = bert_score(decoded_preds, decoded_labels, lang="en", model_type="roberta-base", verbose=False)
        result.update({
            "bertscore_precision": round(P.mean().item(), 4),
            "bertscore_recall": round(R.mean().item(), 4),
            "bertscore_f1": round(F1.mean().item(), 4)
        })
    except Exception as e:
        print(f"Error computing bertscore: {e}")
        result.update({"bertscore_precision": 0.0, "bertscore_recall": 0.0, "bertscore_f1": 0.0})

    return result

# Training arguments
# Configuration consumed by the Seq2SeqTrainer created further down.
# `compute_metrics` also reads `training_args.output_dir` to decide where to
# write predictions.json.
training_args = Seq2SeqTrainingArguments(
    output_dir="/kaggle/working/",
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    do_predict=True,
    num_train_epochs=5,  # Reduced for faster training
    # 2 examples per step x 8 accumulation steps = 16 examples per optimizer
    # update on each device.
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    warmup_steps=500,
    weight_decay=0.01,
    learning_rate=2e-5,  # Lowered to prevent instability
    max_grad_norm=1.0,   # Gradient clipping to prevent nan
    optim="adamw_torch",
    # Evaluation generates text so that the text metrics in `compute_metrics`
    # can be computed; length and beam count mirror the submission loop.
    predict_with_generate=True,
    generation_max_length=64,
    generation_num_beams=4,
    # Evaluation and checkpointing use the same 500-step interval.
    eval_strategy="steps",
    eval_steps=500,      # More frequent evaluation
    save_strategy="steps",
    save_steps=500,      # More frequent checkpoints
    # Mixed precision is enabled only when a GPU is present.
    fp16=torch.cuda.is_available(),
    logging_steps=250,
    # "bertscore_f1" is one of the keys returned by `compute_metrics`.
    load_best_model_at_end=True,
    metric_for_best_model="bertscore_f1",
    report_to="none",
    # Same checkpointing settings as the explicit
    # model.gradient_checkpointing_enable(...) call made after loading the model.
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False}
)

# Initialize data collator
# label_pad_token_id=-100 matches the value `preprocess_examples` writes into
# padded label positions.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=-100
)

# Initialize trainer
# Trains on the encoded train split, evaluates on the encoded validation split
# and scores generations with `compute_metrics`.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=encoded_train_ds,
    eval_dataset=encoded_val_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Evaluate on test set
# The printed metrics carry a "test_" prefix. `compute_metrics` runs here as
# well, so predictions.json ends up holding the test-set predictions.
test_results = trainer.predict(encoded_test_ds)
print("Test Results:", test_results.metrics)

# Save the model
trainer.save_model("/kaggle/working/tableqa_model")

# Generate submission
# Runs the trained model over the raw test split one example at a time and
# writes question / prediction / reference triples to a CSV file.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

test_split = dataset['test']
predictions = []
for example in test_split:
    encoded = tokenizer(
        "answer: " + linearize_table_context(example),
        max_length=512,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}
    with torch.no_grad():
        generated = model.generate(
            input_ids=encoded["input_ids"],
            attention_mask=encoded["attention_mask"],
            max_length=64,
            num_beams=4,
            early_stopping=True
        )
    decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
    # An empty generation is recorded with a visible placeholder.
    predictions.append(decoded or "<empty>")

questions = [row["question"] for row in test_split]
references = [row["answer"] for row in test_split]
submission_df = pd.DataFrame({
    "question": questions,
    "predicted_answer": predictions,
    "true_answer": references
})
submission_df.to_csv("/kaggle/working/submission.csv", index=False)
print("Submission saved to /kaggle/working/submission.csv")

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pylibcudf-cu12 25.2.2 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine == "x86_64", but you have pyarrow 20.0.0 which is incompatible.
cudf-cu12 25.2.2 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine == "x86_64", but you have pyarrow 20.0.0 which is incompatible.
sigstore 3.6.1 requires rich~=13.0, but you have rich 14.0.0 which is incompatible.
ydata-profiling 4.16.1 requires matplotlib<=3.10,>=3.5, but you have matplotlib 3.10.1 which is incompatible.
nilearn 0.11.1 requires scikit-learn>=1.4.0, but you have scikit-learn 1.2.2 which is incompatible.
preprocessing 0.1.13 requires nltk==3.2.4, but you have nltk 3.9.1 which is incompatible.
google-colab 1.0.0 requires notebook==6.5.5, but you have notebook 6.5.4 which is incompatible.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.2.

Map:   0%|          | 0/7326 [00:00<?, ? examples/s]

Map:   0%|          | 0/1001 [00:00<?, ? examples/s]

Map:   0%|          | 0/2003 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss,Validation Loss,Sacrebleu,Rouge1,Rouge2,Rougel,Bertscore Precision,Bertscore Recall,Bertscore F1
500,1.9532,1.477901,48.327,0.4864,0.2917,0.4053,0.9082,0.8833,0.8951
1000,1.5503,1.273255,56.551,0.6037,0.3771,0.4965,0.9358,0.9098,0.9224
1500,1.514,1.271583,56.551,0.6036,0.3775,0.4965,0.9359,0.9101,0.9225
2000,1.4885,1.271592,56.551,0.6036,0.3775,0.4965,0.9358,0.9101,0.9225


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['robert

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Test Results: {'test_loss': 1.2327446937561035, 'test_sacrebleu': 31.4666, 'test_rouge1': 0.6122, 'test_rouge2': 0.3833, 'test_rougeL': 0.5038, 'test_bertscore_precision': 0.9358, 'test_bertscore_recall': 0.9118, 'test_bertscore_f1': 0.9234, 'test_runtime': 954.8764, 'test_samples_per_second': 2.098, 'test_steps_per_second': 1.049}


