In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the Kaggle input mount and echo its full path.
input_files = (
    os.path.join(folder, entry)
    for folder, _subdirs, entries in os.walk('/kaggle/input')
    for entry in entries
)
for input_file in input_files:
    print(input_file)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/indaba-test/submission_indabax.csv
/kaggle/input/indabax-tunisia-2025-anomaly-solver-challenge-2/SampleSubmission.csv
/kaggle/input/indabax-tunisia-2025-anomaly-solver-challenge-2/SecondChallengeStarterNotebook.ipynb
/kaggle/input/indabax-tunisia-2025-anomaly-solver-challenge-2/Train.csv
/kaggle/input/indabax-tunisia-2025-anomaly-solver-challenge-2/Test.csv


In [5]:
!pip install rouge_score 

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [6]:
# Import necessary libraries
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import (
    BartForConditionalGeneration, 
    BartTokenizer, 
    get_linear_schedule_with_warmup
)
from torch.optim import AdamW
from datasets import Dataset
import torch
from torch.utils.data import DataLoader
from rouge_score import rouge_scorer
from tqdm import tqdm
import warnings
from transformers import get_scheduler



ModuleNotFoundError: No module named 'datasets'

In [None]:
# Suppress every Python warning for the rest of the session to keep cell output short.
# NOTE(review): the blanket 'ignore' filter also silences DeprecationWarning /
# FutureWarning from the libraries used below — consider narrowing it by category.
warnings.filterwarnings('ignore')

In [None]:
# Constants
# Checkpoint name passed to BartTokenizer / BartForConditionalGeneration.from_pretrained.
MODEL_NAME = "facebook/bart-base"
# Batch size of the training and validation DataLoaders.
BATCH_SIZE = 8
# Prompts are truncated/padded to this many tokens when tokenized for training.
MAX_INPUT_LENGTH = 512
# Targets are truncated/padded to this many tokens; also the generation max_length during validation.
MAX_TARGET_LENGTH = 128
# Number of passes over the training DataLoader; also used to size the LR schedule.
NUM_EPOCHS = 4
# Learning rate handed to AdamW.
# NOTE(review): 5e-4 looks high for fine-tuning a pretrained seq2seq model — confirm this is intended.
LEARNING_RATE = 5e-4

In [None]:
# Data Preparation Functions
def load_and_preprocess_data(train_path, missing_threshold=0.5):
    """Load the training CSV, drop sparse columns and impute missing values.

    Columns whose fraction of missing values is strictly greater than
    ``missing_threshold`` are dropped. Remaining gaps are filled per column:
    numeric columns receive their median (0 if the column has no observed
    value at all) and every other column receives the string ``"unknown"``.

    Args:
        train_path: Path or file-like object accepted by ``pd.read_csv``.
        missing_threshold: Maximum tolerated fraction of missing values per
            column (default 0.5).

    Returns:
        The cleaned ``pd.DataFrame`` without any missing values.
    """
    df = pd.read_csv(train_path)

    # Drop columns that are mostly empty.
    missing_ratio = df.isnull().mean()
    cols_to_drop = missing_ratio[missing_ratio > missing_threshold].index
    df = df.drop(columns=cols_to_drop)

    for col in df.columns:
        column = df[col]
        if not column.isna().any():
            continue  # nothing to impute

        if pd.api.types.is_numeric_dtype(column):
            # Median is more robust than the mean for skewed distributions;
            # fall back to 0 when there is no observed value to take it from.
            fill_value = column.median() if column.notna().any() else 0
        else:
            fill_value = "unknown"

        # Assign the result back instead of calling fillna(inplace=True) on the
        # selected column: the in-place form is chained assignment, which is
        # deprecated and does not update `df` under pandas copy-on-write.
        df[col] = column.fillna(fill_value)

    print(f"Dropped columns: {list(cols_to_drop)}")
    print(f"Remaining columns: {df.shape[1]}")
    return df

def create_prompts(df, target_column="improvement_solutions"):
    """Add a ``prompt`` column holding one instruction prompt per row.

    Each prompt consists of a fixed instruction header, one ``- column: value``
    line for every non-empty feature value of the row, and a fixed task
    question. The target column is never included, and neither is an already
    existing ``prompt`` column, so calling this function again on its own
    output yields the same prompts instead of nesting the old prompt text.

    Args:
        df: Input dataframe. It is modified in place (the ``prompt`` column is
            added or overwritten).
        target_column: Name of the column to leave out of the prompt.

    Returns:
        The same dataframe object, with the ``prompt`` column set.
    """
    header = (
        "You are a telecom network assistant. Based on the following network KPIs "
        "and the identified issue, suggest a technical solution and a corrective action.\n\n"
        "Network KPIs and context:\n"
    )
    footer = (
        "\nProblem-solving task:\n"
        "What is the root cause, and what solution or corrective action would you recommend?"
    )
    # Resolve the feature columns once rather than once per row.
    feature_columns = [
        col for col in df.columns if col not in (target_column, "prompt")
    ]

    def row_to_prompt(row):
        parts = [header]
        for col in feature_columns:
            value = row[col]
            # Skip missing values and blank strings.
            if pd.notna(value) and str(value).strip():
                parts.append(f"- {col}: {value}\n")
        parts.append(footer)
        return "".join(parts)

    df["prompt"] = df.apply(row_to_prompt, axis=1)
    return df


In [None]:
# Tokenization and Dataset Preparation
def prepare_datasets(train_df, val_df, tokenizer):
    """Tokenize the train and validation frames into torch-formatted datasets.

    Both frames must contain the ``prompt`` and ``improvement_solutions``
    columns. Prompts are padded/truncated to ``MAX_INPUT_LENGTH`` tokens and
    targets to ``MAX_TARGET_LENGTH`` tokens. Label padding keeps the
    tokenizer's pad token id (it is not replaced by -100).

    Args:
        train_df: Training dataframe.
        val_df: Validation dataframe.
        tokenizer: Hugging Face tokenizer used for both inputs and targets.

    Returns:
        Tuple ``(tokenized_train, tokenized_val)`` of ``datasets.Dataset``
        objects exposing ``input_ids``, ``attention_mask`` and ``labels`` as
        torch tensors.
    """

    def preprocess_function(examples):
        model_inputs = tokenizer(
            examples["prompt"],
            max_length=MAX_INPUT_LENGTH,
            truncation=True,
            padding="max_length",
        )
        # `text_target=` replaces the deprecated `as_target_tokenizer()`
        # context manager for encoding the decoder-side text.
        labels = tokenizer(
            text_target=examples["improvement_solutions"],
            max_length=MAX_TARGET_LENGTH,
            truncation=True,
            padding="max_length",
        )
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    def tokenize_frame(frame):
        # One shared pipeline so train and validation cannot drift apart.
        dataset = Dataset.from_pandas(frame[["prompt", "improvement_solutions"]])
        dataset = dataset.map(preprocess_function, batched=True)
        dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
        return dataset

    tokenized_train = tokenize_frame(train_df)
    tokenized_val = tokenize_frame(val_df)
    return tokenized_train, tokenized_val


In [None]:
from torch.nn import CrossEntropyLoss

# Training Functions
def initialize_model_and_optimizer(num_training_steps=None, num_warmup_steps=200):
    """Create the BART model, its tokenizer, an AdamW optimizer and a linear LR schedule.

    Args:
        num_training_steps: Total number of optimizer steps the schedule should
            span. When omitted it is derived from the module-level
            ``train_dataloader`` as ``len(train_dataloader) * NUM_EPOCHS``, so
            that dataloader must already exist.
        num_warmup_steps: Number of warmup steps of the linear schedule
            (default 200).

    Returns:
        Tuple ``(model, tokenizer, optimizer, lr_scheduler)``.

    Raises:
        NameError: If ``num_training_steps`` is omitted and no global
            ``train_dataloader`` has been defined yet.
    """
    if num_training_steps is None:
        # Resolve the schedule length first so a missing dataloader fails
        # immediately, before any model weights are loaded.
        try:
            steps_per_epoch = len(train_dataloader)
        except NameError:
            raise NameError(
                "train_dataloader is not defined: create the training DataLoader "
                "before calling initialize_model_and_optimizer(), or pass "
                "num_training_steps explicitly."
            ) from None
        num_training_steps = steps_per_epoch * NUM_EPOCHS

    tokenizer = BartTokenizer.from_pretrained(MODEL_NAME)
    model = BartForConditionalGeneration.from_pretrained(MODEL_NAME)
    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
    lr_scheduler = get_scheduler(
        name="linear",
        optimizer=optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps,
    )
    return model, tokenizer, optimizer, lr_scheduler

from torch.cuda.amp import autocast, GradScaler

def train_model(model, train_dataloader, val_dataloader, optimizer,lr_scheduler, tokenizer, device):
    """Fine-tune the model for ``NUM_EPOCHS`` epochs, validating after each one.

    Training uses a label-smoothed cross-entropy that ignores pad tokens,
    gradient clipping to norm 1.0 and, on CUDA devices, automatic mixed
    precision. After every epoch the validation set is scored with the same
    loss and with ROUGE-1/2/L F-measures of beam-search generations, and a
    summary is printed.

    Args:
        model: Sequence-to-sequence model; its output must expose ``logits``.
        train_dataloader: Yields dict batches of tensors including ``labels``.
        val_dataloader: Same structure, with ``input_ids`` and ``attention_mask``.
        optimizer: Optimizer over the model parameters.
        lr_scheduler: Scheduler stepped once per training batch.
        tokenizer: Provides ``pad_token_id``, ``vocab_size`` and ``batch_decode``.
        device: Device (or device string) the batches are moved to.

    Returns:
        The trained model.
    """
    # Mixed precision is only meaningful on CUDA; elsewhere run in full precision.
    use_amp = torch.device(device).type == "cuda"
    scaler = GradScaler(enabled=use_amp)
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    # Built once and shared by training and validation so both losses are comparable.
    loss_fct = CrossEntropyLoss(label_smoothing=0.1, ignore_index=tokenizer.pad_token_id)

    for epoch in range(NUM_EPOCHS):
        # --- Training Phase ---
        model.train()
        epoch_loss = 0
        batch_iter = tqdm(train_dataloader, desc=f"Train Epoch {epoch+1}/{NUM_EPOCHS}", leave=False)

        for batch in batch_iter:
            batch = {k: v.to(device) for k, v in batch.items()}
            # Keep label ids inside the vocabulary range.
            batch['labels'] = torch.clamp(batch['labels'], 0, tokenizer.vocab_size - 1)

            optimizer.zero_grad()
            with autocast(enabled=use_amp):
                outputs = model(**batch)
                logits = outputs.logits
                loss = loss_fct(logits.view(-1, logits.size(-1)), batch["labels"].view(-1))
            scaler.scale(loss).backward()
            # Gradients are still multiplied by the loss scale here; unscale them
            # first so the clipping threshold applies to their true magnitude.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(optimizer)
            lr_scheduler.step()
            scaler.update()

            epoch_loss += loss.item()
            batch_iter.set_postfix({"loss": f"{loss.item():.4f}"})

        # --- Validation Phase ---
        model.eval()
        val_loss = 0
        rouge_accum = {k: [] for k in ['rouge1', 'rouge2', 'rougeL']}

        with torch.no_grad():
            val_iter = tqdm(val_dataloader, desc=f"Validating Epoch {epoch+1}", leave=False)
            for batch in val_iter:
                batch = {k: v.to(device) for k, v in batch.items()}
                with autocast(enabled=use_amp):
                    outputs = model(**batch)
                    logits = outputs.logits
                    # Use the training criterion: the model's built-in loss would
                    # also count the pad positions of the max_length-padded labels.
                    val_loss += loss_fct(
                        logits.view(-1, logits.size(-1)), batch["labels"].view(-1)
                    ).item()

                    preds = model.generate(
                        input_ids=batch['input_ids'],
                        attention_mask=batch['attention_mask'],
                        max_length=MAX_TARGET_LENGTH,
                        num_beams=4,
                        length_penalty=1.2  # encourages longer outputs
                    )

                decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
                decoded_labels = tokenizer.batch_decode(batch['labels'], skip_special_tokens=True)

                for pred, label in zip(decoded_preds, decoded_labels):
                    scores = scorer.score(label, pred)
                    for k in rouge_accum:
                        rouge_accum[k].append(scores[k].fmeasure)

        # --- Epoch Summary ---
        # max(..., 1) avoids a ZeroDivisionError when a dataloader is empty.
        print(f"\nEpoch {epoch+1} Summary:")
        print(f"Train Loss: {epoch_loss / max(len(train_dataloader), 1):.4f}")
        print(f"Val Loss: {val_loss / max(len(val_dataloader), 1):.4f}")
        print("ROUGE Scores:")
        for k, scores in rouge_accum.items():
            print(f"- {k.upper()}: {np.mean(scores):.4f}" if scores else f"- {k.upper()}: N/A")
        print("="*50)
        torch.cuda.empty_cache()

    return model


In [None]:
import pandas as pd
import torch
from torch.utils.data import DataLoader
from datasets import Dataset

#test dataset preparation

def prepare_test_dataframe(kpi_path, label_path, id_column='ID'):
    """Join the test KPIs with a comma-separated list of flagged problems per ID.

    The label file is expected to hold one indicator column per problem next
    to ``id_column``; problems whose indicator equals 1 are joined into a
    ``network_labels`` string. IDs without any flagged problem get
    ``'normal'``, and all other missing values become ``"unknown"``.

    Args:
        kpi_path: CSV with the test KPIs.
        label_path: CSV with the per-problem indicator columns.
        id_column: Name of the key column shared by both files.

    Returns:
        The merged dataframe.
    """
    labels_wide = pd.read_csv(label_path)
    kpis = pd.read_csv(kpi_path)

    # Wide -> long: one (id, problem, present) row per indicator cell.
    labels_long = labels_wide.melt(
        id_vars=[id_column], var_name='problem', value_name='present'
    )

    # Keep only flagged problems and collapse them to one string per ID.
    flagged = labels_long.loc[labels_long['present'] == 1]
    problems_per_id = (
        flagged.groupby(id_column)['problem']
        .agg(','.join)
        .reset_index()
    )

    merged = (
        kpis.merge(problems_per_id, how='left', on=id_column)
        .rename(columns={'problem': 'network_labels'})
    )
    merged['network_labels'] = merged['network_labels'].fillna('normal')
    merged = merged.fillna("unknown")

    return merged

def create_prompt_column(df):
    """Add the ``prompt`` column to the inference dataframe.

    Delegates to ``create_prompts`` so that inference inputs use the same
    instruction template the model is fine-tuned on, rather than a separate
    ``"col: value; ..."`` format the model never saw during training.

    Args:
        df: Inference dataframe; modified in place.

    Returns:
        The same dataframe with the ``prompt`` column set.
    """
    return create_prompts(df)

def tokenize_dataset(df, tokenizer, max_length=None):
    """Tokenize the ``prompt`` column of ``df`` for inference.

    Args:
        df: Dataframe containing a ``prompt`` column.
        tokenizer: Hugging Face tokenizer applied to the prompts.
        max_length: Length prompts are padded/truncated to. Defaults to
            ``MAX_INPUT_LENGTH`` so inference matches the training setup.

    Returns:
        A ``datasets.Dataset`` exposing ``input_ids`` and ``attention_mask``
        as torch tensors.
    """
    if max_length is None:
        # Resolved at call time so the shared constant stays the single source of truth.
        max_length = MAX_INPUT_LENGTH

    dataset = Dataset.from_pandas(df[["prompt"]])

    def tokenize_fn(examples):
        return tokenizer(
            examples["prompt"],
            padding="max_length",
            max_length=max_length,
            truncation=True
        )

    tokenized = dataset.map(tokenize_fn, batched=True, remove_columns=["prompt"])
    tokenized.set_format(type="torch", columns=["input_ids", "attention_mask"])
    return tokenized

def run_inference(tokenized_dataset, model, tokenizer, batch_size=8):
    """Generate one decoded text per example of ``tokenized_dataset``.

    The model is moved to CUDA when available (CPU otherwise), switched to
    eval mode, and run with 4-beam search limited to 128 tokens.

    Args:
        tokenized_dataset: Dataset yielding ``input_ids`` and ``attention_mask`` tensors.
        model: Model exposing ``to``, ``eval`` and ``generate``.
        tokenizer: Tokenizer exposing ``batch_decode``.
        batch_size: Number of examples per generation batch.

    Returns:
        List of decoded strings in dataset order.
    """
    target = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(target)
    model.eval()

    loader = DataLoader(tokenized_dataset, batch_size=batch_size, shuffle=False)
    generated_texts = []

    with torch.no_grad():
        for chunk in loader:
            token_ids = model.generate(
                input_ids=chunk["input_ids"].to(target),
                attention_mask=chunk["attention_mask"].to(target),
                max_length=128,
                num_beams=4,
                early_stopping=True,
            )
            generated_texts += tokenizer.batch_decode(token_ids, skip_special_tokens=True)

    return generated_texts

def save_predictions(df, predictions, output_path, id_column="ID"):
    """Attach predictions to ``df`` and write an ID/prediction CSV.

    Args:
        df: Dataframe to annotate; gains a ``predicted_improvement_solutions`` column.
        predictions: One prediction per row of ``df``.
        output_path: Destination of the CSV file (written without the index).
        id_column: Name of the identifier column to export.

    Raises:
        KeyError: If ``id_column`` is not a column of ``df``.
    """
    prediction_column = "predicted_improvement_solutions"
    df[prediction_column] = predictions

    if id_column not in df.columns:
        raise KeyError(f"The column '{id_column}' does not exist in the dataframe.")

    submission = df[[id_column, prediction_column]]
    submission.to_csv(output_path, index=False)


In [None]:
# Main Execution
# Main Execution: fine-tune on Train.csv, save the model, then predict on Test.csv.
if __name__ == "__main__":
    # Seed torch so the shuffled training order is repeatable across runs.
    torch.manual_seed(42)

    # Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load and preprocess data
    train_path = "/kaggle/input/indabax-tunisia-2025-anomaly-solver-challenge-2/Train.csv"
    df = load_and_preprocess_data(train_path)
    df = create_prompts(df)

    # Split data
    train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

    # The dataloaders must exist before initialize_model_and_optimizer() is
    # called, because it sizes the LR schedule from the global
    # `train_dataloader`. Tokenization only needs the tokenizer, so load it first.
    tokenizer = BartTokenizer.from_pretrained(MODEL_NAME)

    # Prepare datasets
    tokenized_train, tokenized_val = prepare_datasets(train_df, val_df, tokenizer)

    # Create dataloaders
    train_dataloader = DataLoader(tokenized_train, batch_size=BATCH_SIZE, shuffle=True)
    val_dataloader = DataLoader(tokenized_val, batch_size=BATCH_SIZE)

    # Initialize model, optimizer and scheduler (returns an equivalent tokenizer)
    model, tokenizer, optimizer, lr_scheduler = initialize_model_and_optimizer()
    model.to(device)

    # Train model
    model = train_model(model, train_dataloader, val_dataloader, optimizer, lr_scheduler, tokenizer, device)

    # Save model
    model.save_pretrained("bart_solver")
    tokenizer.save_pretrained("bart_solver")

    # Run the fine-tuned model on the test dataset
    kpi_path = "/kaggle/input/indabax-tunisia-2025-anomaly-solver-challenge-2/Test.csv"
    predicted_label_path = "/kaggle/input/indaba-test/submission_indabax.csv"
    output_path = "test_predictions.csv"

    test_df = prepare_test_dataframe(kpi_path, predicted_label_path)
    test_df = create_prompt_column(test_df)
    tokenized_test = tokenize_dataset(test_df, tokenizer)
    predictions = run_inference(tokenized_test, model, tokenizer)
    save_predictions(test_df, predictions, output_path)