# NBA Scouting Report Generator - Fine-Tuning

This notebook walks through the process of fine-tuning a language model to generate NBA scouting reports from player statistics.

In [None]:
# Install required packages
!pip install pandas numpy matplotlib seaborn scikit-learn torch transformers datasets nltk rouge-score tqdm

In [None]:
# Import dependencies
import os
import json
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from transformers import (
    GPT2Tokenizer, GPT2LMHeadModel,
    Trainer, TrainingArguments,
    EarlyStoppingCallback,
    AutoModelForCausalLM, AutoTokenizer
)
from datasets import load_metric
import nltk
from nltk.translate.bleu_score import sentence_bleu
import random
import logging
from rouge_score import rouge_scorer
from tqdm.notebook import tqdm

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)

# Download necessary NLTK data.
# `nltk.word_tokenize` (used by the evaluation cells) loads the "punkt" resource on
# older NLTK releases and "punkt_tab" on newer ones, so request both.
for _nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(_nltk_resource)

## 1. Dataset Preparation

First, we'll create a custom dataset class for handling NBA player stats and scouting reports.

In [None]:
class ScoutingReportDataset(Dataset):
    """
    Dataset class for NBA player stats and scouting reports.

    Every item is tokenized as ``prompt + report + EOS`` and padded/truncated to
    ``max_length``. The labels are a copy of the input ids in which the prompt
    tokens and all padding positions are replaced by -100, so the loss only
    covers the report and its terminating EOS token.
    """

    # Label value that is excluded from the loss calculation
    IGNORE_INDEX = -100

    def __init__(self, data_file, tokenizer, max_length=512):
        """
        Initialize the dataset.
        
        Args:
            data_file (str): Path to the JSON file containing the dataset
            tokenizer: Hugging Face tokenizer
            max_length (int): Maximum sequence length
        """
        self.tokenizer = tokenizer
        self.max_length = max_length
        
        # Load data
        with open(data_file, 'r', encoding='utf-8') as f:
            self.data = json.load(f)
        
        logger.info(f"Loaded {len(self.data)} examples from {data_file}")
    
    def __len__(self):
        """Return the number of examples loaded from the data file."""
        return len(self.data)

    @staticmethod
    def _build_prompt(item):
        """Render the player-stats prompt that precedes the scouting report."""
        prompt = f"Generate a comprehensive NBA scouting report for the following player stats:\n\n"
        prompt += f"Name: {item['player_name']}\n"
        prompt += f"Position: {item['position']}\n"
        prompt += f"Age: {item['age']}\n"
        prompt += f"Height: {item['height']}\n"
        prompt += f"Weight: {item['weight']} lbs\n"
        prompt += f"Team: {item['team']}\n"
        prompt += f"Season Stats: {item['season_stats']}\n\n"
        prompt += f"Scouting Report:"
        return prompt
    
    def __getitem__(self, idx):
        """
        Build one training example.

        Args:
            idx (int): Index of the example in the loaded data

        Returns:
            dict: ``input_ids``, ``attention_mask`` and ``labels`` tensors of length
            ``max_length``, plus the raw ``prompt`` and ``report`` strings.
        """
        item = self.data[idx]
        
        # Create prompt with player stats
        prompt = self._build_prompt(item)

        # Terminate the target with EOS so the model still learns where a report
        # ends once padding is removed from the loss below.
        eos_token = getattr(self.tokenizer, "eos_token", None) or ""
        target_text = item['scouting_report'] + eos_token
        
        # Tokenize prompt and target together
        encodings = self.tokenizer(
            prompt, 
            target_text, 
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt"
        )
        
        input_ids = encodings.input_ids[0]
        attention_mask = encodings.attention_mask[0]
        
        # Find where the prompt ends and response begins
        # NOTE(review): slicing from position 0 assumes the tokenizer pads on the right - confirm
        prompt_encodings = self.tokenizer(prompt, truncation=True, max_length=self.max_length)
        prompt_length = len(prompt_encodings.input_ids)
        
        # Only the report tokens are training targets: mask the prompt and every
        # padding position (pad shares its id with EOS, so the attention mask is
        # the only reliable way to tell padding from the real EOS).
        labels = input_ids.clone()
        labels[:prompt_length] = self.IGNORE_INDEX
        labels[attention_mask == 0] = self.IGNORE_INDEX
        
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
            "prompt": prompt,
            "report": item['scouting_report']
        }

Now, let's define functions to prepare and split our dataset.

In [None]:
def _json_safe(value):
    """Convert a NumPy scalar to the equivalent plain Python value so json.dump accepts it."""
    return value.item() if isinstance(value, np.generic) else value


def prepare_dataset(stats_file, reports_file, output_file):
    """
    Prepare dataset by combining player stats and scouting reports.

    The two CSV files are inner-joined on ``player_id``; each joined row becomes
    one JSON record holding the player's profile, a formatted ``season_stats``
    string and the ``scouting_report`` text.
    
    Args:
        stats_file (str): Path to the CSV file containing player stats
        reports_file (str): Path to the CSV file containing scouting reports
        output_file (str): Path to save the combined dataset

    Returns:
        list[dict]: The records that were written to ``output_file``
    """
    # Load player stats
    player_stats = pd.read_csv(stats_file)
    
    # Load scouting reports
    scouting_reports = pd.read_csv(reports_file)
    
    # Merge the dataframes on player ID/name
    combined_data = pd.merge(
        player_stats, 
        scouting_reports, 
        on='player_id',  # Adjust based on your actual column names
        how='inner'
    )
    
    print(f"Combined dataset has {len(combined_data)} rows")
    print(f"Sample columns: {combined_data.columns.tolist()[:10]}...")
    
    # Columns copied verbatim from the merged row into each record
    profile_columns = ['player_id', 'player_name', 'position', 'age', 'height', 'weight', 'team']

    # Convert to the format needed for fine-tuning
    dataset = []
    for _, row in combined_data.iterrows():
        # Format season stats as a string
        season_stats = (
            f"PPG: {row['pts']}, RPG: {row['reb']}, "
            f"APG: {row['ast']}, SPG: {row['stl']}, "
            f"BPG: {row['blk']}, FG%: {row['fg_pct']}, "
            f"3P%: {row['fg3_pct']}, FT%: {row['ft_pct']}"
        )
        
        record = {column: _json_safe(row[column]) for column in profile_columns}
        record['season_stats'] = season_stats
        record['scouting_report'] = _json_safe(row['scouting_report'])
        dataset.append(record)
    
    # Save the processed dataset. A bare filename has an empty dirname, and
    # os.makedirs('') raises, so only create the parent when there is one.
    output_parent = os.path.dirname(output_file)
    if output_parent:
        os.makedirs(output_parent, exist_ok=True)
    with open(output_file, 'w') as f:
        json.dump(dataset, f, indent=2)
    
    print(f"Processed dataset saved to {output_file}")
    return dataset

def split_dataset(data_file, output_dir, test_size=0.15, val_size=0.15):
    """
    Partition a JSON dataset into train / validation / test files.

    The records are first split into a training part and a hold-out part of
    relative size ``test_size + val_size``; the hold-out part is then divided
    into validation and test. Both splits use ``random_state=42``. The three
    parts are written to ``train.json``, ``val.json`` and ``test.json`` inside
    ``output_dir``.
    
    Args:
        data_file (str): Path to the JSON file containing the dataset
        output_dir (str): Directory to save the split datasets
        test_size (float): Proportion of data to use for testing
        val_size (float): Proportion of data to use for validation
    """
    with open(data_file, 'r') as source:
        records = json.load(source)

    # First cut: training data versus everything held out
    holdout_fraction = test_size + val_size
    train_data, holdout = train_test_split(records, test_size=holdout_fraction, random_state=42)

    # Second cut: share of the hold-out part that becomes the test set
    val_data, test_data = train_test_split(
        holdout,
        test_size=test_size / holdout_fraction,
        random_state=42
    )

    os.makedirs(output_dir, exist_ok=True)

    named_splits = (
        ('train.json', train_data),
        ('val.json', val_data),
        ('test.json', test_data),
    )
    for filename, subset in named_splits:
        with open(os.path.join(output_dir, filename), 'w') as sink:
            json.dump(subset, sink, indent=2)

    print(f"Dataset split into {len(train_data)} training, {len(val_data)} validation, and {len(test_data)} test examples")

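Let's run the data preparation steps. You'll need to provide your own data files or generate synthetic data for testing: `data/player_stats.csv` must contain the columns `player_id`, `player_name`, `position`, `age`, `height`, `weight`, `team`, `pts`, `reb`, `ast`, `stl`, `blk`, `fg_pct`, `fg3_pct` and `ft_pct`, and `data/scouting_reports.csv` must contain `player_id` and `scouting_report`.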

In [None]:
# Working directory for every data artifact
data_dir = "data"
os.makedirs(data_dir, exist_ok=True)

# Raw inputs and the combined dataset all live inside the data directory
stats_file, reports_file, combined_file = (
    os.path.join(data_dir, name)
    for name in ("player_stats.csv", "scouting_reports.csv", "combined_dataset.json")
)

# The pipeline only runs when both raw CSV files are present
if not (os.path.exists(stats_file) and os.path.exists(reports_file)):
    print(f"Please place your player stats CSV at {stats_file} and scouting reports at {reports_file}")
    print("For the assignment, you'll need to collect or generate this data.")
else:
    # Merge the two CSVs into one JSON file, then cut it into train/val/test
    prepare_dataset(stats_file, reports_file, combined_file)
    split_dataset(combined_file, data_dir)

## 2. Model Selection and Fine-Tuning

In [None]:
def select_model(model_name="gpt2"):
    """
    Load a pre-trained causal language model together with its tokenizer.

    The tokenizer's padding token is set to its end-of-sequence token before
    it is returned.
    
    Args:
        model_name (str): Name of the pre-trained model to use
        
    Returns:
        tuple: (model, tokenizer)
    """
    print(f"Loading model: {model_name}")

    # Tokenizer first, reusing EOS as the padding token
    loaded_tokenizer = AutoTokenizer.from_pretrained(model_name)
    loaded_tokenizer.pad_token = loaded_tokenizer.eos_token

    # Then the model weights
    return AutoModelForCausalLM.from_pretrained(model_name), loaded_tokenizer

def fine_tune_model(
    model, tokenizer, 
    train_file, val_file, 
    output_dir, 
    learning_rate=5e-5,
    batch_size=8,
    num_epochs=3,
    warmup_steps=500
):
    """
    Fine-tune the model on the training data.

    Training evaluates and checkpoints once per epoch, reloads the checkpoint
    with the lowest ``eval_loss`` at the end, and stops early after 3
    evaluations without improvement. The resulting model and tokenizer are
    saved to ``<output_dir>/final_model``.
    
    Args:
        model: Pre-trained model
        tokenizer: Tokenizer for the model
        train_file (str): Path to the training data
        val_file (str): Path to the validation data
        output_dir (str): Directory to save the fine-tuned model
        learning_rate (float): Learning rate for fine-tuning
        batch_size (int): Batch size for training
        num_epochs (int): Number of training epochs
        warmup_steps (int): Number of warmup steps

    Returns:
        tuple: (model, tokenizer) - the same objects that were passed in
    """
    # Local import keeps this function self-contained
    import inspect

    # Create datasets
    train_dataset = ScoutingReportDataset(train_file, tokenizer)
    val_dataset = ScoutingReportDataset(val_file, tokenizer)

    # Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
    # and later dropped the old name; use whichever the installed version accepts.
    accepted_args = inspect.signature(TrainingArguments.__init__).parameters
    eval_strategy_key = (
        "eval_strategy" if "eval_strategy" in accepted_args else "evaluation_strategy"
    )
    
    # Define training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=num_epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        warmup_steps=warmup_steps,
        learning_rate=learning_rate,
        weight_decay=0.01,
        logging_dir=os.path.join(output_dir, 'logs'),
        logging_steps=100,
        save_strategy="epoch",
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        fp16=torch.cuda.is_available(),  # Use mixed precision if GPU available
        **{eval_strategy_key: "epoch"},
    )
    
    # Create trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )
    
    # Train the model
    print("Starting fine-tuning")
    trainer.train()
    
    # Save the fine-tuned model
    model_save_path = os.path.join(output_dir, 'final_model')
    model.save_pretrained(model_save_path)
    tokenizer.save_pretrained(model_save_path)
    
    print(f"Fine-tuned model saved to {model_save_path}")
    
    return model, tokenizer

## 3. Hyperparameter Optimization

In [None]:
def hyperparameter_optimization(
    model_name, 
    train_file, 
    val_file, 
    output_dir,
    configs=None
):
    """
    Perform hyperparameter optimization to find the best training configuration.

    For every configuration a fresh model is loaded and fine-tuned in its own
    ``config_<n>`` subdirectory, then reloaded from disk and evaluated on the
    validation set. Results are written to ``hyperparameter_results.json`` and
    plotted to ``hyperparameter_results.png`` inside ``output_dir``.
    
    Args:
        model_name (str): Name of the pre-trained model to use
        train_file (str): Path to the training data
        val_file (str): Path to the validation data
        output_dir (str): Directory to save the models and results
        configs (list): List of hyperparameter configurations to try; each is a
            dict with "learning_rate", "batch_size" and "num_epochs". When
            omitted, three built-in configurations are used.

    Returns:
        tuple: (best_config_id, best_config) where best_config holds the
        "config", "eval_loss" and "perplexity" of the lowest-loss configuration

    Raises:
        ValueError: If ``configs`` is empty
    """
    # Build the default list per call instead of sharing one mutable default
    if configs is None:
        configs = [
            {"learning_rate": 5e-5, "batch_size": 4, "num_epochs": 3},
            {"learning_rate": 2e-5, "batch_size": 8, "num_epochs": 5},
            {"learning_rate": 1e-5, "batch_size": 16, "num_epochs": 10}
        ]
    if not configs:
        raise ValueError("configs must contain at least one hyperparameter configuration")

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    # Results dictionary
    results = {}
    
    # Try each configuration
    for i, config in enumerate(configs, start=1):
        config_id = f"config_{i}"
        print(f"Testing configuration {i}/{len(configs)}: {config}")
        
        # Create subdirectory for this configuration
        config_dir = os.path.join(output_dir, config_id)
        os.makedirs(config_dir, exist_ok=True)
        
        # Load model
        model, tokenizer = select_model(model_name)
        
        # Fine-tune with this configuration
        fine_tune_model(
            model, tokenizer,
            train_file, val_file,
            config_dir,
            learning_rate=config["learning_rate"],
            batch_size=config["batch_size"],
            num_epochs=config["num_epochs"]
        )
        
        # Load fine-tuned model and evaluate on validation set
        final_model_path = os.path.join(config_dir, 'final_model')
        model = AutoModelForCausalLM.from_pretrained(final_model_path)
        tokenizer = AutoTokenizer.from_pretrained(final_model_path)
        
        # Create validation dataset
        val_dataset = ScoutingReportDataset(val_file, tokenizer)
        
        # Create trainer
        trainer = Trainer(
            model=model,
            args=TrainingArguments(
                output_dir=config_dir,
                per_device_eval_batch_size=8
            ),
            eval_dataset=val_dataset
        )
        
        # Evaluate
        eval_loss = trainer.evaluate()["eval_loss"]
        
        # Store results (perplexity as a plain float so it serializes cleanly)
        results[config_id] = {
            "config": config,
            "eval_loss": eval_loss,
            "perplexity": float(np.exp(eval_loss))
        }
        
        print(f"Configuration {i} results: {results[config_id]}")
    
    # Find best configuration
    best_config_id = min(results, key=lambda k: results[k]["eval_loss"])
    best_config = results[best_config_id]
    
    print(f"Best configuration: {best_config_id}")
    print(f"Config details: {best_config['config']}")
    print(f"Eval loss: {best_config['eval_loss']}")
    print(f"Perplexity: {best_config['perplexity']}")
    
    # Save results
    with open(os.path.join(output_dir, 'hyperparameter_results.json'), 'w') as f:
        json.dump(results, f, indent=2)
    
    # Plot results (a single figure with one panel per metric)
    config_ids = list(results.keys())
    losses = [results[c]["eval_loss"] for c in config_ids]
    perplexities = [results[c]["perplexity"] for c in config_ids]
    
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
    
    ax1.bar(config_ids, losses)
    ax1.set_xlabel('Configuration')
    ax1.set_ylabel('Validation Loss')
    ax1.set_title('Validation Loss by Configuration')
    
    ax2.bar(config_ids, perplexities)
    ax2.set_xlabel('Configuration')
    ax2.set_ylabel('Perplexity')
    ax2.set_title('Perplexity by Configuration')
    
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'hyperparameter_results.png'))
    plt.show()
    plt.close(fig)  # release the figure once it has been saved and shown
    
    return best_config_id, best_config

Let's set up the model and run hyperparameter optimization if we have data available.

In [None]:
# Directory layout for trained models and the hyperparameter search
model_dir = "models"
os.makedirs(model_dir, exist_ok=True)
hyperparam_dir = os.path.join(model_dir, "hyperparameter_search")

# Splits produced by the data preparation step
train_file = os.path.join(data_dir, "train.json")
val_file = os.path.join(data_dir, "val.json")

if not (os.path.exists(train_file) and os.path.exists(val_file)):
    print(f"Training data not found at {train_file} or {val_file}")
    print("Please complete the data preparation steps first.")
else:
    # A deliberately small search to keep the notebook fast;
    # a real project would cover more configurations.
    simplified_configs = [
        {"learning_rate": 5e-5, "batch_size": 4, "num_epochs": 2},  # Reduced epochs for demo
        {"learning_rate": 2e-5, "batch_size": 8, "num_epochs": 3}
    ]

    # "gpt2" can be swapped for e.g. "distilgpt2" for faster training
    best_config_id, best_config = hyperparameter_optimization(
        "gpt2",
        train_file,
        val_file,
        hyperparam_dir,
        configs=simplified_configs
    )

    # Persist the winner so the final-training cell can pick it up
    with open(os.path.join(model_dir, "best_config.json"), 'w') as f:
        json.dump(
            {"best_config_id": best_config_id, "config_details": best_config},
            f,
            indent=2
        )

## 4. Final Model Training

Now, let's train our final model using the best hyperparameters we found.

In [None]:
# Check if we have identified a best configuration
best_config_file = os.path.join(model_dir, "best_config.json")

if os.path.exists(best_config_file):
    # Load best config
    with open(best_config_file, 'r') as f:
        best_config_info = json.load(f)
    
    best_config_id = best_config_info["best_config_id"]
    best_config = best_config_info["config_details"]
    
    print(f"Training final model with best configuration: {best_config['config']}")
    
    # Set up final model directory
    final_model_dir = os.path.join(model_dir, "final_model")
    os.makedirs(final_model_dir, exist_ok=True)
    
    # Load fresh model
    model, tokenizer = select_model("gpt2")
    
    # Train on full training set with best hyperparameters
    model, tokenizer = fine_tune_model(
        model, tokenizer,
        train_file, val_file,
        final_model_dir,
        learning_rate=best_config["config"]["learning_rate"],
        batch_size=best_config["config"]["batch_size"],
        num_epochs=best_config["config"]["num_epochs"]
    )

    # fine_tune_model saves its result to <output_dir>/final_model, but the
    # evaluation, comparison and Streamlit cells load from final_model_dir
    # itself. Save there too so those loads find the weights and tokenizer.
    model.save_pretrained(final_model_dir)
    tokenizer.save_pretrained(final_model_dir)
    
    print("Final model training complete!")
else:
    print("No best configuration found. Please run hyperparameter optimization first.")

## 5. Model Evaluation

In [None]:
def generate_report(model, tokenizer, player_stats, max_length=1024):
    """
    Generate a scouting report for a player based on their stats.

    The prompt is built from ``player_stats``, the model samples a continuation
    (temperature 0.7, top-k 50, top-p 0.9), and only the newly generated tokens
    are decoded and returned.
    
    Args:
        model: Fine-tuned model
        tokenizer: Tokenizer for the model
        player_stats (dict): Dictionary containing player statistics; must provide
            'player_name', 'position', 'age', 'height', 'weight', 'team' and
            'season_stats'
        max_length (int): Maximum length of generated text, in tokens, including
            the prompt
        
    Returns:
        str: Generated scouting report
    """
    # Create prompt
    prompt = (
        "Generate a comprehensive NBA scouting report for the following player stats:\n\n"
        f"Name: {player_stats['player_name']}\n"
        f"Position: {player_stats['position']}\n"
        f"Age: {player_stats['age']}\n"
        f"Height: {player_stats['height']}\n"
        f"Weight: {player_stats['weight']} lbs\n"
        f"Team: {player_stats['team']}\n"
        f"Season Stats: {player_stats['season_stats']}\n\n"
        "Scouting Report:"
    )
    
    # Tokenize prompt
    inputs = tokenizer(prompt, return_tensors="pt")
    
    # Move to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()  # make sure dropout is off even if the model was just trained
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Number of prompt tokens, used below to cut the prompt off the output
    prompt_token_count = inputs["input_ids"].shape[1]
    
    # Generate report
    with torch.no_grad():
        output_sequences = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs.get("attention_mask"),
            max_length=max_length,
            temperature=0.7,
            top_k=50,
            top_p=0.9,
            do_sample=True,
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id
        )
    
    # Decode only the tokens generated after the prompt. Slicing the decoded
    # text by len(prompt) is unreliable because decoding is not guaranteed to
    # reproduce the prompt character-for-character.
    generated_tokens = output_sequences[0][prompt_token_count:]
    report = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    
    return report.strip()

def evaluate_model(model, tokenizer, test_file, output_file):
    """
    Evaluate the fine-tuned model on the test set.

    A report is generated for every test example and compared with the
    reference report using ROUGE-1/2/L F-measures and smoothed sentence-level
    BLEU. Per-example and averaged scores are written to ``output_file``, and
    a bar chart of the averages is saved next to it as
    ``evaluation_metrics.png``.
    
    Args:
        model: Fine-tuned model
        tokenizer: Tokenizer for the model
        test_file (str): Path to the test data
        output_file (str): Path to save the evaluation results

    Returns:
        tuple: (results, avg_results) - the list of per-example result dicts and
        the dict of average scores
    """
    # Load test data
    with open(test_file, 'r') as f:
        test_data = json.load(f)
    
    # Set up metrics
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    # Without smoothing, sentence-level BLEU collapses towards zero as soon as
    # one n-gram order has no overlap, which makes the metric uninformative.
    bleu_smoothing = nltk.translate.bleu_score.SmoothingFunction().method1
    
    results = []
    
    # Evaluate each example
    for item in tqdm(test_data, desc="Evaluating model"):
        # Generate report
        generated_report = generate_report(model, tokenizer, item)
        
        # Calculate ROUGE scores
        rouge_scores = scorer.score(item['scouting_report'], generated_report)
        
        # Calculate BLEU score
        reference = nltk.word_tokenize(item['scouting_report'].lower())
        candidate = nltk.word_tokenize(generated_report.lower())
        bleu = sentence_bleu([reference], candidate, smoothing_function=bleu_smoothing)
        
        # Store results
        results.append({
            'player_name': item['player_name'],
            'position': item['position'],
            'actual_report': item['scouting_report'],
            'generated_report': generated_report,
            'rouge1': rouge_scores['rouge1'].fmeasure,
            'rouge2': rouge_scores['rouge2'].fmeasure,
            'rougeL': rouge_scores['rougeL'].fmeasure,
            'bleu': bleu
        })
    
    # Calculate average scores (plain floats so they serialize cleanly)
    avg_results = {
        f'avg_{metric}': float(np.mean([r[metric] for r in results]))
        for metric in ('rouge1', 'rouge2', 'rougeL', 'bleu')
    }
    
    # Save results. A bare filename has an empty dirname, and os.makedirs('')
    # raises, so only create the parent when there is one.
    output_parent = os.path.dirname(output_file)
    if output_parent:
        os.makedirs(output_parent, exist_ok=True)
    with open(output_file, 'w') as f:
        json.dump({
            'individual_results': results,
            'average_scores': avg_results
        }, f, indent=2)
    
    print("Evaluation completed")
    print(f"Average ROUGE-1: {avg_results['avg_rouge1']:.4f}")
    print(f"Average ROUGE-2: {avg_results['avg_rouge2']:.4f}")
    print(f"Average ROUGE-L: {avg_results['avg_rougeL']:.4f}")
    print(f"Average BLEU: {avg_results['avg_bleu']:.4f}")
    
    # Plot results
    plt.figure(figsize=(12, 8))
    
    metrics = ['ROUGE-1', 'ROUGE-2', 'ROUGE-L', 'BLEU']
    scores = [
        avg_results['avg_rouge1'],
        avg_results['avg_rouge2'],
        avg_results['avg_rougeL'],
        avg_results['avg_bleu']
    ]
    
    plt.bar(metrics, scores)
    plt.ylim(0, 1)
    plt.xlabel('Metric')
    plt.ylabel('Score')
    plt.title('Model Evaluation Metrics')
    
    # Print each value just above its bar
    for i, v in enumerate(scores):
        plt.text(i, v + 0.05, f'{v:.4f}', ha='center')
    
    plt.savefig(os.path.join(output_parent, 'evaluation_metrics.png'))
    plt.show()
    
    return results, avg_results

Now, let's evaluate our final model on the test set.

In [None]:
# Where evaluation artifacts are written
results_dir = "results"
os.makedirs(results_dir, exist_ok=True)

# Inputs required for evaluation: the trained model and the held-out test split
final_model_dir = os.path.join(model_dir, "final_model")
test_file = os.path.join(data_dir, "test.json")

if not (os.path.exists(final_model_dir) and os.path.exists(test_file)):
    print(f"Final model not found at {final_model_dir} or test data not found at {test_file}")
    print("Please complete the model training steps first.")
else:
    # Reload the fine-tuned model and tokenizer from disk
    model = AutoModelForCausalLM.from_pretrained(final_model_dir)
    tokenizer = AutoTokenizer.from_pretrained(final_model_dir)

    # Score the model on the test set and keep both detailed and averaged results
    results, avg_scores = evaluate_model(
        model=model,
        tokenizer=tokenizer,
        test_file=test_file,
        output_file=os.path.join(results_dir, "evaluation_results.json")
    )

## 6. Compare with Baseline

Let's compare our fine-tuned model with the base model (without fine-tuning).

In [None]:
def compare_with_baseline(base_model_name, finetuned_model_dir, test_file, output_file,
                          sample_size=5, seed=None):
    """
    Compare the fine-tuned model with the base model.

    Both models generate a report for a random sample of test examples. Each
    report is scored against the reference with ROUGE-1/2/L F-measures and
    smoothed sentence-level BLEU. Individual results, averages and relative
    improvements are written to ``output_file``; a bar chart is saved next to
    it as ``model_comparison.png``.
    
    Args:
        base_model_name (str): Name of the base pre-trained model
        finetuned_model_dir (str): Directory containing the fine-tuned model
        test_file (str): Path to the test data
        output_file (str): Path to save the comparison results
        sample_size (int): Maximum number of test examples to compare
        seed (int, optional): Seed for drawing the sample; when None the
            module-level ``random`` state is used

    Returns:
        tuple: (base_results, finetuned_results, improvements). ``improvements``
        maps each metric to the percentage change over the base model and is
        ``float('inf')`` where the base average is not positive.
    """
    # Load models
    base_model, base_tokenizer = select_model(base_model_name)
    
    finetuned_model = AutoModelForCausalLM.from_pretrained(finetuned_model_dir)
    finetuned_tokenizer = AutoTokenizer.from_pretrained(finetuned_model_dir)
    
    # Load test data
    with open(test_file, 'r') as f:
        test_data = json.load(f)
    
    # Sample a subset for comparison (to save time)
    sample_size = min(sample_size, len(test_data))
    sampler = random if seed is None else random.Random(seed)
    sample_data = sampler.sample(test_data, sample_size)
    
    # Set up metrics
    metric_names = ['rouge1', 'rouge2', 'rougeL', 'bleu']
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    # Without smoothing, sentence-level BLEU collapses towards zero as soon as
    # one n-gram order has no overlap, which also distorts the improvement ratio.
    bleu_smoothing = nltk.translate.bleu_score.SmoothingFunction().method1

    def score_report(item, report, reference_tokens):
        """Score one generated report against the item's reference report."""
        rouge = scorer.score(item['scouting_report'], report)
        candidate_tokens = nltk.word_tokenize(report.lower())
        return {
            'player_name': item['player_name'],
            'report': report,
            'rouge1': rouge['rouge1'].fmeasure,
            'rouge2': rouge['rouge2'].fmeasure,
            'rougeL': rouge['rougeL'].fmeasure,
            'bleu': sentence_bleu(
                [reference_tokens], candidate_tokens, smoothing_function=bleu_smoothing
            )
        }

    def average_scores(model_results):
        """Average every metric over a list of per-example results."""
        return {
            f'avg_{metric}': float(np.mean([r[metric] for r in model_results]))
            for metric in metric_names
        }
    
    base_results = []
    finetuned_results = []
    
    # Evaluate each model
    for item in tqdm(sample_data, desc="Comparing models"):
        reference = nltk.word_tokenize(item['scouting_report'].lower())

        base_report = generate_report(base_model, base_tokenizer, item)
        finetuned_report = generate_report(finetuned_model, finetuned_tokenizer, item)

        base_results.append(score_report(item, base_report, reference))
        finetuned_results.append(score_report(item, finetuned_report, reference))
    
    # Calculate average scores
    base_avg = average_scores(base_results)
    finetuned_avg = average_scores(finetuned_results)
    
    # Calculate improvement percentages
    improvements = {
        metric: ((finetuned_avg[f'avg_{metric}'] - base_avg[f'avg_{metric}']) / base_avg[f'avg_{metric}'] * 100) 
        if base_avg[f'avg_{metric}'] > 0 else float('inf')
        for metric in metric_names
    }

    # json.dump would emit the non-standard token `Infinity`; store undefined
    # improvements as null in the file (the returned dict keeps float('inf')).
    json_improvements = {
        metric: (value if np.isfinite(value) else None)
        for metric, value in improvements.items()
    }
    
    # Save results. A bare filename has an empty dirname, and os.makedirs('')
    # raises, so only create the parent when there is one.
    output_parent = os.path.dirname(output_file)
    if output_parent:
        os.makedirs(output_parent, exist_ok=True)
    with open(output_file, 'w') as f:
        json.dump({
            'base_model': {
                'individual_results': base_results,
                'average_scores': base_avg
            },
            'finetuned_model': {
                'individual_results': finetuned_results,
                'average_scores': finetuned_avg
            },
            'improvements': json_improvements
        }, f, indent=2)
    
    # Plot comparison (one figure, grouped bars per metric)
    metrics = ['ROUGE-1', 'ROUGE-2', 'ROUGE-L', 'BLEU']
    base_scores = [base_avg[f'avg_{metric}'] for metric in metric_names]
    finetuned_scores = [finetuned_avg[f'avg_{metric}'] for metric in metric_names]
    
    x = np.arange(len(metrics))
    width = 0.35
    
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.bar(x - width/2, base_scores, width, label='Base Model')
    ax.bar(x + width/2, finetuned_scores, width, label='Fine-tuned Model')
    
    ax.set_ylim(0, 1)
    ax.set_xlabel('Metric')
    ax.set_ylabel('Score')
    ax.set_title('Model Comparison: Base vs. Fine-tuned')
    ax.set_xticks(x)
    ax.set_xticklabels(metrics)
    ax.legend()
    
    # Print each value just above its bar
    for i, v in enumerate(base_scores):
        ax.text(i - width/2, v + 0.05, f'{v:.2f}', ha='center', va='bottom')
        
    for i, v in enumerate(finetuned_scores):
        ax.text(i + width/2, v + 0.05, f'{v:.2f}', ha='center', va='bottom')
    
    fig.tight_layout()
    plt.savefig(os.path.join(output_parent, 'model_comparison.png'))
    plt.show()
    
    print("Model comparison completed")
    print(f"Base model average scores: {base_avg}")
    print(f"Fine-tuned model average scores: {finetuned_avg}")
    print(f"Improvements: {improvements}")
    
    # Print a sample comparison
    if len(sample_data) > 0:
        print("\nSample Comparison:")
        print(f"Player: {sample_data[0]['player_name']}")
        print("\nActual Report:")
        print(sample_data[0]['scouting_report'])
        print("\nBase Model Report:")
        print(base_results[0]['report'])
        print("\nFine-tuned Model Report:")
        print(finetuned_results[0]['report'])
    
    return base_results, finetuned_results, improvements

Let's run the comparison.

In [None]:
# The comparison needs both the trained model and the held-out test split
if not (os.path.exists(final_model_dir) and os.path.exists(test_file)):
    print(f"Final model not found at {final_model_dir} or test data not found at {test_file}")
    print("Please complete the model training steps first.")
else:
    # Score the untouched "gpt2" checkpoint against the fine-tuned model
    base_results, finetuned_results, improvements = compare_with_baseline(
        base_model_name="gpt2",
        finetuned_model_dir=final_model_dir,
        test_file=test_file,
        output_file=os.path.join(results_dir, "model_comparison.json")
    )

## 7. Test with a Custom Example

Let's test our fine-tuned model with a custom player example.

In [None]:
# Hand-written player profile in the same shape as the dataset records
sample_player = dict(
    player_name="LeBron James",
    position="SF",
    age=37,
    height="6'9\"",
    weight=250,
    team="Los Angeles Lakers",
    season_stats="PPG: 30.3, RPG: 8.2, APG: 6.2, SPG: 1.3, BPG: 1.1, FG%: 0.524, 3P%: 0.359, FT%: 0.756",
)

# A report can only be generated once the final model directory exists
if not os.path.exists(final_model_dir):
    print(f"Final model not found at {final_model_dir}")
    print("Please complete the model training steps first.")
else:
    # Reload the fine-tuned model and tokenizer from disk
    model = AutoModelForCausalLM.from_pretrained(final_model_dir)
    tokenizer = AutoTokenizer.from_pretrained(final_model_dir)

    # Generate and display the report for the sample player
    print("Generating scouting report for LeBron James...")
    report = generate_report(model, tokenizer, sample_player)

    print("\nGenerated Scouting Report:")
    print(report)

## 8. Save Model for Streamlit App

Now, let's confirm that the final model directory exists and record its path in `model_path.txt` so the Streamlit app can find the model.

In [None]:
# Nothing to hand over to the app until the final model directory exists
if not os.path.exists(final_model_dir):
    print(f"Final model not found at {final_model_dir}")
    print("Please complete the model training steps first.")
else:
    # Tell the user where the model lives and how to launch the app
    print(f"Model is saved at {final_model_dir} and ready for use in the Streamlit app.")
    print("\nTo start the Streamlit app, run the following command in your terminal:")
    print("streamlit run app.py")

    # Record the model location in a plain text file the app can read
    with open("model_path.txt", "w") as f:
        f.write(final_model_dir)

    print("\nModel path saved to model_path.txt for the app to reference.")

## Summary

In this notebook, we've completed the entire fine-tuning pipeline for our NBA Scouting Report Generator:

1. Dataset preparation
2. Model selection
3. Hyperparameter optimization
4. Final model training
5. Model evaluation
6. Comparison with the baseline model
7. Testing with a custom example
8. Saving the model for the Streamlit app

The fine-tuned model is now ready to be used in the Streamlit application, which will provide a user-friendly interface for generating NBA scouting reports.