# Mistral-7B Fine-Tuning

This notebook implements fine-tuning of Mistral-7B for emission factor recommendations.

## Setup
1. In Google Colab, select **Runtime > Change runtime type** and choose a GPU hardware accelerator (T4 or better is recommended)
2. Run the cells in order, from top to bottom

In [None]:
# Check GPU availability: nvidia-smi reports the NVIDIA GPU(s) visible to this runtime.
# If no device is listed, switch the runtime type to GPU (see Setup) before continuing.
!nvidia-smi

## Install Dependencies

In [None]:
# Install core dependencies
!pip install -q transformers==4.36.2 datasets==2.16.1 peft==0.7.1 accelerate==0.25.0 bitsandbytes==0.41.3 trl==0.7.11 wandb==0.16.3
!pip install -q torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/cu118

# Install neo4j for database access (optional, used only if Neo4j data source is enabled)
!pip install -q neo4j==5.10.0

## Setup Environment

In [None]:
# Setup working environment for training
import os
import sys
import json

# Get the content directory - the root of all operations
content_dir = '/content'
os.chdir(content_dir)
print(f"Starting in directory: {os.getcwd()}")

# First, completely remove any existing Carbon-EF directories to prevent nesting
!rm -rf Carbon-EF
!find /content -name "Carbon-EF" -type d | xargs rm -rf

# Clone the repository to the content directory
!git clone https://github.com/Sbursu/Carbon-EF.git
os.chdir('Carbon-EF')
print(f"Now in directory: {os.getcwd()}")

# Create required directories
!mkdir -p training/logs
!mkdir -p training/data

# Create sample data files directly without using cell magic
print("\nCreating sample instruction data files...")


def _sample_record(instruction, output, regions, entity_types, difficulty, sources):
    """Build one instruction record with an empty "input" field.

    Keys are inserted in the order instruction, input, output, metadata
    (and regions, entity_types, difficulty, sources inside metadata), so any
    serialised form of the record keeps that order.
    """
    metadata = {
        "regions": regions,
        "entity_types": entity_types,
        "difficulty": difficulty,
        "sources": sources,
    }
    return {
        "instruction": instruction,
        "input": "",
        "output": output,
        "metadata": metadata,
    }


# Sample instruction data: five records, each with a different difficulty label
sample_instructions = [
    _sample_record(
        instruction="What is the emission factor for cement production in USA?",
        output="The emission factor for cement production in the USA is 0.92 kg CO2e/kg. This data is sourced from USEEIO_v2.1.",
        regions=["USA"],
        entity_types=["product"],
        difficulty="basic",
        sources=["USEEIO_v2.1"],
    ),
    _sample_record(
        instruction="Compare the emission factor for wheat production between France and the USA.",
        output="The emission factor for wheat production in France is 0.38 kg CO2e/kg, while in the USA it is 0.41 kg CO2e/kg. The USA has a slightly higher emission factor (by 7.9%). This data is sourced from Agribalyse_3.1 for France and USEEIO_v2.1 for the USA.",
        regions=["FR", "USA"],
        entity_types=["product"],
        difficulty="moderate",
        sources=["Agribalyse_3.1", "USEEIO_v2.1"],
    ),
    _sample_record(
        instruction="What are the emission factors for the top 3 contributors to agricultural emissions in Europe?",
        output="The top 3 contributors to agricultural emissions in Europe are: 1) Beef production at 22.9 kg CO2e/kg, 2) Dairy farming at 1.4 kg CO2e/kg of milk, and 3) Rice cultivation at 3.7 kg CO2e/kg. These figures are from EXIOBASE_3.8 and represent EU average values.",
        regions=["EU"],
        entity_types=["product", "sector"],
        difficulty="complex",
        sources=["EXIOBASE_3.8"],
    ),
    _sample_record(
        instruction="How should I adjust the global emission factor for rice production when using it in Thailand?",
        output="The global emission factor for rice production is 2.8 kg CO2e/kg, but for Thailand, you should apply a regional multiplier of 1.32, resulting in an adjusted factor of 3.7 kg CO2e/kg. This is due to specific cultivation practices and methane emissions from flooded rice fields in the region. Source: IPCC AR6 regional multipliers.",
        regions=["GLB", "THA"],
        entity_types=["product"],
        difficulty="methodological",
        sources=["IPCC_AR6"],
    ),
    _sample_record(
        instruction="What emission factor should I use for a new plant-based leather alternative?",
        output="For a new plant-based leather alternative, you can use 2.1 kg CO2e/kg as a proxy based on similar plant-based materials. This is significantly lower than animal leather (17.0 kg CO2e/kg). However, this is an estimate and should be used with a confidence factor of 0.6. Consider conducting a specific LCA study for your exact production process. Source: OpenLCA database.",
        regions=["GLB"],
        entity_types=["product"],
        difficulty="edge_case",
        sources=["OpenLCA"],
    ),
]

# Write sample data files
data_files = {
    "training/data/instructions_train.json": sample_instructions[:3],
    "training/data/instructions_val.json": sample_instructions[3:4],
    "training/data/instructions_test.json": sample_instructions[4:],
}

for file_path, data in data_files.items():
    print(f"Creating data file: {file_path}")
    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)

print(f"Working directory: {os.getcwd()}")
!ls -la training/data

# Add repository root to Python path
sys.path.append(os.getcwd())

# Check for any unexpected directories
!find /content -name "Carbon-EF" -type d | sort

## Import Modules

In [None]:
# Import necessary modules with error handling
try:
    import inspect
    
    # First import the data preparation module
    from training.scripts.data_preparation import load_and_prepare_data, format_instruction
    print("Successfully imported data_preparation module")
    
    # Check the function signature to see if use_neo4j is supported
    params = inspect.signature(load_and_prepare_data).parameters
    use_neo4j_supported = 'use_neo4j' in params
    print(f"use_neo4j parameter supported: {use_neo4j_supported}")
    
    # Now import the other modules
    from training.scripts.model_config import setup_model_and_tokenizer, get_training_config
    from training.scripts.training import setup_trainer, evaluate_model, save_model
    print("Successfully imported all required modules")
except ImportError as e:
    print(f"Import error: {e}")
    print("Python path: {}".format(sys.path))
    print("\nContents of training/scripts:")
    !ls -la training/scripts
    print("\nPlease check that all required packages and modules are installed")

## Prepare Training Data

In [None]:
# Expected location of each dataset split
data_files = {
    "train": "training/data/instructions_train.json",
    "val": "training/data/instructions_val.json",
    "test": "training/data/instructions_test.json"
}

# Report, per split, whether its file is present on disk
for split, file_path in data_files.items():
    print(
        f"Found {split} data: {file_path}"
        if os.path.exists(file_path)
        else f"Warning: {file_path} not found"
    )

# Load the datasets and format them for training
try:
    # Inspect the loader's signature first so we only pass use_neo4j when it is accepted
    import inspect
    params = inspect.signature(load_and_prepare_data).parameters
    use_neo4j_supported = 'use_neo4j' in params

    # Build the keyword arguments to match what the loader supports
    loader_kwargs = {"use_neo4j": False} if use_neo4j_supported else {}
    print(
        "Using load_and_prepare_data with use_neo4j=False"
        if use_neo4j_supported
        else "Using load_and_prepare_data without use_neo4j parameter"
    )
    train_data, val_data = load_and_prepare_data(**loader_kwargs)

    # Apply the instruction formatter to both datasets
    train_data = train_data.map(format_instruction)
    val_data = val_data.map(format_instruction)

    # Summarise dataset sizes (both objects are indexed by a "train" key here)
    print(f"Training examples: {len(train_data['train'])}")
    print(f"Validation examples: {len(val_data['train'])}")

    # Preview the first 300 characters of the first formatted example
    print("\nSample training example:")
    print(train_data["train"][0]["text"][:300] + "...")
except Exception as err:
    print(f"Error preparing data: {err}")
    import traceback
    traceback.print_exc()
    print("\nPlease check that the data files exist and are properly formatted")

## Initialize Model

In [None]:
# Set up the model, tokenizer, training configuration and trainer.
# Later cells use `model`, `tokenizer`, `config` and `trainer`, so a failure
# here is reported and then re-raised rather than letting "Run all" continue
# with some of those names missing.
try:
    model, tokenizer = setup_model_and_tokenizer()
    print("Model and tokenizer successfully initialized")

    # Get training configuration
    config = get_training_config()
    print("\nTraining configuration:")
    for key, value in config.items():
        print(f"  {key}: {value}")

    # Set up trainer
    trainer = setup_trainer(model, tokenizer, train_data, val_data, config)
    print("\nTrainer set up successfully")
except Exception as e:
    print(f"Error setting up model: {e}")
    print("Please check your GPU availability and memory")
    # Propagate so the full traceback is shown and execution stops at this cell
    raise

## Start Training

In [None]:
# Run training and save the resulting model.
# Any failure prints troubleshooting tips and is then re-raised: swallowing it
# would let the evaluation cells below run against a model that never
# finished training (and was never saved).
try:
    print("Starting training...")
    trainer.train()
    print("Training completed successfully!")

    # Save model
    save_model(model, tokenizer, config['output_dir'])
    print(f"Model saved to {config['output_dir']}/final_model")
except Exception as e:
    print(f"Error during training: {e}")
    print("\nTroubleshooting tips:")
    print("1. Check if you have enough VRAM (T4 or better GPU recommended)")
    print("2. Try reducing batch size or gradient accumulation steps")
    # Propagate so the full traceback is shown and execution stops at this cell
    raise

## Evaluate Model

In [None]:
# Call evaluate_model and print the query and response of every returned result
try:
    print("Running evaluation...")
    results = evaluate_model(model, tokenizer)

    # One block per result: blank line, query, response, blank line
    print("\nEvaluation results:")
    for entry in results:
        print(f"\nQuery: {entry['query']}")
        # end="\n\n" writes the response line followed by one empty separator line
        print(f"Response: {entry['response']}", end="\n\n")
except Exception as err:
    print(f"Error during evaluation: {err}")

## Test Your Own Queries

In [None]:
from training.scripts.training import generate_recommendation

# Edit this prompt to try a question of your own
query = "What is the emission factor for cement production in India?"
try:
    response = generate_recommendation(model, tokenizer, query)
    # Echo the prompt and the generated answer on consecutive lines
    print(f"Query: {query}", f"Response: {response}", sep="\n")
except Exception as err:
    print(f"Error generating recommendation: {err}")