# üá™üá¨ EgySentiment: Hybrid Training (Phase 2.5)

**Author:** AI Research Scientist  
**Goal:** Fine-tune `Llama-3-8b-Instruct` using a **Hybrid Dataset** to fix domain mismatch.  
**Environment:** Google Colab (Free Tier - T4 GPU)  

### 🚀 Strategy: Hybrid Training
1.  **Data Source A:** `Financial PhraseBank` (4800+ samples) -> Teaches general financial sentiment.
2.  **Data Source B:** 50% of `testing_data.jsonl` (Local Egyptian Data) -> Teaches Egyptian news style/length.
3.  **Test Set:** Remaining 50% of `testing_data.jsonl` -> Evaluates performance on unseen Egyptian news.

### 🔧 Hyperparameters (Tuned)
*   `max_steps`: **120** (Increased from 60 for better adaptation)
*   `warmup_steps`: **10**
*   `learning_rate`: **2e-4**
*   `r` (LoRA Rank): **16**

## 1. Setup & Installation

In [None]:
%%capture
# The %%capture cell magic above suppresses this cell's (long) pip output.
# Install Unsloth, Xformers (Flash Attention), and other deps
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# --no-deps installs these packages without pulling in their own dependencies
# (presumably so the versions resolved by the Unsloth install are not overridden — TODO confirm).
!pip install --no-deps xformers trl peft accelerate bitsandbytes
# Evaluation/plotting stack and the `datasets` loader imported in later cells.
!pip install scikit-learn matplotlib seaborn datasets

In [None]:
from unsloth import FastLanguageModel
import torch

# Maximum sequence length (in tokens) handed to the model loader here and
# reused when the trainer is built later in this notebook.
max_seq_length = 2048
# None is passed straight through to Unsloth; presumably this lets the library
# pick a dtype for the current GPU — TODO confirm against the Unsloth docs.
dtype = None
# Request 4-bit loading; the checkpoint name below ("bnb-4bit") indicates a
# pre-quantized bitsandbytes build of the model.
load_in_4bit = True

# Load the base model together with its tokenizer. Both names are used by
# every later cell (prompt formatting, LoRA setup, training, evaluation).
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-Instruct-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

## 2. Data Loading (Hybrid Strategy)

In [None]:
import json
from datasets import load_dataset, Dataset, concatenate_datasets
import pandas as pd

# --- 1. Load Financial PhraseBank (General Finance) ---
print("üìö Loading Financial PhraseBank...")
try:
    phrasebank = load_dataset("gtfintechlab/financial_phrasebank_sentences_allagree", "5768", split="train")
except Exception as e:
    # Deliberate best-effort: whatever goes wrong with the first dataset id,
    # report it and retry with the alternative id before giving up.
    print(f"‚ö†Ô∏è Error loading PhraseBank: {e}")
    phrasebank = load_dataset("financial_phrasebank", "sentences_allagree", split="train", trust_remote_code=True)

# --- 2. Load Local Egyptian Data ---
print("üá™üá¨ Loading Local Egyptian Data...")
dataset_path = "testing_data.jsonl"
local_data = []

# Keep the try body limited to the file read so that only a genuinely missing
# file triggers the fallback; errors from the split logic are not masked.
try:
    with open(dataset_path, 'r', encoding='utf-8') as f:
        # Blank lines (e.g. a trailing newline at the end of the file) are
        # skipped instead of crashing json.loads.
        local_data = [json.loads(line) for line in f if line.strip()]
except FileNotFoundError:
    # The fallback below creates EMPTY datasets, so the message says so.
    print("‚ö†Ô∏è 'testing_data.jsonl' not found. Continuing with empty local train/test sets.")
else:
    print(f"‚úì Loaded {len(local_data)} local samples")

if local_data:
    local_df = pd.DataFrame(local_data)
    local_dataset = Dataset.from_pandas(local_df)

    # Split Local Data: 50% Train (to mix), 50% Test (to evaluate)
    local_split = local_dataset.train_test_split(test_size=0.5, seed=42)
    local_train = local_split["train"]
    local_test = local_split["test"]
    print(f"  - Local Train: {len(local_train)} (Mixed into training)")
    print(f"  - Local Test:  {len(local_test)} (Reserved for evaluation)")
else:
    # Missing or empty file: fall back to empty datasets that still expose the
    # columns format_local reads, so the remaining cells can run.
    local_train = Dataset.from_dict({"text": [], "sentiment": [], "reasoning": []})
    local_test = Dataset.from_dict({"text": [], "sentiment": [], "reasoning": []})

# --- 3. Format Prompts ---
# Alpaca-style template with three positional placeholders that str.format()
# fills in order: the news text, the sentiment label, and the reasoning text.
# The doubled braces ({{ and }}) are str.format() escapes that render as the
# literal braces of the JSON response object.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Analyze the sentiment of the following financial news. Provide the sentiment (positive/negative/neutral) and a brief reasoning.

### Input:
{}

### Response:
{{"sentiment": "{}", "reasoning": "{}"}}"""

# End-of-sequence token string from the tokenizer; the formatting functions
# append it to every formatted example.
EOS_TOKEN = tokenizer.eos_token

def format_phrasebank(examples):
    """Turn a batch of PhraseBank rows into prompt-formatted training texts.

    Args:
        examples: Batch mapping with parallel "sentence" and "label" lists;
            labels are integers 0/1/2.

    Returns:
        dict with a single "text" list: one filled-in ``alpaca_prompt`` per
        row, each terminated with ``EOS_TOKEN``.

    Raises:
        KeyError: If a label is not 0, 1 or 2.
    """
    id_to_sentiment = {0: "negative", 1: "neutral", 2: "positive"}
    # PhraseBank carries no rationale, so every row gets the same placeholder.
    placeholder_reasoning = "Sentiment inferred from financial context."
    formatted = [
        alpaca_prompt.format(sentence, id_to_sentiment[label_id], placeholder_reasoning) + EOS_TOKEN
        for sentence, label_id in zip(examples["sentence"], examples["label"])
    ]
    return {"text": formatted}

def format_local(examples):
    """Turn a batch of local Egyptian news rows into prompt-formatted texts.

    The sentiment and reasoning values are JSON-escaped before being placed
    inside the template's JSON response object, so quotes, backslashes or
    line breaks in the data no longer produce a malformed JSON target.
    Values without such characters are formatted exactly as before.

    Args:
        examples: Batch mapping with parallel "text", "sentiment" and
            "reasoning" lists.

    Returns:
        dict with "text" (filled-in ``alpaca_prompt`` + ``EOS_TOKEN`` per row)
        and "ground_truth_sentiment" (the unmodified input sentiments).
    """
    def _json_escape(value):
        # json.dumps returns a quoted JSON string; drop the surrounding quotes
        # because the template already supplies them. ensure_ascii=False keeps
        # non-ASCII text (e.g. Arabic) readable instead of \uXXXX escapes.
        return json.dumps(str(value), ensure_ascii=False)[1:-1]

    sentiments = examples["sentiment"]
    texts = [
        alpaca_prompt.format(input_text, _json_escape(sentiment), _json_escape(reasoning)) + EOS_TOKEN
        for input_text, sentiment, reasoning in zip(examples["text"], sentiments, examples["reasoning"])
    ]
    return {"text": texts, "ground_truth_sentiment": sentiments}

# Apply Formatting
print("üîÑ Formatting datasets...")

# --- 4. Create Hybrid Training Set ---
# Training sources keep only the "text" column so their schemas match when
# concatenated; the test split keeps every column for evaluation.
phrasebank_formatted = phrasebank.map(format_phrasebank, batched=True).select_columns(["text"])
local_train_formatted = local_train.map(format_local, batched=True).select_columns(["text"])
local_test_formatted = local_test.map(format_local, batched=True)

# Merge both sources, then shuffle with a fixed seed so they are interleaved.
hybrid_sources = [phrasebank_formatted, local_train_formatted]
train_dataset = concatenate_datasets(hybrid_sources).shuffle(seed=42)

print(f"\n‚úÖ Final Hybrid Training Set: {len(train_dataset)} samples")
print(f"‚úÖ Final Test Set (Egyptian): {len(local_test_formatted)} samples")

## 3. Model Configuration (LoRA)

In [None]:
# Names of the model sub-modules that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the loaded model with LoRA adapters (rank 16, alpha 16, no dropout,
# no bias training) and rebind `model` to the adapted version.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    target_modules=lora_target_modules,
    use_gradient_checkpointing="unsloth",
    use_rslora=False,
    loftq_config=None,
    random_state=3407,
)

## 4. Training (Tuned Parameters)

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Probe the hardware once: train in bf16 when supported, otherwise in fp16.
use_bf16 = is_bfloat16_supported()

# Optimisation settings. Batch size 2 with 4 accumulation steps gives
# 8 examples per optimizer step on each device.
training_args = TrainingArguments(
    output_dir="outputs",
    seed=3407,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    max_steps=120,    # Increased steps (2x) for better adaptation
    warmup_steps=10,  # Increased warmup for hybrid data
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    weight_decay=0.01,
    optim="adamw_8bit",
    bf16=use_bf16,
    fp16=not use_bf16,
    logging_steps=1,
)

# Supervised fine-tuning on the "text" column of the hybrid dataset,
# one example per sequence (no packing).
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=training_args,
)

trainer_stats = trainer.train()

## 5. Evaluation on Egyptian Data

In [None]:
# Switch the model to Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import re
from tqdm import tqdm

# Marker that separates the prompt from the answer in every formatted example.
RESPONSE_MARKER = "### Response:"
# Compiled once, outside the loop: pulls the label out of the generated JSON.
SENTIMENT_PATTERN = re.compile(r'"sentiment":\s*"(positive|negative|neutral)"', re.IGNORECASE)

y_true = []
y_pred = []
unparsed_count = 0  # generations in which no sentiment label could be found

print(f"Running inference on {len(local_test_formatted)} Egyptian samples...")

# Iterate rows directly so each example is fetched from the dataset only once.
for example in tqdm(local_test_formatted):
    # Prepare input: keep everything up to the response marker so the gold
    # answer stored in the formatted text is never shown to the model.
    input_text = example["text"].split(RESPONSE_MARKER)[0] + RESPONSE_MARKER + "\n"
    # Normalise the gold label the same way predictions are normalised below,
    # so "Positive" or " neutral " in the data still match the report labels.
    ground_truth = str(example["ground_truth_sentiment"]).strip().lower()

    inputs = tokenizer([input_text], return_tensors = "pt").to("cuda")

    # do_sample=False forces greedy decoding so repeated evaluation runs give
    # the same predictions; otherwise generate() follows the checkpoint's
    # generation config, which may enable sampling.
    outputs = model.generate(**inputs, max_new_tokens = 128, use_cache = True, do_sample = False)
    response = tokenizer.batch_decode(outputs)[0]

    # The decoded text contains the prompt followed by the generation; the
    # segment after the first marker is the model's answer. An explicit length
    # check replaces the former bare `except:`, which also swallowed
    # KeyboardInterrupt.
    segments = response.split(RESPONSE_MARKER)
    generated_part = segments[1] if len(segments) > 1 else ""

    match = SENTIMENT_PATTERN.search(generated_part)
    if match:
        pred_sentiment = match.group(1).lower()
    else:
        # Same fallback label as before, but counted so that parse failures
        # are visible instead of silently inflating the "neutral" class.
        unparsed_count += 1
        pred_sentiment = "neutral"

    y_true.append(ground_truth)
    y_pred.append(pred_sentiment)

if unparsed_count:
    print(f"{unparsed_count} of {len(y_pred)} generations had no parsable sentiment and were counted as 'neutral'.")

# Metrics
# One shared label order for the report, the matrix rows/columns and the axis
# ticks, so all three stay aligned.
sentiment_labels = ["positive", "neutral", "negative"]

print("\nüá™üá¨ Egyptian Data Performance Report (Hybrid Model):")
print(classification_report(y_true, y_pred, labels=sentiment_labels))

cm = confusion_matrix(y_true, y_pred, labels=sentiment_labels)

# Heatmap of raw counts: rows are actual labels, columns are predictions.
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=sentiment_labels,
    yticklabels=sentiment_labels,
)
plt.title('Confusion Matrix: Egyptian Financial News')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

## 6. Export to GGUF

In [None]:
import gc
# Run garbage collection and release PyTorch's cached CUDA memory before the
# export step.
gc.collect()
torch.cuda.empty_cache()

# Toggle for the export; set to False to skip writing the GGUF file.
EXPORT_GGUF = True

if EXPORT_GGUF:
    # Save the model and tokenizer in GGUF format to "model_gguf" using the
    # q4_k_m quantization method.
    model.save_pretrained_gguf(
        "model_gguf",
        tokenizer,
        quantization_method="q4_k_m",
        maximum_memory_usage=0.6,
    )
    print("‚úÖ GGUF saved!")