In [None]:
from pipeline import Pipeline, Task
from fetch_data import fetch_arxiv_papers
from clean_data import get_filtered_sections_papers
from condense_data import condensed_papers
from qa_pairs_generation import generate_all_qa_pairs

# Dataset-creation pipeline. Every (callable, kwargs) pair below is wrapped in
# a Task, in the listed order, with the same trailing positional flag (False).
pipeline = Pipeline(
    "dataset_creation",
    [
        Task(step_fn, step_kwargs, False)
        for step_fn, step_kwargs in (
            (fetch_arxiv_papers, {"query": "deep learning", "max_results": 500}),
            (get_filtered_sections_papers, {}),
            (condensed_papers, {"amount": 100}),
            (generate_all_qa_pairs, {"max_retry": 5, "amount": 100}),
        )
    ],
)

pipeline.run()

In [None]:
from model_train import gpu_train
from pipeline import Pipeline

# Fine-tune on the data stored for step index 3 of the "dataset_creation"
# pipeline (presumably the generate_all_qa_pairs output, if steps are
# 0-indexed — TODO confirm).
# NOTE(review): 1024 looks like a maximum sequence length — confirm against
# gpu_train. The model id and "test" are repeated in
# generate_finetuned_model_answers and must stay in sync with this call.
gpu_train(Pipeline("dataset_creation").get_data_from_step(3), "unsloth/Llama-3.2-3B-Instruct-unsloth-bnb-4bit", 1024, "test")

In [None]:
import random

def pick_random_qa_pairs(qa_pairs, sample_amount, seed=None):
    """Return ``sample_amount`` distinct entries drawn at random from ``qa_pairs``.

    Args:
        qa_pairs: Sequence (e.g. a list) of QA entries to sample from. It is
            not modified.
        sample_amount: Number of entries to draw, without replacement.
        seed: Optional seed. When given, a dedicated ``random.Random(seed)``
            is used, so the same sample is returned on every run and the
            module-level generator state is left untouched. When ``None``
            (the default) the module-level generator is used, exactly as
            before this parameter existed.

    Returns:
        A new list containing ``sample_amount`` entries of ``qa_pairs``.

    Raises:
        ValueError: If ``sample_amount`` is negative or larger than
            ``len(qa_pairs)`` (raised by ``random.sample``).
    """
    # The ``random`` module itself exposes ``sample``, so it can stand in for
    # a ``random.Random`` instance when no seed is requested.
    rng = random if seed is None else random.Random(seed)
    return rng.sample(qa_pairs, sample_amount)

In [None]:
from lm_studio_caller import call_llm
from tqdm import tqdm

def generate_base_model_answers(qa_pairs, repetition):
    """Collect ``repetition`` base-model replies for every QA entry.

    For each entry, the ``"llm_output"`` key is reset to a fresh list that is
    then filled with the ``call_llm`` replies to the entry's ``"question"``.
    Entries are updated in place and the same ``qa_pairs`` object is returned.
    """
    for entry in tqdm(qa_pairs):
        # Bind the list to the entry before querying so that replies are
        # visible on the entry as soon as they are appended.
        replies = entry["llm_output"] = []
        for _attempt in range(repetition):
            reply = call_llm(
                sys_prompt="Answer concisely.",
                usr_prompt=entry["question"],
                temperature=0.7,
            )
            replies.append(reply)

    return qa_pairs

In [None]:
from use_fine_tunned_model import load_optimized_model, question_model

def generate_finetuned_model_answers(qa_pairs, repetition):
    """Collect ``repetition`` fine-tuned-model replies for every QA entry.

    The model and tokenizer are loaded once via ``load_optimized_model``. For
    each entry, the ``"llm_finetuned_output"`` key is reset to a fresh list
    that is then filled with the ``question_model`` replies to the entry's
    ``"question"``. Entries are updated in place and the same ``qa_pairs``
    object is returned.
    """
    base_model_id = "unsloth/Llama-3.2-3B-Instruct-unsloth-bnb-4bit"
    model, tokenizer = load_optimized_model(base_model_id, "test")

    for entry in tqdm(qa_pairs):
        # Bind the list to the entry before querying so that replies are
        # visible on the entry as soon as they are appended.
        replies = entry["llm_finetuned_output"] = []
        for _attempt in range(repetition):
            reply = question_model(model, tokenizer, entry["question"], "Answer concisely.")
            replies.append(reply)

    return qa_pairs

In [None]:
from pipeline import Pipeline, Task

# Evaluation pipeline. Every (callable, kwargs) pair below is wrapped in a
# Task, in the listed order, with the same trailing positional flag (False).
# The initial data is the flattened list of all values stored for step index 3
# of the "dataset_creation" pipeline.
pipeline = Pipeline(
    "evaluation",
    [
        Task(step_fn, step_kwargs, False)
        for step_fn, step_kwargs in (
            (pick_random_qa_pairs, {"sample_amount": 5}),
            (generate_base_model_answers, {"repetition": 3}),
            (generate_finetuned_model_answers, {"repetition": 3}),
        )
    ],
    initial_data=[
        pair
        for paper_pairs in Pipeline("dataset_creation").get_data_from_step(3).values()
        for pair in paper_pairs
    ],
)

pipeline.run()

In [None]:
from sentence_transformers import SentenceTransformer, util
import numpy as np
import pandas as pd

# Sentence-embedding model used by compute_sbert_similarities; created once at
# module level so it is not re-instantiated on every call.
sbert_model = SentenceTransformer("all-MiniLM-L6-v2")

def compute_sbert_similarities(dataset):
    """Score model outputs with SBERT cosine similarity, one row per question.

    Every text is embedded with the module-level ``sbert_model``.

    Args:
        dataset: Iterable of dicts with the keys ``"question"``, ``"answer"``
            (reference answer), ``"llm_output"`` (base-model answers) and
            ``"llm_finetuned_output"`` (fine-tuned-model answers). The two
            output keys are iterated, one text per element.

    Returns:
        pandas.DataFrame sorted by ascending ``avg_score`` with the columns:

        * ``question`` - the entry's question.
        * ``avg_score`` - mean cosine similarity between the reference answer
          and each fine-tuned output.
        * ``avg_diff_score`` - mean cosine similarity over every
          (base output, fine-tuned output) pair.

        An empty ``dataset`` yields an empty frame with these columns. An
        entry without outputs gets NaN for the affected score.

    Raises:
        KeyError: If an entry lacks one of the required keys.
    """
    columns = ["question", "avg_score", "avg_diff_score"]

    def embed(text):
        return sbert_model.encode(text, convert_to_tensor=True)

    def mean_or_nan(values):
        # np.mean([]) emits a RuntimeWarning; make the NaN explicit instead.
        return float(np.mean(values)) if values else float("nan")

    rows = []
    for qa in dataset:
        answer_emb = embed(qa["answer"])
        base_embs = [embed(text) for text in qa["llm_output"]]
        # The fine-tuned embeddings must come from "llm_finetuned_output";
        # reading "llm_output" here would score the base model instead and
        # compare the base outputs with themselves.
        finetuned_embs = [embed(text) for text in qa["llm_finetuned_output"]]

        scores = [
            util.pytorch_cos_sim(answer_emb, finetuned_emb).item()
            for finetuned_emb in finetuned_embs
        ]
        diff_scores = [
            util.pytorch_cos_sim(base_emb, finetuned_emb).item()
            for base_emb in base_embs
            for finetuned_emb in finetuned_embs
        ]

        rows.append({
            "question": qa["question"],
            "avg_score": mean_or_nan(scores),
            "avg_diff_score": mean_or_nan(diff_scores),
        })

    if not rows:
        # Sorting a column-less frame by "avg_score" would raise KeyError.
        return pd.DataFrame(columns=columns)

    return (
        pd.DataFrame(rows, columns=columns)
        .sort_values("avg_score", ascending=True)
        .reset_index(drop=True)
    )

In [None]:
import plotly.express as px
from pipeline import Pipeline

# Load the data stored for step index 2 of the "evaluation" pipeline
# (presumably the generate_finetuned_model_answers output, if steps are
# 0-indexed — TODO confirm), then score every entry.
qa_pairs = Pipeline("evaluation").get_data_from_step(2)
similarities_scores = compute_sbert_similarities(qa_pairs)

def plot_similarity(df):
    """Show a grouped horizontal bar chart of the per-question similarity scores.

    ``df`` must provide the columns ``question``, ``avg_score`` and
    ``avg_diff_score``. For an empty frame a warning is printed and nothing
    is drawn. The function returns ``None`` in both cases.
    """
    if df.empty:
        print("Warning: No similarity scores computed!")
        return

    score_columns = ["avg_score", "avg_diff_score"]

    # Long format: one row per (question, score column), so that the score
    # column name can drive the bar colour.
    long_df = df.melt(
        id_vars=["question"],
        value_vars=score_columns,
        var_name="Model Type",
        value_name="Similarity Score",
    )

    fig = px.bar(
        long_df,
        x="Similarity Score",
        y="question",
        orientation="h",
        color="Model Type",
        color_discrete_map={"avg_score": "green", "avg_diff_score": "blue"},
        title="Fine-Tuned vs Non-Fine-Tuned Model Sbert Similarities",
        labels={"question": "Question", "Similarity Score": "Similarity Score"},
        text_auto=".2f",
        hover_data={"question": False},
    )

    # Each trace receives both score columns as customdata, so the hover box
    # can display the two values side by side.
    hover_text = (
        "<b>→ Average Similarity Score:</b> %{customdata[0]:.2f}<br>"
        "<b>→ Average Similarity Score Diff:</b> %{customdata[1]:.2f}<br>"
    )
    fig.update_traces(
        textposition="inside",
        customdata=df[score_columns].values,
        hovertemplate=hover_text,
    )

    y_axis = {
        "title": "Question",
        "tickmode": "array",
        "tickfont": {"size": 12},
        "automargin": True,
    }
    fig.update_layout(yaxis=y_axis, barmode="group", width=2250, height=900)

    fig.show()

# Render the chart for the scores computed above in this cell.
plot_similarity(similarities_scores)