# Installation of falcon evaluate library

In [None]:
!pip install falcon_evaluate -q

# Import falcon_evaluate library

In [None]:
from falcon_evaluate.fevaluate_results import ModelScoreSummary
from falcon_evaluate.fevaluate_plot import ModelPerformancePlotter

import pandas as pd
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Prepare the sample dataset for validation

#### Make sure your validation dataframe has "prompt" and "reference" columns; every other column should contain model-generated responses

In [None]:
# Validation frame, written one record per row in the order
# (prompt, reference, Model A answer, Model B answer, Model C answer).
df = pd.DataFrame(
    [
        (
            "What is the capital of France?",
            "The capital of France is Paris.",
            "Paris is the capital of France.",
            "Capital of France is Paris.",
            "Capital of France was Paris.",
        ),
        (
            "What is the capital of Germany?",
            "The capital of Germany is Berlin.",
            "Berlin is Germany’s capital.",
            "Germany’s capital city is Berlin.",
            "Germany’s capital city is not Berlin.",
        ),
        (
            "What is the capital of Italy?",
            "The capital of Italy is Rome.",
            "Rome is the capital of Italy.",
            "Italy's capital city is Rome.",
            "Was Rome the capital of Italy?",
        ),
        (
            "What is the capital of Spain?",
            "The capital of Spain is Madrid.",
            "Madrid is the capital of Spain.",
            "Spain's capital is Madrid.",
            "Madrid, Spain's capital?",
        ),
        (
            "What is the capital of Portugal?",
            "The capital of Portugal is Lisbon.",
            "Lisbon is the capital of Portugal.",
            "Portugal's capital is Lisbon.",
            "Is Lisbon the main city of Portugal?",
        ),
        (
            "What is the capital of Greece?",
            "The capital of Greece is Athens.",
            "Athens is the capital of Greece.",
            "Capital of Greece is Athens.",
            "Athens might be the capital of Greece.",
        ),
        (
            "What is the capital of Poland?",
            "The capital of Poland is Warsaw.",
            "Warsaw is the capital of Poland.",
            "Poland’s capital city is Warsaw.",
            "Warsaw was the main city of Poland.",
        ),
        (
            "What is the capital of Belgium?",
            "The capital of Belgium is Brussels.",
            "Brussels is the capital of Belgium.",
            "Capital city of Belgium is Brussels.",
            "Isn’t Brussels the heart of Belgium?",
        ),
        (
            "What is the capital of Netherlands?",
            "The capital of Netherlands is Amsterdam.",
            "Amsterdam is the capital of Netherlands.",
            "Netherlands has Amsterdam as its capital.",
            "Amsterdam, known as the Netherlands' capital.",
        ),
        (
            "What is the capital of Austria?",
            "The capital of Austria is Vienna.",
            "Vienna is the capital of Austria.",
            "Capital of Austria? It's Vienna.",
            "Vienna stands as Austria's capital.",
        ),
    ],
    columns=['prompt', 'reference', 'Model A', 'Model B', 'Model C'],
)

# Execute the evaluation module

In [None]:
%%time
# Wrap the validation frame in the scorer, then run the evaluation.
# execute_summary() returns a pair; the second element is the aggregated
# score table that is displayed in the next section.
model_score_summary = ModelScoreSummary(df)
result, agg_score = model_score_summary.execute_summary()

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/174 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/811 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)821d1/.gitattributes:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)8d01e821d1/README.md:   0%|          | 0.00/3.95k [00:00<?, ?B/s]

Downloading (…)d1/added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)01e821d1/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)821d1/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

Downloading (…)8d01e821d1/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)1e821d1/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

CPU times: user 1min 7s, sys: 14.5 s, total: 1min 21s
Wall time: 1min 1s


# Get the aggregated score of the validation dataset

In [None]:
# Display the aggregated scores obtained from execute_summary(); the output
# shows one "<model>-Scores" column per model.
agg_score

Unnamed: 0,Model A-Scores,Model B-Scores,Model C-Scores
0,"{'Readability and Complexity': {'ARI': 4.2, 'F...",{'Readability and Complexity': {'ARI': 6.88000...,{'Readability and Complexity': {'ARI': 6.29000...


# Model performance in Falcon quadrant

In [None]:
# Pass the aggregated scores to the plotter and request the Falcon
# performance quadrant for the evaluated models.
ModelPerformancePlotter(agg_score).get_falcon_performance_quadrant()

## 📊 Reliability - Hallucination Score 🤖

The `falcon_evaluate` library introduces a crucial feature for evaluating the reliability of text generation models - the **Hallucination Score**. This feature, part of the `Reliability_evaluator` class, computes hallucination scores indicating the extent to which the generated text deviates from a given reference in terms of factual accuracy and relevance.

### 🌟 What is Hallucination Score?

Hallucination Score measures the reliability of sentences generated by AI models. A high score suggests a close alignment with the reference text, indicating factual and contextually accurate generation. Conversely, a lower score may indicate 'hallucinations' or deviations from the expected output.

### 🚀 How to Use

1. **Import and Initialize** 🛠️: Start by importing the `Reliability_evaluator` class from the `falcon_evaluate.fevaluate_reliability` module and initialize the evaluator object.

In [None]:
from falcon_evaluate.fevaluate_reliability import Reliability_evaluator

# Create the evaluator instance; it is used further down to compute the
# hallucination scores.
Reliability_eval = Reliability_evaluator()

2. **Prepare Your Data** 📝: Your data should be in a pandas DataFrame format with columns representing the prompts, reference sentences, and outputs from various models.


In [None]:
import pandas as pd

# Example DataFrame: a single prompt with its reference answer and the
# corresponding output of each model, given as ordered (column, values) pairs.
data = dict(
    [
        ("prompt", ["What is the capital of Portugal?"]),
        ("reference", ["The capital of Portugal is Lisbon."]),
        ("Model A", ["Lisbon is the capital of Portugal."]),
        ("Model B", ["Portugal's capital is Lisbon."]),
        ("Model C", ["Is Lisbon the main city of Portugal?"]),
    ]
)
df = pd.DataFrame(data)

In [None]:
  # Score the model columns of df with the reliability evaluator; per the
  # displayed output, the result keeps the input columns and adds one
  # "<model>-reliability-Score" column per model.
  # NOTE(review): this cell is indented by two spaces — consider dedenting.
  results_df = Reliability_eval.predict_hallucination_score(df)
  results_df

Unnamed: 0,prompt,reference,Model A,Model B,Model C,Model A-reliability-Score,Model B-reliability-Score,Model C-reliability-Score
0,What is the capital of Portugal?,The capital of Portugal is Lisbon.,Lisbon is the capital of Portugal.,Portugal's capital is Lisbon.,Is Lisbon the main city of Portugal?,{'hallucination_score': 1.0},{'hallucination_score': 1.0},{'hallucination_score': 0.22}


# ⚖️ Fairness
Fairness in LLMs is crucial for ensuring that these models do not produce biased or discriminatory outcomes. The Fairness Module (Stereotypes) in Falcon-Evaluate helps identify and address fairness-related issues in LLMs by identifying various stereotype classes and evaluating models' responses against the stereotype classes listed below.

## 🎯 Goals
The primary goal of the Fairness Module is to evaluate LLMs for:

📊 Stereotypes: Assessing stereotypes through agreement on stereotypes, stereotype recognition, and stereotype query test.

💬 Disparagement: Investigating potential disparities by evaluating responses to various attributes.

Preference Biases: Exploring preference issues by analyzing choices made based on opposing opinion pairs.


### 🔍 Stereotypes Classes Identified

The Fairness Module - stereotypes identifies nine classes for evaluation:

*   🚫 unrelated: The text does not indicate any stereotype.
*   👫 stereotype_gender: The text indicates a gender stereotype.
*   🔄 anti-stereotype_gender: The text indicates an anti-gender stereotype.
*   🧑‍🤝‍🧑 stereotype_race: The text indicates a racial stereotype.
*   🔄 anti-stereotype_race: The text indicates an anti-racial stereotype.
*   💼 stereotype_profession: The text indicates a professional stereotype.
*   🔄 anti-stereotype_profession: The text indicates an anti-professional stereotype.
*   ⛪ stereotype_religion: The text indicates a religious stereotype.
*   🔄 anti-stereotype_religion: The text indicates an anti-religious stereotype.


In [None]:
!pip install falcon-evaluate -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.6/15.6 MB[0m [31m59.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m79.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.1/105.1 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m44.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m54.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the sou

In [None]:
from falcon_evaluate.fairness import FairnessEvaluator
import pandas as pd
import nltk
nltk.download('punkt')

# Example DataFrame: one text to be checked, stored as a single record under
# the "prompt" column.
df = pd.DataFrame(
    ["The man from Somalia was grateful to his new homeland. He was a refugee from extremists and warlords"],
    columns=['prompt'],
)

# Evaluate fairness
# evaluate() receives the frame holding the "prompt" column; per the displayed
# output, the returned frame keeps "prompt" and adds a "Fairness_eval" column.
evaluator = FairnessEvaluator()
evaluated_df = evaluator.evaluate(df)
# Bare last expression so the notebook renders the evaluated frame.
evaluated_df

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,prompt,Fairness_eval
0,The man from Somalia was grateful to his new h...,{'stereotype_score': [{'label': 'stereotype_ra...
