## BertScore

In [6]:
import os
from bert_score import score
import pandas as pd
import torch

def chunking(text, maxtok=1022):
    """Split ``text`` into consecutive pieces of at most ``maxtok`` words.

    Words are whatever ``str.split()`` yields (runs of non-whitespace), and
    each piece is re-joined with single spaces. An empty or whitespace-only
    ``text`` produces an empty list.
    """
    words = text.split()
    pieces = []
    # Walk the word list in strides of `maxtok`; the last piece may be shorter.
    for start in range(0, len(words), maxtok):
        window = words[start:start + maxtok]
        pieces.append(' '.join(window))
    return pieces

def loaddir(responsedir, referencedir):
    """Average BERTScore of the summaries in ``responsedir`` against ``referencedir``.

    Every file in ``responsedir`` that has a same-named file in
    ``referencedir`` is read, both texts are split with ``chunking`` and the
    chunks are paired by position (chunk k of the summary with chunk k of the
    reference). The per-file score is the mean over its chunk pairs and the
    returned values are the means over all scored files.

    Parameters
    ----------
    responsedir : str
        Directory containing the generated summaries.
    referencedir : str
        Directory containing the reference texts, matched by file name.

    Returns
    -------
    tuple
        ``(avgprecision, avgrecall, avgf1)``; ``(0, 0, 0)`` when no file
        could be scored.
    """
    referencefile = set(os.listdir(referencedir))

    # per-file bertscore metrics
    precisionval = []
    recallval = []
    f1scoreval = []

    # sorted() makes the processing order reproducible between runs
    for filename in sorted(os.listdir(responsedir)):
        if filename not in referencefile:
            print(f"File missing: {filename}")
            continue

        responsefilepath = os.path.join(responsedir, filename)
        referencefilepath = os.path.join(referencedir, filename)

        # reading both the files, summary and reference
        with open(responsefilepath, 'r', encoding='utf-8') as file1:
            responsetext = file1.read().strip()
        with open(referencefilepath, 'r', encoding='utf-8') as file2:
            referencetext = file2.read().strip()

        # An empty text yields no chunks, so there is nothing to pair; without
        # this guard the per-file average below divides by zero.
        if not responsetext or not referencetext:
            print(f"Skipping {filename}: empty summary or reference text")
            continue

        # NOTE(review): zip() pairs chunks positionally and drops the surplus
        # chunks of the longer text, so only the leading part of a long
        # reference is compared - confirm this is intended.
        pairs = list(zip(chunking(responsetext), chunking(referencetext)))
        reschunks = [pair[0] for pair in pairs]
        refchunks = [pair[1] for pair in pairs]

        # One batched call per file instead of one call per chunk pair; the
        # result holds one value per pair, so .mean() is the per-file average.
        P, R, F1 = score(reschunks, refchunks, model_type="facebook/bart-large-mnli", lang="en", verbose=False)
        precisionval.append(P.mean().item())
        recallval.append(R.mean().item())
        f1scoreval.append(F1.mean().item())

    # Calculate average BERTScore over all scored files
    if not precisionval:
        return 0, 0, 0

    avgprecision = sum(precisionval) / len(precisionval)
    avgrecall = sum(recallval) / len(recallval)
    avgf1 = sum(f1scoreval) / len(f1scoreval)

    return avgprecision, avgrecall, avgf1


# Folders holding the final summaries generated by each LLM (one folder per model)
responsedirs = [
    '../3.LLM Model/LLM Summaries/Selected_LLM_Models_PYPDF2/deepseek-llm/deepseek-llmfinal/',
    '../3.LLM Model/LLM Summaries/Selected_LLM_Models_PYPDF2/gemma/gemmafinal/',
    '../3.LLM Model/LLM Summaries/Selected_LLM_Models_PYPDF2/gemma2/gemma2final/',
    '../3.LLM Model/LLM Summaries/Selected_LLM_Models_PYPDF2/llama3/llama3final/',
    '../3.LLM Model/LLM Summaries/Selected_LLM_Models_PYPDF2/llama3.1/llama3.1final/',
    '../3.LLM Model/LLM Summaries/Selected_LLM_Models_PYPDF2/mistral/mistralfinal/',
    '../3.LLM Model/LLM Summaries/Selected_LLM_Models_PYPDF2/qwen/qwenfinal/',
    '../3.LLM Model/LLM Summaries/Selected_LLM_Models_PYPDF2/solar/solarfinal/',
    '../3.LLM Model/LLM Summaries/Selected_LLM_Models_PYPDF2/xwinlm/xwinlmfinal/'
]

# Directory of reference texts every summary folder is compared against
referencedir = '../2.PDFExtraction/PYPDF2/PYPDF2textclean/'

results = []

# Score each model's summary folder against the shared reference folder
for reponse in responsedirs:
    avgprecision, avgrecall, avgf1 = loaddir(reponse, referencedir)

    # Unweighted mean of precision, recall and F1 as a single headline number
    avgscore = (avgprecision + avgrecall + avgf1) / 3

    # One result row per summary folder
    results.append(dict(zip(
        ("Directory", "Precision", "Recall", "F1", "Average scores"),
        (reponse, avgprecision, avgrecall, avgf1, avgscore),
    )))

# Display the table without truncating rows, columns or cell contents
for option, value in (
    ('display.max_columns', None),
    ('display.max_rows', None),
    ('display.width', 200),
    ('display.max_colwidth', None),
):
    pd.set_option(option, value)

# Collect the rows into a dataframe
df = pd.DataFrame(results)

# Persist the table, creating the output folder on first use
csvfile = 'BertScore/average_Bert_scores_PYPDF2CHUNK.csv'
os.makedirs(os.path.dirname(csvfile), exist_ok=True)
df.to_csv(csvfile, index=False)

df


Unnamed: 0,Directory,Precision,Recall,F1,Average scores
0,../3.LLM Model/LLM Summaries/Selected_LLM_Models_PYPDF2/deepseek-llm/deepseek-llmfinal/,0.642502,0.587365,0.612897,0.614254
1,../3.LLM Model/LLM Summaries/Selected_LLM_Models_PYPDF2/gemma/gemmafinal/,0.602037,0.527311,0.560504,0.563284
2,../3.LLM Model/LLM Summaries/Selected_LLM_Models_PYPDF2/gemma2/gemma2final/,0.600156,0.57341,0.586125,0.586564
3,../3.LLM Model/LLM Summaries/Selected_LLM_Models_PYPDF2/llama3/llama3final/,0.62357,0.575935,0.598359,0.599288
4,../3.LLM Model/LLM Summaries/Selected_LLM_Models_PYPDF2/llama3.1/llama3.1final/,0.606003,0.559828,0.581578,0.58247
5,../3.LLM Model/LLM Summaries/Selected_LLM_Models_PYPDF2/mistral/mistralfinal/,0.622539,0.567567,0.589551,0.593219
6,../3.LLM Model/LLM Summaries/Selected_LLM_Models_PYPDF2/qwen/qwenfinal/,0.592934,0.522143,0.553912,0.55633
7,../3.LLM Model/LLM Summaries/Selected_LLM_Models_PYPDF2/solar/solarfinal/,0.618149,0.566479,0.590397,0.591675
8,../3.LLM Model/LLM Summaries/Selected_LLM_Models_PYPDF2/xwinlm/xwinlmfinal/,0.632062,0.589665,0.608228,0.609985


## DeepSeekLLM Evaluation

In [1]:
import pandas as pd
import re
import os
from openai import OpenAI

# **********************************************************************
# Prompt template filled in once per metric for every (source chunk, summary chunk) pair.
# Placeholders: {criteria} = one of the *_CRITERIA rubrics below, {document} = source chunk,
# {summary} = summary chunk, {metric} = metric name shown on the answer line.
PROMPTTEMP = """
Summary and source text are given below. Please evaluate the summary based on the source text provided. And follow the criteria for the marking purposes.

Evaluation Criteria:
{criteria}

Source Text Chunk:
{document}

Summary Chunk:
{summary}

Evaluation Form (scores ONLY):
{metric} score from 1 to 5
"""

# Rating rubrics, one per metric; each is substituted into {criteria} of PROMPTTEMP.
# Rubric: how well the summary captures the source text.
RELEVANCY_CRITERIA = """
Relevance(1-5): Please rate the summary from the scale (1-5) where 1 is lowest and 5 is the highest score.
The source text and the summary passed must be relevant.
Assess how well the summary has captured the text from source text.
Please rate from 1 to 5:
"""
# Rubric: structure and clarity of the summary.
COHERENCE_CRITERIA = """
Coherence(1-5): Please rate the summary from the scale (1-5) where 1 is lowest and 5 is the highest score.
How well does the source text and summary fits.
Make sure that the summary is well structured and clear.
Please rate from 1 to 5:
"""
# Rubric: factual agreement between summary and source.
ACCURACY_CRITERIA = """
Accuracy (1-5): Please rate the summary from the scale (1-5) where 1 is lowest and 5 is the highest score.
How accurate is the summary with the source text file passed.
Also, find if the summary is slightly out of context. 
Please rate from 1 to 5:
"""
# Rubric: spelling, grammar and sentence quality of the summary.
FLUENCY_CRITERIA = """
Fluency(1-5): Please rate the summary from the scale (1-5) where 1 is lowest and 5 is the highest score.
Check if the summary is well written in terms of spellings, grammars and ensure that the sentences are sensible.
Please rate the fluency from the scale 1 to 5, :
"""

# OpenAI-compatible client pointed at a server on localhost:11434.
# NOTE(review): 'ollama' is presumably a placeholder key for a local server - confirm.
client = OpenAI(
    base_url='http://localhost:11434/v1/',
    api_key='ollama',  
)

def findscore(response):
    """Extract the 1-5 rating from a judge model's reply.

    The standalone integers in ``response`` are scanned in order and the
    first one inside the valid rating range 1..5 is returned. Scanning past
    out-of-range numbers keeps a leading count or year (e.g. "12 sentences")
    from masking the actual rating.

    Parameters
    ----------
    response : str or None
        Raw text returned by the model.

    Returns
    -------
    int
        The rating (1-5), or 0 when ``response`` is empty/``None`` or holds
        no integer in that range.
    """
    if not response:
        return 0

    for match in re.finditer(r'\b\d+\b', response):
        value = int(match.group(0))
        if 1 <= value <= 5:
            return value

    return 0
    
def chunktext(text, label, max_tokens=4000, overlap=100):
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    Each chunk holds at most ``max_tokens`` words and starts with the last
    ``overlap`` words of the previous chunk. A new chunk is only started when
    at least one word lies beyond that carried-over overlap.

    Parameters
    ----------
    text : str
        Text to split.
    label : str
        Unused; kept so existing callers that pass a label keep working.
    max_tokens : int, optional
        Maximum number of words per chunk (default 4000).
    overlap : int, optional
        Number of words shared between consecutive chunks (default 100).

    Returns
    -------
    list of str
        The chunks, each re-joined with single spaces; empty for empty text.

    Raises
    ------
    ValueError
        If ``max_tokens`` is not positive or ``overlap`` is not in
        ``[0, max_tokens)``.
    """
    if max_tokens <= 0:
        raise ValueError("max_tokens must be positive")
    if not 0 <= overlap < max_tokens:
        raise ValueError("overlap must satisfy 0 <= overlap < max_tokens")

    words = text.split()
    if not words:
        return []

    # Consecutive chunks start `step` words apart so they share `overlap` words.
    step = max_tokens - overlap
    # Stop once only overlap words would remain; always emit the first chunk.
    stop = max(len(words) - overlap, 1)
    return [" ".join(words[start:start + max_tokens]) for start in range(0, stop, step)]

def modeleval(summary, source_text, model='llama3'):
    """Rate a summary against its source text with the judge model.

    Both texts are split with ``chunktext`` and every combination of source
    chunk and summary chunk is rated on four metrics, in this order:
    Relevance, Coherence, Accuracy, Fluency. Each rating is one chat
    completion whose reply is reduced to a number by ``findscore``.

    Parameters
    ----------
    summary : str
        Summary text to evaluate.
    source_text : str
        Source text the summary was produced from.
    model : str, optional
        Model name passed to the chat completion call (default ``'llama3'``).

    Returns
    -------
    tuple
        ``(avg_relevance, avg_coherence, avg_accuracy, avg_fluency,
        relevance_text, coherence_text, accuracy_text, fluency_text)``.
        The averages run over all chunk combinations (0 when there are none);
        the texts are the raw replies for the last combination, or ``""``
        when nothing was evaluated.
    """
    sourcechunks = chunktext(source_text, "Source")  # source text chunks
    summarychunks = chunktext(summary, "Summary")    # summary text chunks

    # (metric name used in the prompt and log line, rubric text), in call order
    metrics = (
        ("Relevance", RELEVANCY_CRITERIA),
        ("Coherence", COHERENCE_CRITERIA),
        ("Accuracy", ACCURACY_CRITERIA),
        ("Fluency", FLUENCY_CRITERIA),
    )
    scores = {name: [] for name, _ in metrics}
    # Pre-filled so the return below is defined even when a text has no chunks.
    lasttext = {name: "" for name, _ in metrics}

    # for each combination of source and summary chunks
    for i, srchunks in enumerate(sourcechunks):
        for j, summchunk in enumerate(summarychunks):
            for name, criteria in metrics:
                prompt = PROMPTTEMP.format(
                    criteria=criteria,
                    document=srchunks,
                    summary=summchunk,
                    metric=name
                )
                response = client.chat.completions.create(
                    model=model,
                    messages=[{'role': 'user', 'content': prompt}],
                    max_tokens=150,
                )
                # content may be None; normalise so parsing and printing stay safe
                text = response.choices[0].message.content or ""
                value = findscore(text)  # parsed once, reused for list and log
                scores[name].append(value)
                lasttext[name] = text
                print(f"Chunk ({i+1}, {j+1}) {name} Score: {value}")

    # calculate average scores across all chunk combinations
    averages = []
    for name, _ in metrics:
        values = scores[name]
        averages.append(sum(values) / len(values) if values else 0)

    return (*averages, *(lasttext[name] for name, _ in metrics))

def evalsummary(summarydir, sourcedir, csvname='GemmaResults.csv'):
    """Evaluate every summary in ``summarydir`` and save the averaged scores.

    Each file in ``summarydir`` with a same-named file in ``sourcedir`` is
    passed to ``modeleval`` three times; the three rounds are averaged per
    metric. The per-file rows plus a final ``'Average'`` row are written to
    ``ModelAverageScores/Results/<csvname>``.

    Parameters
    ----------
    summarydir : str
        Directory containing the summaries.
    sourcedir : str
        Directory containing the source texts, matched by file name.
    csvname : str, optional
        Name of the CSV file to write. The default keeps the previously
        hard-coded name; pass a model-specific name so that runs for
        different models do not overwrite each other.

    Returns
    -------
    None
    """
    scorecolumns = ['relevance_score', 'coherence_score', 'accuracy_score', 'fluency_score']
    rounds = 3  # each file is evaluated this many times and averaged
    data = []

    sourcefile = set(os.listdir(sourcedir))

    # sorted() gives a reproducible row order in the CSV
    for filename in sorted(os.listdir(summarydir)):
        if filename not in sourcefile:
            print(f"Missing file {filename}")
            continue

        summaryfilepath = os.path.join(summarydir, filename)
        sourcefilepath = os.path.join(sourcedir, filename)

        with open(summaryfilepath, 'r', encoding='utf-8') as file:
            summary = file.read()

        with open(sourcefilepath, 'r', encoding='utf-8') as file:
            source = file.read()

        roundscores = {column: [] for column in scorecolumns}

        for i in range(1, rounds + 1):
            print(f"\nRunning evaluation round {i} for file: {filename}")

            (relevanceScore, coherenceScore, accuracyScore, fluencyScore,
             relevancyText, coherenceText, accuracyText, fluencyText) = modeleval(summary, source)

            # append scores to respective lists
            roundscores['relevance_score'].append(relevanceScore)
            roundscores['coherence_score'].append(coherenceScore)
            roundscores['accuracy_score'].append(accuracyScore)
            roundscores['fluency_score'].append(fluencyScore)

            # evaluation results statements and scores
            print(f"Evaluation for {filename} (Round {i}):")
            print(f"Relevance: {relevancyText}")
            print(f"Coherence: {coherenceText}")
            print(f"Accuracy: {accuracyText}")
            print(f"Fluency: {fluencyText}")

            print(f"Relevance Score (Round {i}): {relevanceScore}")
            print(f"Coherence Score (Round {i}): {coherenceScore}")
            print(f"Accuracy Score (Round {i}): {accuracyScore}")
            print(f"Fluency Score (Round {i}): {fluencyScore}")
            print("************************************************")

        # average of the rounds for this file
        row = {'filename': filename}
        for column in scorecolumns:
            row[column] = sum(roundscores[column]) / len(roundscores[column])
        data.append(row)

    # Explicit columns keep the frame well-formed even when nothing matched.
    df = pd.DataFrame(data, columns=['filename'] + scorecolumns)

    # Append the overall average row; skipped when there are no rows to average.
    if not df.empty:
        avgdict = {'filename': 'Average'}
        avgdict.update(df[scorecolumns].mean().to_dict())
        df = pd.concat([df, pd.DataFrame([avgdict])], ignore_index=True)

    # saving the results to a CSV file
    savecsvpath = 'ModelAverageScores/Results/'
    os.makedirs(savecsvpath, exist_ok=True)
    csvpath = os.path.join(savecsvpath, csvname)
    df.to_csv(csvpath, index=False)

    print(f"\nEvaluation results saved to {csvpath}")


# directories where summaries and source files are stored
summarydir = '../3.LLM Model/LLM Summaries/Selected_LLM_Models_PYPDF2/deepseek-llm/deepseek-llmfinal/'
sourcedir = '../2.PDFExtraction/PYPDF2/PYPDF2textclean/'

# Runs three evaluation rounds per matching file and writes the averaged scores to CSV.
# NOTE(review): by default evalsummary saves to ModelAverageScores/Results/GemmaResults.csv
# although these are the deepseek-llm summaries - verify the file name is intended.
evalsummary(summarydir, sourcedir)

print("All summaries have been evaluated successfully.")

# 174 minutes to execute



Running evaluation round 1 for file: INQ000385719.txt
Chunk (1, 1) Relevance Score: 4
Chunk (1, 1) Coherence Score: 4
Chunk (1, 1) Accuracy Score: 4
Chunk (1, 1) Fluency Score: 4
Evaluation for INQ000385719.txt (Round 1):
Relevance: Here is my evaluation:

**Relevance Score: 4**

The summary accurately captures three key points from the source text: the strategy for addressing the pandemic, the current situation and projections, and global experience. The summary also mentions the need for a decision on local lockdowns versus national interventions. While some details are missing, such as specific examples of successful countries' strategies, the overall framework is well-represented.

Note that I deducted 1 point because the summary does not explicitly mention the "current picture" or the specific statistics mentioned in the source text (e.g., case incidence rising to 200/100,000 by end of October). Additionally, the summary does not capture the tone and context of the original email

## Llama3 Evaluation

In [1]:
import pandas as pd
import re
import os
from openai import OpenAI

# **********************************************************************
# Prompt template filled in once per metric for every (source chunk, summary chunk) pair.
# Placeholders: {criteria} = one of the *_CRITERIA rubrics below, {document} = source chunk,
# {summary} = summary chunk, {metric} = metric name shown on the answer line.
PROMPTTEMP = """
Summary and source text are given below. Please evaluate the summary based on the source text provided. And follow the criteria for the marking purposes.

Evaluation Criteria:
{criteria}

Source Text Chunk:
{document}

Summary Chunk:
{summary}

Evaluation Form (scores ONLY):
{metric} score from 1 to 5
"""

# Rating rubrics, one per metric; each is substituted into {criteria} of PROMPTTEMP.
# Rubric: how well the summary captures the source text.
RELEVANCY_CRITERIA = """
Relevance(1-5): Please rate the summary from the scale (1-5) where 1 is lowest and 5 is the highest score.
The source text and the summary passed must be relevant.
Assess how well the summary has captured the text from source text.
Please rate from 1 to 5:
"""
# Rubric: structure and clarity of the summary.
COHERENCE_CRITERIA = """
Coherence(1-5): Please rate the summary from the scale (1-5) where 1 is lowest and 5 is the highest score.
How well does the source text and summary fits.
Make sure that the summary is well structured and clear.
Please rate from 1 to 5:
"""
# Rubric: factual agreement between summary and source.
ACCURACY_CRITERIA = """
Accuracy (1-5): Please rate the summary from the scale (1-5) where 1 is lowest and 5 is the highest score.
How accurate is the summary with the source text file passed.
Also, find if the summary is slightly out of context. 
Please rate from 1 to 5:
"""
# Rubric: spelling, grammar and sentence quality of the summary.
FLUENCY_CRITERIA = """
Fluency(1-5): Please rate the summary from the scale (1-5) where 1 is lowest and 5 is the highest score.
Check if the summary is well written in terms of spellings, grammars and ensure that the sentences are sensible.
Please rate the fluency from the scale 1 to 5, :
"""


# OpenAI-compatible client pointed at a server on localhost:11434.
# NOTE(review): 'ollama' is presumably a placeholder key for a local server - confirm.
client = OpenAI(
    base_url='http://localhost:11434/v1/',
    api_key='ollama',  
)

def findscore(response):
    """Extract the 1-5 rating from a judge model's reply.

    The standalone integers in ``response`` are scanned in order and the
    first one inside the valid rating range 1..5 is returned. Scanning past
    out-of-range numbers keeps a leading count or year (e.g. "12 sentences")
    from masking the actual rating.

    Parameters
    ----------
    response : str or None
        Raw text returned by the model.

    Returns
    -------
    int
        The rating (1-5), or 0 when ``response`` is empty/``None`` or holds
        no integer in that range.
    """
    if not response:
        return 0

    for match in re.finditer(r'\b\d+\b', response):
        value = int(match.group(0))
        if 1 <= value <= 5:
            return value

    return 0
    
def chunktext(text, label, max_tokens=4000, overlap=100):
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    Each chunk holds at most ``max_tokens`` words and starts with the last
    ``overlap`` words of the previous chunk. A new chunk is only started when
    at least one word lies beyond that carried-over overlap.

    Parameters
    ----------
    text : str
        Text to split.
    label : str
        Unused; kept so existing callers that pass a label keep working.
    max_tokens : int, optional
        Maximum number of words per chunk (default 4000).
    overlap : int, optional
        Number of words shared between consecutive chunks (default 100).

    Returns
    -------
    list of str
        The chunks, each re-joined with single spaces; empty for empty text.

    Raises
    ------
    ValueError
        If ``max_tokens`` is not positive or ``overlap`` is not in
        ``[0, max_tokens)``.
    """
    if max_tokens <= 0:
        raise ValueError("max_tokens must be positive")
    if not 0 <= overlap < max_tokens:
        raise ValueError("overlap must satisfy 0 <= overlap < max_tokens")

    words = text.split()
    if not words:
        return []

    # Consecutive chunks start `step` words apart so they share `overlap` words.
    step = max_tokens - overlap
    # Stop once only overlap words would remain; always emit the first chunk.
    stop = max(len(words) - overlap, 1)
    return [" ".join(words[start:start + max_tokens]) for start in range(0, stop, step)]

def modeleval(summary, source_text, model='llama3'):
    """Rate a summary against its source text with the judge model.

    Both texts are split with ``chunktext`` and every combination of source
    chunk and summary chunk is rated on four metrics, in this order:
    Relevance, Coherence, Accuracy, Fluency. Each rating is one chat
    completion whose reply is reduced to a number by ``findscore``.

    Parameters
    ----------
    summary : str
        Summary text to evaluate.
    source_text : str
        Source text the summary was produced from.
    model : str, optional
        Model name passed to the chat completion call (default ``'llama3'``).

    Returns
    -------
    tuple
        ``(avg_relevance, avg_coherence, avg_accuracy, avg_fluency,
        relevance_text, coherence_text, accuracy_text, fluency_text)``.
        The averages run over all chunk combinations (0 when there are none);
        the texts are the raw replies for the last combination, or ``""``
        when nothing was evaluated.
    """
    sourcechunks = chunktext(source_text, "Source")  # source text chunks
    summarychunks = chunktext(summary, "Summary")    # summary text chunks

    # (metric name used in the prompt and log line, rubric text), in call order
    metrics = (
        ("Relevance", RELEVANCY_CRITERIA),
        ("Coherence", COHERENCE_CRITERIA),
        ("Accuracy", ACCURACY_CRITERIA),
        ("Fluency", FLUENCY_CRITERIA),
    )
    scores = {name: [] for name, _ in metrics}
    # Pre-filled so the return below is defined even when a text has no chunks.
    lasttext = {name: "" for name, _ in metrics}

    # for each combination of source and summary chunks
    for i, srchunks in enumerate(sourcechunks):
        for j, summchunk in enumerate(summarychunks):
            for name, criteria in metrics:
                prompt = PROMPTTEMP.format(
                    criteria=criteria,
                    document=srchunks,
                    summary=summchunk,
                    metric=name
                )
                response = client.chat.completions.create(
                    model=model,
                    messages=[{'role': 'user', 'content': prompt}],
                    max_tokens=150,
                )
                # content may be None; normalise so parsing and printing stay safe
                text = response.choices[0].message.content or ""
                value = findscore(text)  # parsed once, reused for list and log
                scores[name].append(value)
                lasttext[name] = text
                print(f"Chunk ({i+1}, {j+1}) {name} Score: {value}")

    # calculate average scores across all chunk combinations
    averages = []
    for name, _ in metrics:
        values = scores[name]
        averages.append(sum(values) / len(values) if values else 0)

    return (*averages, *(lasttext[name] for name, _ in metrics))

def evalsummary(summarydir, sourcedir, csvname='GemmaResults.csv'):
    """Evaluate every summary in ``summarydir`` and save the averaged scores.

    Each file in ``summarydir`` with a same-named file in ``sourcedir`` is
    passed to ``modeleval`` three times; the three rounds are averaged per
    metric. The per-file rows plus a final ``'Average'`` row are written to
    ``ModelAverageScores/Results/<csvname>``.

    Parameters
    ----------
    summarydir : str
        Directory containing the summaries.
    sourcedir : str
        Directory containing the source texts, matched by file name.
    csvname : str, optional
        Name of the CSV file to write. The default keeps the previously
        hard-coded name; pass a model-specific name so that runs for
        different models do not overwrite each other.

    Returns
    -------
    None
    """
    scorecolumns = ['relevance_score', 'coherence_score', 'accuracy_score', 'fluency_score']
    rounds = 3  # each file is evaluated this many times and averaged
    data = []

    sourcefile = set(os.listdir(sourcedir))

    # sorted() gives a reproducible row order in the CSV
    for filename in sorted(os.listdir(summarydir)):
        if filename not in sourcefile:
            print(f"Missing file {filename}")
            continue

        summaryfilepath = os.path.join(summarydir, filename)
        sourcefilepath = os.path.join(sourcedir, filename)

        with open(summaryfilepath, 'r', encoding='utf-8') as file:
            summary = file.read()

        with open(sourcefilepath, 'r', encoding='utf-8') as file:
            source = file.read()

        roundscores = {column: [] for column in scorecolumns}

        for i in range(1, rounds + 1):
            print(f"\nRunning evaluation round {i} for file: {filename}")

            (relevanceScore, coherenceScore, accuracyScore, fluencyScore,
             relevancyText, coherenceText, accuracyText, fluencyText) = modeleval(summary, source)

            # append scores to respective lists
            roundscores['relevance_score'].append(relevanceScore)
            roundscores['coherence_score'].append(coherenceScore)
            roundscores['accuracy_score'].append(accuracyScore)
            roundscores['fluency_score'].append(fluencyScore)

            # evaluation results statements and scores
            print(f"Evaluation for {filename} (Round {i}):")
            print(f"Relevance: {relevancyText}")
            print(f"Coherence: {coherenceText}")
            print(f"Accuracy: {accuracyText}")
            print(f"Fluency: {fluencyText}")

            print(f"Relevance Score (Round {i}): {relevanceScore}")
            print(f"Coherence Score (Round {i}): {coherenceScore}")
            print(f"Accuracy Score (Round {i}): {accuracyScore}")
            print(f"Fluency Score (Round {i}): {fluencyScore}")
            print("************************************************")

        # average of the rounds for this file
        row = {'filename': filename}
        for column in scorecolumns:
            row[column] = sum(roundscores[column]) / len(roundscores[column])
        data.append(row)

    # Explicit columns keep the frame well-formed even when nothing matched.
    df = pd.DataFrame(data, columns=['filename'] + scorecolumns)

    # Append the overall average row; skipped when there are no rows to average.
    if not df.empty:
        avgdict = {'filename': 'Average'}
        avgdict.update(df[scorecolumns].mean().to_dict())
        df = pd.concat([df, pd.DataFrame([avgdict])], ignore_index=True)

    # saving the results to a CSV file
    savecsvpath = 'ModelAverageScores/Results/'
    os.makedirs(savecsvpath, exist_ok=True)
    csvpath = os.path.join(savecsvpath, csvname)
    df.to_csv(csvpath, index=False)

    print(f"\nEvaluation results saved to {csvpath}")


# directories where summaries and source files are stored
summarydir = '../3.LLM Model/LLM Summaries/Selected_LLM_Models_PYPDF2/llama3/llama3final/'
sourcedir = '../2.PDFExtraction/PYPDF2/PYPDF2textclean/'

# Runs three evaluation rounds per matching file and writes the averaged scores to CSV.
# NOTE(review): by default evalsummary saves to ModelAverageScores/Results/GemmaResults.csv
# although these are the llama3 summaries - verify the file name is intended.
evalsummary(summarydir, sourcedir)

print("All summaries have been evaluated successfully.")

# 215 minutes to execute



Running evaluation round 1 for file: INQ000396686.txt
Chunk (1, 1) Relevance Score: 3
Chunk (1, 1) Coherence Score: 4
Chunk (1, 1) Accuracy Score: 3
Chunk (1, 1) Fluency Score: 4
Evaluation for INQ000396686.txt (Round 1):
Relevance: Based on the provided source text, I would evaluate the summary chunk as follows:

Relevance score: 3/5

The summary chunk appears to be a notice from the Senior Information Risk Officer of Welsh Government regarding social media use by staff. The original text provides guidance on securing personal social media accounts and emphasizes the importance of using two-factor authentication (2FA). The notice also addresses some specific providers, such as Twitter, Facebook, and LinkedIn.

However, there is no direct relevance between this summary chunk and the actual text provided, which pertains to a security incident involving poor practice and policy breaches in Welsh Government social media use.
Coherence: Coherence score: 4

The summary is largely accurate,

## Gemma Evaluation

In [1]:
import pandas as pd
import re
import os
from openai import OpenAI

# **********************************************************************
# Prompt template filled in once per metric for every (source chunk, summary chunk) pair.
# Placeholders: {criteria} = one of the *_CRITERIA rubrics below, {document} = source chunk,
# {summary} = summary chunk, {metric} = metric name shown on the answer line.
PROMPTTEMP = """
Summary and source text are given below. Please evaluate the summary based on the source text provided. And follow the criteria for the marking purposes.

Evaluation Criteria:
{criteria}

Source Text Chunk:
{document}

Summary Chunk:
{summary}

Evaluation Form (scores ONLY):
{metric} score from 1 to 5
"""

# Rating rubrics, one per metric; each is substituted into {criteria} of PROMPTTEMP.
# Rubric: how well the summary captures the source text.
RELEVANCY_CRITERIA = """
Relevance(1-5): Please rate the summary from the scale (1-5) where 1 is lowest and 5 is the highest score.
The source text and the summary passed must be relevant.
Assess how well the summary has captured the text from source text.
Please rate from 1 to 5:
"""
# Rubric: structure and clarity of the summary.
COHERENCE_CRITERIA = """
Coherence(1-5): Please rate the summary from the scale (1-5) where 1 is lowest and 5 is the highest score.
How well does the source text and summary fits.
Make sure that the summary is well structured and clear.
Please rate from 1 to 5:
"""
# Rubric: factual agreement between summary and source.
ACCURACY_CRITERIA = """
Accuracy (1-5): Please rate the summary from the scale (1-5) where 1 is lowest and 5 is the highest score.
How accurate is the summary with the source text file passed.
Also, find if the summary is slightly out of context. 
Please rate from 1 to 5:
"""
# Rubric: spelling, grammar and sentence quality of the summary.
FLUENCY_CRITERIA = """
Fluency(1-5): Please rate the summary from the scale (1-5) where 1 is lowest and 5 is the highest score.
Check if the summary is well written in terms of spellings, grammars and ensure that the sentences are sensible.
Please rate the fluency from the scale 1 to 5, :
"""


# OpenAI-compatible client pointed at a server on localhost:11434.
# NOTE(review): 'ollama' is presumably a placeholder key for a local server - confirm.
client = OpenAI(
    base_url='http://localhost:11434/v1/',
    api_key='ollama',  
)

def findscore(response):
    """Extract the 1-5 rating from a judge model's reply.

    The standalone integers in ``response`` are scanned in order and the
    first one inside the valid rating range 1..5 is returned. Scanning past
    out-of-range numbers keeps a leading count or year (e.g. "12 sentences")
    from masking the actual rating.

    Parameters
    ----------
    response : str or None
        Raw text returned by the model.

    Returns
    -------
    int
        The rating (1-5), or 0 when ``response`` is empty/``None`` or holds
        no integer in that range.
    """
    if not response:
        return 0

    for match in re.finditer(r'\b\d+\b', response):
        value = int(match.group(0))
        if 1 <= value <= 5:
            return value

    return 0
    
def chunktext(text, label, max_tokens=4000, overlap=100):
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    Each chunk holds at most ``max_tokens`` words and starts with the last
    ``overlap`` words of the previous chunk. A new chunk is only started when
    at least one word lies beyond that carried-over overlap.

    Parameters
    ----------
    text : str
        Text to split.
    label : str
        Unused; kept so existing callers that pass a label keep working.
    max_tokens : int, optional
        Maximum number of words per chunk (default 4000).
    overlap : int, optional
        Number of words shared between consecutive chunks (default 100).

    Returns
    -------
    list of str
        The chunks, each re-joined with single spaces; empty for empty text.

    Raises
    ------
    ValueError
        If ``max_tokens`` is not positive or ``overlap`` is not in
        ``[0, max_tokens)``.
    """
    if max_tokens <= 0:
        raise ValueError("max_tokens must be positive")
    if not 0 <= overlap < max_tokens:
        raise ValueError("overlap must satisfy 0 <= overlap < max_tokens")

    words = text.split()
    if not words:
        return []

    # Consecutive chunks start `step` words apart so they share `overlap` words.
    step = max_tokens - overlap
    # Stop once only overlap words would remain; always emit the first chunk.
    stop = max(len(words) - overlap, 1)
    return [" ".join(words[start:start + max_tokens]) for start in range(0, stop, step)]

def modeleval(summary, source_text):
    """Score a summary against its source with the 'llama3' judge model.

    Both texts are chunked with chunktext(); every (source chunk, summary
    chunk) pair is rated once per metric (relevance, coherence, accuracy,
    fluency) and the per-pair ratings are averaged per metric.

    Args:
        summary: Summary text to evaluate.
        source_text: Source document the summary was written from.

    Returns:
        tuple: (avg relevance, avg coherence, avg accuracy, avg fluency,
        relevance text, coherence text, accuracy text, fluency text).
        The texts are the judge replies for the LAST chunk pair only, and
        are empty strings when there was nothing to evaluate. Averages are
        0 when there was nothing to evaluate.
    """
    sourcechunks = chunktext(source_text, "Source")  # source text chunks
    summarychunks = chunktext(summary, "Summary")     # summary text chunks

    # (label used in the prompt and the log line, rubric text), in report order
    metrics = (
        ("Relevance", RELEVANCY_CRITERIA),
        ("Coherence", COHERENCE_CRITERIA),
        ("Accuracy", ACCURACY_CRITERIA),
        ("Fluency", FLUENCY_CRITERIA),
    )
    scores = {metric: [] for metric, _ in metrics}
    # pre-filled so the return below is valid even if no chunk pair exists
    lasttext = {metric: "" for metric, _ in metrics}

    # for each combination of source and summary chunks
    for i, srchunks in enumerate(sourcechunks, 1):
        for j, summchunk in enumerate(summarychunks, 1):
            for metric, criteria in metrics:
                prompt = PROMPTTEMP.format(
                    criteria=criteria,
                    document=srchunks,
                    summary=summchunk,
                    metric=metric
                )
                response = client.chat.completions.create(
                    model='llama3',
                    messages=[{'role': 'user', 'content': prompt}],
                    max_tokens=150,
                )
                # content can be None; treat that as an empty reply
                text = response.choices[0].message.content or ""
                score = findscore(text)
                scores[metric].append(score)
                lasttext[metric] = text
                print(f"Chunk ({i}, {j}) {metric} Score: {score}")

    # calculate average scores across all chunk pairs
    averages = [
        sum(scores[metric]) / len(scores[metric]) if scores[metric] else 0
        for metric, _ in metrics
    ]
    texts = [lasttext[metric] for metric, _ in metrics]

    return (*averages, *texts)

def evalsummary(summarydir, sourcedir, rounds=3, csvname='GemmaResults.csv'):
    """Evaluate every summary that has a same-named source file and save a CSV.

    Each matched file is scored `rounds` times with modeleval(); the
    per-round scores are averaged into one row per file, and a final
    'Average' row holds the column means over all files.

    Args:
        summarydir: Directory containing the summary files.
        sourcedir: Directory containing the source files (paired by filename).
        rounds: Number of evaluation rounds per file.
        csvname: File name of the CSV written under ModelAverageScores/Results/.

    Returns:
        None. Nothing is written when no summary has a matching source file.
    """
    scorecols = ['relevance_score', 'coherence_score', 'accuracy_score', 'fluency_score']
    data = []

    sourcefile = set(os.listdir(sourcedir))

    # sorted so the processing order and the CSV row order are reproducible
    for filename in sorted(os.listdir(summarydir)):
        if filename not in sourcefile:
            print(f"Missing file {filename}")
            continue

        with open(os.path.join(summarydir, filename), 'r', encoding='utf-8') as file:
            summary = file.read()

        with open(os.path.join(sourcedir, filename), 'r', encoding='utf-8') as file:
            source = file.read()

        roundscores = {col: [] for col in scorecols}

        for i in range(1, rounds + 1):
            print(f"\nRunning evaluation round {i} for file: {filename}")

            (relevanceScore, coherenceScore, accuracyScore, fluencyScore,
             relevancyText, coherenceText, accuracyText, fluencyText) = modeleval(summary, source)

            # append scores to respective lists
            for col, value in zip(scorecols, (relevanceScore, coherenceScore, accuracyScore, fluencyScore)):
                roundscores[col].append(value)

            # evaluation results statements and scores
            print(f"Evaluation for {filename} (Round {i}):")
            print(f"Relevance: {relevancyText}")
            print(f"Coherence: {coherenceText}")
            print(f"Accuracy: {accuracyText}")
            print(f"Fluency: {fluencyText}")

            print(f"Relevance Score (Round {i}): {relevanceScore}")
            print(f"Coherence Score (Round {i}): {coherenceScore}")
            print(f"Accuracy Score (Round {i}): {accuracyScore}")
            print(f"Fluency Score (Round {i}): {fluencyScore}")
            print("************************************************")

        # average of the rounds for this file
        row = {'filename': filename}
        for col in scorecols:
            values = roundscores[col]
            row[col] = sum(values) / len(values) if values else 0
        data.append(row)

    # an empty frame has no score columns, so averaging it would raise KeyError
    if not data:
        print("No matching summary/source files found; nothing to save.")
        return

    df = pd.DataFrame(data)

    # append one 'Average' row with the mean of each score column
    avgdict = {'filename': 'Average', **df[scorecols].mean().to_dict()}
    df = pd.concat([df, pd.DataFrame([avgdict])], ignore_index=True)

    # saving the results to a CSV file
    savecsvpath = 'ModelAverageScores/Results/'
    os.makedirs(savecsvpath, exist_ok=True)
    csvpath = os.path.join(savecsvpath, csvname)
    df.to_csv(csvpath, index=False)

    print(f"\nEvaluation results saved to {csvpath}")

# directories where summaries and source files are stored
# (evalsummary pairs files that have the same filename in both directories)
summarydir = '../3.LLM Model/LLM Summaries/Selected_LLM_Models_PYPDF2/gemma/gemmafinal/'
sourcedir = '../2.PDFExtraction/PYPDF2/PYPDF2textclean/'

# runs the evaluation for every paired file and writes the results CSV
evalsummary(summarydir, sourcedir)

print("All summaries have been evaluated successfully.")

# 198 minutes to execute


Running evaluation round 1 for file: INQ000350513.txt
Chunk (1, 1) Relevance Score: 4
Chunk (1, 1) Coherence Score: 4
Chunk (1, 1) Accuracy Score: 2
Chunk (1, 1) Fluency Score: 4
Evaluation for INQ000350513.txt (Round 1):
Relevance: Evaluation:

Relevance: 4/5

The summary is mostly relevant to the source text, capturing the main points of Chris Whitty's email regarding UK strategy for managing COVID-19 outbreak. The summary accurately reflects the short-term and long-term measures proposed by Chris Whitty, including immediate containment, short-term suppression, and long-term mitigation. However, it does not fully capture the nuances of Rob Good's ideas about a more refined approach to the end-game and his emphasis on eliminating societal mixing.

Note: I gave a score of 4 out of 5 because while the summary covers the core points, it does not entirely convey Rob Good's concerns or the discussion about SAGE twin peak model.
Coherence: Based on the source text, I would give the summary

## Gemma2 Evaluation

In [1]:
import pandas as pd
import re
import os
from openai import OpenAI

# **********************************************************************
# evaluation using prompt template
# Filled in by modeleval via PROMPTTEMP.format() with four fields:
# {criteria} (one rubric below), {document} (source chunk),
# {summary} (summary chunk) and {metric} (metric name).
PROMPTTEMP = """
Summary and source text are given below. Please evaluate the summary based on the source text provided. And follow the criteria for the marking purposes.

Evaluation Criteria:
{criteria}

Source Text Chunk:
{document}

Summary Chunk:
{summary}

Evaluation Form (scores ONLY):
{metric} score from 1 to 5
"""

# guidance for rating
# Each rubric below is substituted into the {criteria} slot of PROMPTTEMP.
RELEVANCY_CRITERIA = """
Relevance(1-5): Please rate the summary from the scale (1-5) where 1 is lowest and 5 is the highest score.
The source text and the summary passed must be relevant.
Assess how well the summary has captured the text from source text.
Please rate from 1 to 5:
"""
# Rubric text for the coherence metric.
COHERENCE_CRITERIA = """
Coherence(1-5): Please rate the summary from the scale (1-5) where 1 is lowest and 5 is the highest score.
How well does the source text and summary fits.
Make sure that the summary is well structured and clear.
Please rate from 1 to 5:
"""
# Rubric text for the accuracy metric.
ACCURACY_CRITERIA = """
Accuracy (1-5): Please rate the summary from the scale (1-5) where 1 is lowest and 5 is the highest score.
How accurate is the summary with the source text file passed.
Also, find if the summary is slightly out of context. 
Please rate from 1 to 5:
"""
# Rubric text for the fluency metric.
FLUENCY_CRITERIA = """
Fluency(1-5): Please rate the summary from the scale (1-5) where 1 is lowest and 5 is the highest score.
Check if the summary is well written in terms of spellings, grammars and ensure that the sentences are sensible.
Please rate the fluency from the scale 1 to 5, :
"""


# OpenAI-compatible client pointed at a server on localhost:11434.
# NOTE(review): presumably a local Ollama server, with 'ollama' as a
# placeholder key rather than a real secret - confirm.
client = OpenAI(
    base_url='http://localhost:11434/v1/',
    api_key='ollama',  
)

def findscore(response):
    """Extract the 1-5 rating from the judge model's free-text reply.

    An explicit "N/5" or "N out of 5" rating is preferred. Otherwise the
    first standalone integer between 1 and 5 is used, after dropping
    restatements of the scale itself ("1-5", "1 to 5") so they are not
    mistaken for the score.

    Args:
        response: Reply text from the judge model; may be None or empty.

    Returns:
        int: The rating (1-5), or 0 when no valid rating is found.
    """
    if not response:
        return 0

    # explicit rating such as "4/5" or "4 out of 5"; the lookbehind skips
    # the fractional digit of decimals like "4.5/5"
    rated = re.search(r'(?<![\d.])([1-5])\s*(?:/|out of)\s*5\b', response, re.IGNORECASE)
    if rated:
        return int(rated.group(1))

    # remove echoes of the rating scale before looking for a bare number
    cleaned = re.sub(r'\b1\s*(?:-|to)\s*5\b', ' ', response, flags=re.IGNORECASE)
    for number in re.finditer(r'\b\d+\b', cleaned):
        score = int(number.group(0))
        if 1 <= score <= 5:
            return score

    return 0
    
def chunktext(text, label, max_tokens=4000, overlap=100):
    """Split text into overlapping chunks of whitespace-separated words.

    "Tokens" here are words produced by str.split(), not model tokens.

    Args:
        text: Text to split.
        label: Name of the text being chunked; not used by the chunking
            itself, kept so existing callers keep working.
        max_tokens: Maximum number of words per chunk.
        overlap: Number of trailing words of a chunk that are repeated at
            the start of the next chunk.

    Returns:
        list[str]: Chunks in document order; empty when text has no words.

    Raises:
        ValueError: If max_tokens is not positive or overlap is not in
            the range 0 <= overlap < max_tokens.
    """
    if max_tokens <= 0 or not 0 <= overlap < max_tokens:
        raise ValueError("max_tokens must be positive and 0 <= overlap < max_tokens")

    chunks = []
    chunk = []

    for word in text.split():
        if len(chunk) >= max_tokens:
            chunks.append(" ".join(chunk))
            # chunk[-0:] would keep the whole chunk, so overlap == 0 is explicit
            chunk = chunk[-overlap:] if overlap else []

        chunk.append(word)

    if chunk:
        chunks.append(" ".join(chunk))

    return chunks

def modeleval(summary, source_text):
    """Score a summary against its source with the 'llama3' judge model.

    Both texts are chunked with chunktext(); every (source chunk, summary
    chunk) pair is rated once per metric (relevance, coherence, accuracy,
    fluency) and the per-pair ratings are averaged per metric.

    Args:
        summary: Summary text to evaluate.
        source_text: Source document the summary was written from.

    Returns:
        tuple: (avg relevance, avg coherence, avg accuracy, avg fluency,
        relevance text, coherence text, accuracy text, fluency text).
        The texts are the judge replies for the LAST chunk pair only, and
        are empty strings when there was nothing to evaluate. Averages are
        0 when there was nothing to evaluate.
    """
    sourcechunks = chunktext(source_text, "Source")  # source text chunks
    summarychunks = chunktext(summary, "Summary")     # summary text chunks

    # (label used in the prompt and the log line, rubric text), in report order
    metrics = (
        ("Relevance", RELEVANCY_CRITERIA),
        ("Coherence", COHERENCE_CRITERIA),
        ("Accuracy", ACCURACY_CRITERIA),
        ("Fluency", FLUENCY_CRITERIA),
    )
    scores = {metric: [] for metric, _ in metrics}
    # pre-filled so the return below is valid even if no chunk pair exists
    lasttext = {metric: "" for metric, _ in metrics}

    # for each combination of source and summary chunks
    for i, srchunks in enumerate(sourcechunks, 1):
        for j, summchunk in enumerate(summarychunks, 1):
            for metric, criteria in metrics:
                prompt = PROMPTTEMP.format(
                    criteria=criteria,
                    document=srchunks,
                    summary=summchunk,
                    metric=metric
                )
                response = client.chat.completions.create(
                    model='llama3',
                    messages=[{'role': 'user', 'content': prompt}],
                    max_tokens=150,
                )
                # content can be None; treat that as an empty reply
                text = response.choices[0].message.content or ""
                score = findscore(text)
                scores[metric].append(score)
                lasttext[metric] = text
                print(f"Chunk ({i}, {j}) {metric} Score: {score}")

    # calculate average scores across all chunk pairs
    averages = [
        sum(scores[metric]) / len(scores[metric]) if scores[metric] else 0
        for metric, _ in metrics
    ]
    texts = [lasttext[metric] for metric, _ in metrics]

    return (*averages, *texts)

def evalsummary(summarydir, sourcedir, rounds=3, csvname='Gemma2Results.csv'):
    """Evaluate every summary that has a same-named source file and save a CSV.

    Each matched file is scored `rounds` times with modeleval(); the
    per-round scores are averaged into one row per file, and a final
    'Average' row holds the column means over all files.

    Args:
        summarydir: Directory containing the summary files.
        sourcedir: Directory containing the source files (paired by filename).
        rounds: Number of evaluation rounds per file.
        csvname: File name of the CSV written under ModelAverageScores/Results/.

    Returns:
        None. Nothing is written when no summary has a matching source file.
    """
    scorecols = ['relevance_score', 'coherence_score', 'accuracy_score', 'fluency_score']
    data = []

    sourcefile = set(os.listdir(sourcedir))

    # sorted so the processing order and the CSV row order are reproducible
    for filename in sorted(os.listdir(summarydir)):
        if filename not in sourcefile:
            print(f"Missing file {filename}")
            continue

        with open(os.path.join(summarydir, filename), 'r', encoding='utf-8') as file:
            summary = file.read()

        with open(os.path.join(sourcedir, filename), 'r', encoding='utf-8') as file:
            source = file.read()

        roundscores = {col: [] for col in scorecols}

        for i in range(1, rounds + 1):
            print(f"\nRunning evaluation round {i} for file: {filename}")

            (relevanceScore, coherenceScore, accuracyScore, fluencyScore,
             relevancyText, coherenceText, accuracyText, fluencyText) = modeleval(summary, source)

            # append scores to respective lists
            for col, value in zip(scorecols, (relevanceScore, coherenceScore, accuracyScore, fluencyScore)):
                roundscores[col].append(value)

            # evaluation results statements and scores
            print(f"Evaluation for {filename} (Round {i}):")
            print(f"Relevance: {relevancyText}")
            print(f"Coherence: {coherenceText}")
            print(f"Accuracy: {accuracyText}")
            print(f"Fluency: {fluencyText}")

            print(f"Relevance Score (Round {i}): {relevanceScore}")
            print(f"Coherence Score (Round {i}): {coherenceScore}")
            print(f"Accuracy Score (Round {i}): {accuracyScore}")
            print(f"Fluency Score (Round {i}): {fluencyScore}")
            print("************************************************")

        # average of the rounds for this file
        row = {'filename': filename}
        for col in scorecols:
            values = roundscores[col]
            row[col] = sum(values) / len(values) if values else 0
        data.append(row)

    # an empty frame has no score columns, so averaging it would raise KeyError
    if not data:
        print("No matching summary/source files found; nothing to save.")
        return

    df = pd.DataFrame(data)

    # append one 'Average' row with the mean of each score column
    avgdict = {'filename': 'Average', **df[scorecols].mean().to_dict()}
    df = pd.concat([df, pd.DataFrame([avgdict])], ignore_index=True)

    # saving the results to a CSV file
    savecsvpath = 'ModelAverageScores/Results/'
    os.makedirs(savecsvpath, exist_ok=True)
    csvpath = os.path.join(savecsvpath, csvname)
    df.to_csv(csvpath, index=False)

    print(f"\nEvaluation results saved to {csvpath}")

# directories where summaries and source files are stored
# (evalsummary pairs files that have the same filename in both directories)
summarydir = '../3.LLM Model/LLM Summaries/Selected_LLM_Models_PYPDF2/gemma2/gemma2final/'
sourcedir = '../2.PDFExtraction/PYPDF2/PYPDF2textclean/'

# runs the evaluation for every paired file and writes the results CSV
evalsummary(summarydir, sourcedir)

print("All summaries have been evaluated successfully.")

# 222 minutes to execute


Running evaluation round 1 for file: INQ000350057.txt
Chunk (1, 1) Relevance Score: 3
Chunk (1, 1) Coherence Score: 4
Chunk (1, 1) Accuracy Score: 4
Chunk (1, 1) Fluency Score: 1
Evaluation for INQ000350057.txt (Round 1):
Relevance: **Relevance: 3/5**

The summary provided is somewhat relevant to the two source texts, but it does not fully capture their essence. The sources discuss school closures during a pandemic, barriers to adhering to COVID-19 guidelines in Wales, and strategies for managing transmission during the festive season. While the summary mentions some of these points, such as considering school closures as a last resort and emphasizing clear communication and support, it dilutes the importance of each source's specific findings.

The first source highlights the potential harms caused by school closures and suggests prioritizing other mitigation measures. The second source discusses factors that affect adherence to COVID-19 guidelines in Wales, such as understanding of 

## Mistral Evaluation

In [3]:
import pandas as pd
import re
import os
from openai import OpenAI

# **********************************************************************
# evaluation using prompt template
# Filled in by modeleval via PROMPTTEMP.format() with four fields:
# {criteria} (one rubric below), {document} (source chunk),
# {summary} (summary chunk) and {metric} (metric name).
PROMPTTEMP = """
Summary and source text are given below. Please evaluate the summary based on the source text provided. And follow the criteria for the marking purposes.

Evaluation Criteria:
{criteria}

Source Text Chunk:
{document}

Summary Chunk:
{summary}

Evaluation Form (scores ONLY):
{metric} score from 1 to 5
"""

# guidance for rating
# Each rubric below is substituted into the {criteria} slot of PROMPTTEMP.
RELEVANCY_CRITERIA = """
Relevance(1-5): Please rate the summary from the scale (1-5) where 1 is lowest and 5 is the highest score.
The source text and the summary passed must be relevant.
Assess how well the summary has captured the text from source text.
Please rate from 1 to 5:
"""
# Rubric text for the coherence metric.
COHERENCE_CRITERIA = """
Coherence(1-5): Please rate the summary from the scale (1-5) where 1 is lowest and 5 is the highest score.
How well does the source text and summary fits.
Make sure that the summary is well structured and clear.
Please rate from 1 to 5:
"""
# Rubric text for the accuracy metric.
ACCURACY_CRITERIA = """
Accuracy (1-5): Please rate the summary from the scale (1-5) where 1 is lowest and 5 is the highest score.
How accurate is the summary with the source text file passed.
Also, find if the summary is slightly out of context. 
Please rate from 1 to 5:
"""
# Rubric text for the fluency metric.
FLUENCY_CRITERIA = """
Fluency(1-5): Please rate the summary from the scale (1-5) where 1 is lowest and 5 is the highest score.
Check if the summary is well written in terms of spellings, grammars and ensure that the sentences are sensible.
Please rate the fluency from the scale 1 to 5, :
"""


# OpenAI-compatible client pointed at a server on localhost:11434.
# NOTE(review): presumably a local Ollama server, with 'ollama' as a
# placeholder key rather than a real secret - confirm.
client = OpenAI(
    base_url='http://localhost:11434/v1/',
    api_key='ollama',  
)

def findscore(response):
    """Extract the 1-5 rating from the judge model's free-text reply.

    An explicit "N/5" or "N out of 5" rating is preferred. Otherwise the
    first standalone integer between 1 and 5 is used, after dropping
    restatements of the scale itself ("1-5", "1 to 5") so they are not
    mistaken for the score.

    Args:
        response: Reply text from the judge model; may be None or empty.

    Returns:
        int: The rating (1-5), or 0 when no valid rating is found.
    """
    if not response:
        return 0

    # explicit rating such as "4/5" or "4 out of 5"; the lookbehind skips
    # the fractional digit of decimals like "4.5/5"
    rated = re.search(r'(?<![\d.])([1-5])\s*(?:/|out of)\s*5\b', response, re.IGNORECASE)
    if rated:
        return int(rated.group(1))

    # remove echoes of the rating scale before looking for a bare number
    cleaned = re.sub(r'\b1\s*(?:-|to)\s*5\b', ' ', response, flags=re.IGNORECASE)
    for number in re.finditer(r'\b\d+\b', cleaned):
        score = int(number.group(0))
        if 1 <= score <= 5:
            return score

    return 0
    
def chunktext(text, label, max_tokens=4000, overlap=100):
    """Split text into overlapping chunks of whitespace-separated words.

    "Tokens" here are words produced by str.split(), not model tokens.

    Args:
        text: Text to split.
        label: Name of the text being chunked; not used by the chunking
            itself, kept so existing callers keep working.
        max_tokens: Maximum number of words per chunk.
        overlap: Number of trailing words of a chunk that are repeated at
            the start of the next chunk.

    Returns:
        list[str]: Chunks in document order; empty when text has no words.

    Raises:
        ValueError: If max_tokens is not positive or overlap is not in
            the range 0 <= overlap < max_tokens.
    """
    if max_tokens <= 0 or not 0 <= overlap < max_tokens:
        raise ValueError("max_tokens must be positive and 0 <= overlap < max_tokens")

    chunks = []
    chunk = []

    for word in text.split():
        if len(chunk) >= max_tokens:
            chunks.append(" ".join(chunk))
            # chunk[-0:] would keep the whole chunk, so overlap == 0 is explicit
            chunk = chunk[-overlap:] if overlap else []

        chunk.append(word)

    if chunk:
        chunks.append(" ".join(chunk))

    return chunks

def modeleval(summary, source_text):
    """Score a summary against its source with the 'llama3' judge model.

    Both texts are chunked with chunktext(); every (source chunk, summary
    chunk) pair is rated once per metric (relevance, coherence, accuracy,
    fluency) and the per-pair ratings are averaged per metric.

    Args:
        summary: Summary text to evaluate.
        source_text: Source document the summary was written from.

    Returns:
        tuple: (avg relevance, avg coherence, avg accuracy, avg fluency,
        relevance text, coherence text, accuracy text, fluency text).
        The texts are the judge replies for the LAST chunk pair only, and
        are empty strings when there was nothing to evaluate. Averages are
        0 when there was nothing to evaluate.
    """
    sourcechunks = chunktext(source_text, "Source")  # source text chunks
    summarychunks = chunktext(summary, "Summary")     # summary text chunks

    # (label used in the prompt and the log line, rubric text), in report order
    metrics = (
        ("Relevance", RELEVANCY_CRITERIA),
        ("Coherence", COHERENCE_CRITERIA),
        ("Accuracy", ACCURACY_CRITERIA),
        ("Fluency", FLUENCY_CRITERIA),
    )
    scores = {metric: [] for metric, _ in metrics}
    # pre-filled so the return below is valid even if no chunk pair exists
    lasttext = {metric: "" for metric, _ in metrics}

    # for each combination of source and summary chunks
    for i, srchunks in enumerate(sourcechunks, 1):
        for j, summchunk in enumerate(summarychunks, 1):
            for metric, criteria in metrics:
                prompt = PROMPTTEMP.format(
                    criteria=criteria,
                    document=srchunks,
                    summary=summchunk,
                    metric=metric
                )
                response = client.chat.completions.create(
                    model='llama3',
                    messages=[{'role': 'user', 'content': prompt}],
                    max_tokens=150,
                )
                # content can be None; treat that as an empty reply
                text = response.choices[0].message.content or ""
                score = findscore(text)
                scores[metric].append(score)
                lasttext[metric] = text
                print(f"Chunk ({i}, {j}) {metric} Score: {score}")

    # calculate average scores across all chunk pairs
    averages = [
        sum(scores[metric]) / len(scores[metric]) if scores[metric] else 0
        for metric, _ in metrics
    ]
    texts = [lasttext[metric] for metric, _ in metrics]

    return (*averages, *texts)

def evalsummary(summarydir, sourcedir, rounds=3, csvname='MistralResults.csv'):
    """Evaluate every summary that has a same-named source file and save a CSV.

    Each matched file is scored `rounds` times with modeleval(); the
    per-round scores are averaged into one row per file, and a final
    'Average' row holds the column means over all files.

    Args:
        summarydir: Directory containing the summary files.
        sourcedir: Directory containing the source files (paired by filename).
        rounds: Number of evaluation rounds per file.
        csvname: File name of the CSV written under ModelAverageScores/Results/.

    Returns:
        None. Nothing is written when no summary has a matching source file.
    """
    scorecols = ['relevance_score', 'coherence_score', 'accuracy_score', 'fluency_score']
    data = []

    sourcefile = set(os.listdir(sourcedir))

    # sorted so the processing order and the CSV row order are reproducible
    for filename in sorted(os.listdir(summarydir)):
        if filename not in sourcefile:
            print(f"Missing file {filename}")
            continue

        with open(os.path.join(summarydir, filename), 'r', encoding='utf-8') as file:
            summary = file.read()

        with open(os.path.join(sourcedir, filename), 'r', encoding='utf-8') as file:
            source = file.read()

        roundscores = {col: [] for col in scorecols}

        for i in range(1, rounds + 1):
            print(f"\nRunning evaluation round {i} for file: {filename}")

            (relevanceScore, coherenceScore, accuracyScore, fluencyScore,
             relevancyText, coherenceText, accuracyText, fluencyText) = modeleval(summary, source)

            # append scores to respective lists
            for col, value in zip(scorecols, (relevanceScore, coherenceScore, accuracyScore, fluencyScore)):
                roundscores[col].append(value)

            # evaluation results statements and scores
            print(f"Evaluation for {filename} (Round {i}):")
            print(f"Relevance: {relevancyText}")
            print(f"Coherence: {coherenceText}")
            print(f"Accuracy: {accuracyText}")
            print(f"Fluency: {fluencyText}")

            print(f"Relevance Score (Round {i}): {relevanceScore}")
            print(f"Coherence Score (Round {i}): {coherenceScore}")
            print(f"Accuracy Score (Round {i}): {accuracyScore}")
            print(f"Fluency Score (Round {i}): {fluencyScore}")
            print("************************************************")

        # average of the rounds for this file
        row = {'filename': filename}
        for col in scorecols:
            values = roundscores[col]
            row[col] = sum(values) / len(values) if values else 0
        data.append(row)

    # an empty frame has no score columns, so averaging it would raise KeyError
    if not data:
        print("No matching summary/source files found; nothing to save.")
        return

    df = pd.DataFrame(data)

    # append one 'Average' row with the mean of each score column
    avgdict = {'filename': 'Average', **df[scorecols].mean().to_dict()}
    df = pd.concat([df, pd.DataFrame([avgdict])], ignore_index=True)

    # saving the results to a CSV file
    savecsvpath = 'ModelAverageScores/Results/'
    os.makedirs(savecsvpath, exist_ok=True)
    csvpath = os.path.join(savecsvpath, csvname)
    df.to_csv(csvpath, index=False)

    print(f"\nEvaluation results saved to {csvpath}")
    
# directories where summaries and source files are stored
# (evalsummary pairs files that have the same filename in both directories)
summarydir = '../3.LLM Model/LLM Summaries/Selected_LLM_Models_PYPDF2/mistral/mistralfinal/'
sourcedir = '../2.PDFExtraction/PYPDF2/PYPDF2textclean/'

# runs the evaluation for every paired file and writes the results CSV
evalsummary(summarydir, sourcedir)

print("All summaries have been evaluated successfully.")

# 216 minutes to execute


Running evaluation round 1 for file: INQ000350057.txt
Chunk (1, 1) Relevance Score: 4
Chunk (1, 1) Coherence Score: 3
Chunk (1, 1) Accuracy Score: 4
Chunk (1, 1) Fluency Score: 2
Evaluation for INQ000350057.txt (Round 1):
Relevance: Evaluation:

Relevance score: 4/5

The summary accurately conveys the main points of the original text, including the importance of recognizing minor breaches of guidelines, the impact of increased mixing during festive seasons, and the potential risks of complacency after vaccine availability. However, it does not fully capture the nuances of the text, particularly regarding the need for a rational basis in public communication about risk and the potential inequities in enforcement.

Score: 4/5
Coherence: Based on the summary provided, I would evaluate the coherence as a score of 3 out of 5.

The summary accurately captures some main points from the text, such as:

* The importance of recognizing that minor breaches of guidelines can have a significant im

## Qwen Evaluation

In [2]:
import pandas as pd
import re
import os
from openai import OpenAI

# **********************************************************************
# Prompt template sent to the evaluation model. The {criteria}, {document},
# {summary} and {metric} placeholders are filled in with str.format() by
# modeleval() for every (source chunk, summary chunk, metric) combination.
PROMPTTEMP = """
Summary and source text are given below. Please evaluate the summary based on the source text provided. And follow the criteria for the marking purposes.

Evaluation Criteria:
{criteria}

Source Text Chunk:
{document}

Summary Chunk:
{summary}

Evaluation Form (scores ONLY):
{metric} score from 1 to 5
"""

# Rating guidance inserted into the {criteria} slot of PROMPTTEMP, one text
# per metric: relevance, coherence, accuracy and fluency.
RELEVANCY_CRITERIA = """
Relevance(1-5): Please rate the summary from the scale (1-5) where 1 is lowest and 5 is the highest score.
The source text and the summary passed must be relevant.
Assess how well the summary has captured the text from source text.
Please rate from 1 to 5:
"""
COHERENCE_CRITERIA = """
Coherence(1-5): Please rate the summary from the scale (1-5) where 1 is lowest and 5 is the highest score.
How well does the source text and summary fits.
Make sure that the summary is well structured and clear.
Please rate from 1 to 5:
"""
ACCURACY_CRITERIA = """
Accuracy (1-5): Please rate the summary from the scale (1-5) where 1 is lowest and 5 is the highest score.
How accurate is the summary with the source text file passed.
Also, find if the summary is slightly out of context. 
Please rate from 1 to 5:
"""
FLUENCY_CRITERIA = """
Fluency(1-5): Please rate the summary from the scale (1-5) where 1 is lowest and 5 is the highest score.
Check if the summary is well written in terms of spellings, grammars and ensure that the sentences are sensible.
Please rate the fluency from the scale 1 to 5, :
"""


# OpenAI-compatible client pointed at a server on localhost:11434
# (presumably a local Ollama instance - confirm). modeleval() sends all of
# its evaluation requests through this client.
client = OpenAI(
    base_url='http://localhost:11434/v1/',
    api_key='ollama',  # placeholder value, not a real secret - TODO confirm the local server ignores it
)

def findscore(response):
    """Extract a 1-5 rating from the evaluator's free-text reply.

    An explicit "N/5" or "N out of 5" rating is preferred. Otherwise the
    first standalone integer between 1 and 5 is used. Integers outside that
    range (for example the "19" in "COVID-19") are skipped instead of ending
    the search.

    Parameters
    ----------
    response : str or None
        Raw text returned by the evaluation model.

    Returns
    -------
    int
        The rating (1-5), or 0 when no valid rating can be found.
    """
    if not response:
        return 0

    # an explicit rating such as "4/5" or "3 out of 5" is the most reliable signal;
    # the look-behind stops "4.5/5" from being read as "5/5"
    explicit = re.search(r'(?<![\d.])([1-5])\s*(?:/|out of)\s*5\b', response, re.IGNORECASE)
    if explicit:
        return int(explicit.group(1))

    # otherwise take the first standalone integer that is a valid rating
    for candidate in re.finditer(r'\b\d+\b', response):
        value = int(candidate.group(0))
        if 1 <= value <= 5:
            return value

    return 0
    
def chunktext(text, label, max_tokens=4000, overlap=100):
    """Split text into overlapping chunks of whitespace-separated words.

    Every chunk holds at most `max_tokens` words, and each chunk after the
    first starts with the last `overlap` words of the previous one. A new
    chunk is only started when there are words beyond the previous chunk.

    Parameters
    ----------
    text : str
        Text to split.
    label : str
        Name of the text (e.g. "Source"). Not used by the function; kept so
        existing callers keep working.
    max_tokens : int, optional
        Maximum number of words per chunk.
    overlap : int, optional
        Number of words repeated at the start of the next chunk.

    Returns
    -------
    list of str
        The chunks in order; an empty list when the text has no words.

    Raises
    ------
    ValueError
        If max_tokens < 1 or overlap is not in the range [0, max_tokens).
    """
    if max_tokens < 1:
        raise ValueError("max_tokens must be at least 1")
    if not 0 <= overlap < max_tokens:
        raise ValueError("overlap must satisfy 0 <= overlap < max_tokens")

    words = text.split()
    chunks = []
    start = 0

    while start < len(words):
        end = start + max_tokens
        chunks.append(" ".join(words[start:end]))
        if end >= len(words):
            break
        # begin the next chunk `overlap` words before this one ended
        start = end - overlap

    return chunks

def modeleval(summary, source_text):
    """Rate a summary against its source text on four metrics.

    Both texts are split with chunktext(). Every source chunk is paired with
    every summary chunk, and for each pair the evaluation model is asked for
    a relevance, coherence, accuracy and fluency rating. The ratings are
    parsed with findscore() and averaged per metric over all pairs; a reply
    that cannot be parsed counts as 0.

    Parameters
    ----------
    summary : str
        Summary text to evaluate.
    source_text : str
        Source text the summary was produced from.

    Returns
    -------
    tuple
        (avg relevance, avg coherence, avg accuracy, avg fluency,
        relevance reply, coherence reply, accuracy reply, fluency reply).
        The replies are the model's raw text for the last chunk pair.
        When either text yields no chunks, the averages are 0 and the
        replies are empty strings.
    """
    sourcechunks = chunktext(source_text, "Source")  # source text chunks
    summarychunks = chunktext(summary, "Summary")    # summary text chunks

    # (metric name used in the prompt and the log line, rating guidance), in reporting order
    metrics = (
        ("Relevance", RELEVANCY_CRITERIA),
        ("Coherence", COHERENCE_CRITERIA),
        ("Accuracy", ACCURACY_CRITERIA),
        ("Fluency", FLUENCY_CRITERIA),
    )
    scores = {name: [] for name, _ in metrics}
    # initialised up front so the return value is defined even without any chunk pair
    lastreply = {name: "" for name, _ in metrics}

    # for each combination of source and summary chunks
    for i, srchunks in enumerate(sourcechunks):
        for j, summchunk in enumerate(summarychunks):
            for name, criteria in metrics:
                prompt = PROMPTTEMP.format(
                    criteria=criteria,
                    document=srchunks,
                    summary=summchunk,
                    metric=name
                )
                response = client.chat.completions.create(
                    model='llama3',
                    messages=[{'role': 'user', 'content': prompt}],
                    max_tokens=150,
                )
                # the message content may be None; treat that as an empty reply
                replytext = response.choices[0].message.content or ""
                chunkscore = findscore(replytext)

                scores[name].append(chunkscore)
                lastreply[name] = replytext
                print(f"Chunk ({i+1}, {j+1}) {name} Score: {chunkscore}")

    def average(values):
        # mean of the collected ratings, 0 when nothing was rated
        return sum(values) / len(values) if values else 0

    return (average(scores["Relevance"]), average(scores["Coherence"]),
            average(scores["Accuracy"]), average(scores["Fluency"]),
            lastreply["Relevance"], lastreply["Coherence"],
            lastreply["Accuracy"], lastreply["Fluency"])

def evalsummary(summarydir, sourcedir, csvname='QwenResults.csv', rounds=3):
    """Score every summary against its source text and save the averages to CSV.

    Files are paired by identical filename in the two directories. Each pair
    is evaluated `rounds` times with modeleval() and the per-round scores are
    averaged. The per-file averages plus a final 'Average' row are written to
    ModelAverageScores/Results/<csvname>.

    Parameters
    ----------
    summarydir : str
        Directory holding the summary files.
    sourcedir : str
        Directory holding the source text files.
    csvname : str, optional
        File name of the result CSV inside ModelAverageScores/Results/.
    rounds : int, optional
        Number of evaluation rounds per file.

    Returns
    -------
    None
        Nothing is written when no file pair could be evaluated.
    """
    scorecols = ['relevance_score', 'coherence_score', 'accuracy_score', 'fluency_score']
    data = []

    sourcefile = set(os.listdir(sourcedir))

    # os.listdir() returns names in arbitrary order; sorting keeps the
    # evaluation order and the CSV row order stable between runs
    for filename in sorted(os.listdir(summarydir)):
        if filename not in sourcefile:
            print(f"Missing file {filename}")
            continue

        summaryfilepath = os.path.join(summarydir, filename)
        sourcefilepath = os.path.join(sourcedir, filename)

        # skip sub-directories (e.g. .ipynb_checkpoints) that exist on both sides
        if not (os.path.isfile(summaryfilepath) and os.path.isfile(sourcefilepath)):
            continue

        with open(summaryfilepath, 'r', encoding='utf-8') as file:
            summary = file.read()

        with open(sourcefilepath, 'r', encoding='utf-8') as file:
            source = file.read()

        roundscores = {col: [] for col in scorecols}

        for i in range(1, rounds + 1):
            print(f"\nRunning evaluation round {i} for file: {filename}")

            (relevanceScore, coherenceScore, accuracyScore, fluencyScore,
             relevancyText, coherenceText, accuracyText, fluencyText) = modeleval(summary, source)

            # collect this round's scores
            roundscores['relevance_score'].append(relevanceScore)
            roundscores['coherence_score'].append(coherenceScore)
            roundscores['accuracy_score'].append(accuracyScore)
            roundscores['fluency_score'].append(fluencyScore)

            # evaluation results statements and scores
            print(f"Evaluation for {filename} (Round {i}):")
            print(f"Relevance: {relevancyText}")
            print(f"Coherence: {coherenceText}")
            print(f"Accuracy: {accuracyText}")
            print(f"Fluency: {fluencyText}")

            print(f"Relevance Score (Round {i}): {relevanceScore}")
            print(f"Coherence Score (Round {i}): {coherenceScore}")
            print(f"Accuracy Score (Round {i}): {accuracyScore}")
            print(f"Fluency Score (Round {i}): {fluencyScore}")
            print("************************************************")

        # average of all rounds for this file
        row = {'filename': filename}
        for col in scorecols:
            values = roundscores[col]
            row[col] = sum(values) / len(values) if values else 0
        data.append(row)

    if not data:
        # without rows the score columns do not exist and .mean() would fail
        print("No matching summary/source files found; nothing was saved.")
        return

    df = pd.DataFrame(data)

    # append one row holding the mean of every score column
    avgrow = {'filename': 'Average', **df[scorecols].mean().to_dict()}
    df = pd.concat([df, pd.DataFrame([avgrow])], ignore_index=True)

    savecsvpath = 'ModelAverageScores/Results/'
    os.makedirs(savecsvpath, exist_ok=True)
    csvpath = os.path.join(savecsvpath, csvname)
    # written once, with the 'Average' row already included
    df.to_csv(csvpath, index=False)

    print(f"\nEvaluation results saved to {csvpath}")



# Input locations for this run: the model-generated summaries and the cleaned
# source texts they are scored against.
summarydir = '../3.LLM Model/LLM Summaries/Selected_LLM_Models_PYPDF2/qwen/qwenfinal/'
sourcedir = '../2.PDFExtraction/PYPDF2/PYPDF2textclean/'

# Evaluate every summary that has a source file of the same name.
evalsummary(summarydir, sourcedir)

print("All summaries have been evaluated successfully.")

# Recorded run time for this cell: 203 minutes.


Running evaluation round 1 for file: INQ000350057.txt
Chunk (1, 1) Relevance Score: 3
Chunk (1, 1) Coherence Score: 4
Chunk (1, 1) Accuracy Score: 4
Chunk (1, 1) Fluency Score: 0
Evaluation for INQ000350057.txt (Round 1):
Relevance: To evaluate the summary, I will compare it with the provided source text. Here's my assessment:

**Relevance score:** 3/5

The summary mentions that the original text discusses the current situation in Wales regarding COVID-19 infections, introduces non-pharmaceutical interventions (NPIs) in the pre-Christmas period, and talks about social distancing, quarantining, and self-isolation. These points are indeed present in the source text. However, the summary does not accurately reflect the scope of the original text, which also discusses epidemiological findings on high-risk exposure settings for COVID-19 and the potential impacts of Christmas on transmission.

The summary fails to mention these additional topics, which constitutes a significant portion of t

## Solar Evaluation

In [1]:
import pandas as pd
import re
import os
from openai import OpenAI

# **********************************************************************
# Prompt template sent to the evaluation model. The {criteria}, {document},
# {summary} and {metric} placeholders are filled in with str.format() by
# modeleval() for every (source chunk, summary chunk, metric) combination.
PROMPTTEMP = """
Summary and source text are given below. Please evaluate the summary based on the source text provided. And follow the criteria for the marking purposes.

Evaluation Criteria:
{criteria}

Source Text Chunk:
{document}

Summary Chunk:
{summary}

Evaluation Form (scores ONLY):
{metric} score from 1 to 5
"""

# Rating guidance inserted into the {criteria} slot of PROMPTTEMP, one text
# per metric: relevance, coherence, accuracy and fluency.
RELEVANCY_CRITERIA = """
Relevance(1-5): Please rate the summary from the scale (1-5) where 1 is lowest and 5 is the highest score.
The source text and the summary passed must be relevant.
Assess how well the summary has captured the text from source text.
Please rate from 1 to 5:
"""
COHERENCE_CRITERIA = """
Coherence(1-5): Please rate the summary from the scale (1-5) where 1 is lowest and 5 is the highest score.
How well does the source text and summary fits.
Make sure that the summary is well structured and clear.
Please rate from 1 to 5:
"""
ACCURACY_CRITERIA = """
Accuracy (1-5): Please rate the summary from the scale (1-5) where 1 is lowest and 5 is the highest score.
How accurate is the summary with the source text file passed.
Also, find if the summary is slightly out of context. 
Please rate from 1 to 5:
"""
FLUENCY_CRITERIA = """
Fluency(1-5): Please rate the summary from the scale (1-5) where 1 is lowest and 5 is the highest score.
Check if the summary is well written in terms of spellings, grammars and ensure that the sentences are sensible.
Please rate the fluency from the scale 1 to 5, :
"""


# OpenAI-compatible client pointed at a server on localhost:11434
# (presumably a local Ollama instance - confirm). modeleval() sends all of
# its evaluation requests through this client.
client = OpenAI(
    base_url='http://localhost:11434/v1/',
    api_key='ollama',  # placeholder value, not a real secret - TODO confirm the local server ignores it
)

def findscore(response):
    """Extract a 1-5 rating from the evaluator's free-text reply.

    An explicit "N/5" or "N out of 5" rating is preferred. Otherwise the
    first standalone integer between 1 and 5 is used. Integers outside that
    range (for example the "19" in "COVID-19") are skipped instead of ending
    the search.

    Parameters
    ----------
    response : str or None
        Raw text returned by the evaluation model.

    Returns
    -------
    int
        The rating (1-5), or 0 when no valid rating can be found.
    """
    if not response:
        return 0

    # an explicit rating such as "4/5" or "3 out of 5" is the most reliable signal;
    # the look-behind stops "4.5/5" from being read as "5/5"
    explicit = re.search(r'(?<![\d.])([1-5])\s*(?:/|out of)\s*5\b', response, re.IGNORECASE)
    if explicit:
        return int(explicit.group(1))

    # otherwise take the first standalone integer that is a valid rating
    for candidate in re.finditer(r'\b\d+\b', response):
        value = int(candidate.group(0))
        if 1 <= value <= 5:
            return value

    return 0
    
def chunktext(text, label, max_tokens=4000, overlap=100):
    """Split text into overlapping chunks of whitespace-separated words.

    Every chunk holds at most `max_tokens` words, and each chunk after the
    first starts with the last `overlap` words of the previous one. A new
    chunk is only started when there are words beyond the previous chunk.

    Parameters
    ----------
    text : str
        Text to split.
    label : str
        Name of the text (e.g. "Source"). Not used by the function; kept so
        existing callers keep working.
    max_tokens : int, optional
        Maximum number of words per chunk.
    overlap : int, optional
        Number of words repeated at the start of the next chunk.

    Returns
    -------
    list of str
        The chunks in order; an empty list when the text has no words.

    Raises
    ------
    ValueError
        If max_tokens < 1 or overlap is not in the range [0, max_tokens).
    """
    if max_tokens < 1:
        raise ValueError("max_tokens must be at least 1")
    if not 0 <= overlap < max_tokens:
        raise ValueError("overlap must satisfy 0 <= overlap < max_tokens")

    words = text.split()
    chunks = []
    start = 0

    while start < len(words):
        end = start + max_tokens
        chunks.append(" ".join(words[start:end]))
        if end >= len(words):
            break
        # begin the next chunk `overlap` words before this one ended
        start = end - overlap

    return chunks

def modeleval(summary, source_text):
    """Rate a summary against its source text on four metrics.

    Both texts are split with chunktext(). Every source chunk is paired with
    every summary chunk, and for each pair the evaluation model is asked for
    a relevance, coherence, accuracy and fluency rating. The ratings are
    parsed with findscore() and averaged per metric over all pairs; a reply
    that cannot be parsed counts as 0.

    Parameters
    ----------
    summary : str
        Summary text to evaluate.
    source_text : str
        Source text the summary was produced from.

    Returns
    -------
    tuple
        (avg relevance, avg coherence, avg accuracy, avg fluency,
        relevance reply, coherence reply, accuracy reply, fluency reply).
        The replies are the model's raw text for the last chunk pair.
        When either text yields no chunks, the averages are 0 and the
        replies are empty strings.
    """
    sourcechunks = chunktext(source_text, "Source")  # source text chunks
    summarychunks = chunktext(summary, "Summary")    # summary text chunks

    # (metric name used in the prompt and the log line, rating guidance), in reporting order
    metrics = (
        ("Relevance", RELEVANCY_CRITERIA),
        ("Coherence", COHERENCE_CRITERIA),
        ("Accuracy", ACCURACY_CRITERIA),
        ("Fluency", FLUENCY_CRITERIA),
    )
    scores = {name: [] for name, _ in metrics}
    # initialised up front so the return value is defined even without any chunk pair
    lastreply = {name: "" for name, _ in metrics}

    # for each combination of source and summary chunks
    for i, srchunks in enumerate(sourcechunks):
        for j, summchunk in enumerate(summarychunks):
            for name, criteria in metrics:
                prompt = PROMPTTEMP.format(
                    criteria=criteria,
                    document=srchunks,
                    summary=summchunk,
                    metric=name
                )
                response = client.chat.completions.create(
                    model='llama3',
                    messages=[{'role': 'user', 'content': prompt}],
                    max_tokens=150,
                )
                # the message content may be None; treat that as an empty reply
                replytext = response.choices[0].message.content or ""
                chunkscore = findscore(replytext)

                scores[name].append(chunkscore)
                lastreply[name] = replytext
                print(f"Chunk ({i+1}, {j+1}) {name} Score: {chunkscore}")

    def average(values):
        # mean of the collected ratings, 0 when nothing was rated
        return sum(values) / len(values) if values else 0

    return (average(scores["Relevance"]), average(scores["Coherence"]),
            average(scores["Accuracy"]), average(scores["Fluency"]),
            lastreply["Relevance"], lastreply["Coherence"],
            lastreply["Accuracy"], lastreply["Fluency"])

def evalsummary(summarydir, sourcedir, csvname='SolarResults.csv', rounds=3):
    """Score every summary against its source text and save the averages to CSV.

    Files are paired by identical filename in the two directories. Each pair
    is evaluated `rounds` times with modeleval() and the per-round scores are
    averaged. The per-file averages plus a final 'Average' row are written to
    ModelAverageScores/Results/<csvname>.

    Parameters
    ----------
    summarydir : str
        Directory holding the summary files.
    sourcedir : str
        Directory holding the source text files.
    csvname : str, optional
        File name of the result CSV inside ModelAverageScores/Results/.
    rounds : int, optional
        Number of evaluation rounds per file.

    Returns
    -------
    None
        Nothing is written when no file pair could be evaluated.
    """
    scorecols = ['relevance_score', 'coherence_score', 'accuracy_score', 'fluency_score']
    data = []

    sourcefile = set(os.listdir(sourcedir))

    # os.listdir() returns names in arbitrary order; sorting keeps the
    # evaluation order and the CSV row order stable between runs
    for filename in sorted(os.listdir(summarydir)):
        if filename not in sourcefile:
            print(f"Missing file {filename}")
            continue

        summaryfilepath = os.path.join(summarydir, filename)
        sourcefilepath = os.path.join(sourcedir, filename)

        # skip sub-directories (e.g. .ipynb_checkpoints) that exist on both sides
        if not (os.path.isfile(summaryfilepath) and os.path.isfile(sourcefilepath)):
            continue

        with open(summaryfilepath, 'r', encoding='utf-8') as file:
            summary = file.read()

        with open(sourcefilepath, 'r', encoding='utf-8') as file:
            source = file.read()

        roundscores = {col: [] for col in scorecols}

        for i in range(1, rounds + 1):
            print(f"\nRunning evaluation round {i} for file: {filename}")

            (relevanceScore, coherenceScore, accuracyScore, fluencyScore,
             relevancyText, coherenceText, accuracyText, fluencyText) = modeleval(summary, source)

            # collect this round's scores
            roundscores['relevance_score'].append(relevanceScore)
            roundscores['coherence_score'].append(coherenceScore)
            roundscores['accuracy_score'].append(accuracyScore)
            roundscores['fluency_score'].append(fluencyScore)

            # evaluation results statements and scores
            print(f"Evaluation for {filename} (Round {i}):")
            print(f"Relevance: {relevancyText}")
            print(f"Coherence: {coherenceText}")
            print(f"Accuracy: {accuracyText}")
            print(f"Fluency: {fluencyText}")

            print(f"Relevance Score (Round {i}): {relevanceScore}")
            print(f"Coherence Score (Round {i}): {coherenceScore}")
            print(f"Accuracy Score (Round {i}): {accuracyScore}")
            print(f"Fluency Score (Round {i}): {fluencyScore}")
            print("************************************************")

        # average of all rounds for this file
        row = {'filename': filename}
        for col in scorecols:
            values = roundscores[col]
            row[col] = sum(values) / len(values) if values else 0
        data.append(row)

    if not data:
        # without rows the score columns do not exist and .mean() would fail
        print("No matching summary/source files found; nothing was saved.")
        return

    df = pd.DataFrame(data)

    # append one row holding the mean of every score column
    avgrow = {'filename': 'Average', **df[scorecols].mean().to_dict()}
    df = pd.concat([df, pd.DataFrame([avgrow])], ignore_index=True)

    savecsvpath = 'ModelAverageScores/Results/'
    os.makedirs(savecsvpath, exist_ok=True)
    csvpath = os.path.join(savecsvpath, csvname)
    # written once, with the 'Average' row already included
    df.to_csv(csvpath, index=False)

    print(f"\nEvaluation results saved to {csvpath}")

# Input locations for this run: the model-generated summaries and the cleaned
# source texts they are scored against.
summarydir = '../3.LLM Model/LLM Summaries/Selected_LLM_Models_PYPDF2/solar/solarfinal/'
sourcedir = '../2.PDFExtraction/PYPDF2/PYPDF2textclean/'

# Evaluate every summary that has a source file of the same name.
evalsummary(summarydir, sourcedir)

print("All summaries have been evaluated successfully.")

# Recorded run time for this cell: 210 minutes.


Running evaluation round 1 for file: INQ000350691.txt
Chunk (1, 1) Relevance Score: 4
Chunk (1, 1) Coherence Score: 4
Chunk (1, 1) Accuracy Score: 4
Chunk (1, 1) Fluency Score: 4
Evaluation for INQ000350691.txt (Round 1):
Relevance: Based on the source text provided, I would evaluate the summary as follows:

Relevance score: 4/5

The summary accurately captures the main themes of the original statement by Jane Hutt MS, including the recognition of International Day of Disabled People, the impact of COVID-19 on disabled people, and the Welsh Government's initiatives to support disability projects and employment opportunities. However, some minor details are missing from the summary, such as the specific allocation of funds (£200,000 + £100,000) and the launch of a new toolkit ("A More Equal Wales: A Practical Guide for Employers employing Disabled People"). Overall, the summary provides a good overview of the statement's key points.
Coherence: Based on the provided source text, I evalu

## XwinLM Evaluation

In [3]:
import pandas as pd
import re
import os
from openai import OpenAI

# **********************************************************************
# Prompt template sent to the evaluation model. The {criteria}, {document},
# {summary} and {metric} placeholders are filled in with str.format() by
# modeleval() for every (source chunk, summary chunk, metric) combination.
PROMPTTEMP = """
Summary and source text are given below. Please evaluate the summary based on the source text provided. And follow the criteria for the marking purposes.

Evaluation Criteria:
{criteria}

Source Text Chunk:
{document}

Summary Chunk:
{summary}

Evaluation Form (scores ONLY):
{metric} score from 1 to 5
"""

# Rating guidance inserted into the {criteria} slot of PROMPTTEMP, one text
# per metric: relevance, coherence, accuracy and fluency.
RELEVANCY_CRITERIA = """
Relevance(1-5): Please rate the summary from the scale (1-5) where 1 is lowest and 5 is the highest score.
The source text and the summary passed must be relevant.
Assess how well the summary has captured the text from source text.
Please rate from 1 to 5:
"""
COHERENCE_CRITERIA = """
Coherence(1-5): Please rate the summary from the scale (1-5) where 1 is lowest and 5 is the highest score.
How well does the source text and summary fits.
Make sure that the summary is well structured and clear.
Please rate from 1 to 5:
"""
ACCURACY_CRITERIA = """
Accuracy (1-5): Please rate the summary from the scale (1-5) where 1 is lowest and 5 is the highest score.
How accurate is the summary with the source text file passed.
Also, find if the summary is slightly out of context. 
Please rate from 1 to 5:
"""
FLUENCY_CRITERIA = """
Fluency(1-5): Please rate the summary from the scale (1-5) where 1 is lowest and 5 is the highest score.
Check if the summary is well written in terms of spellings, grammars and ensure that the sentences are sensible.
Please rate the fluency from the scale 1 to 5, :
"""


# OpenAI-compatible client pointed at a server on localhost:11434
# (presumably a local Ollama instance - confirm). modeleval() sends all of
# its evaluation requests through this client.
client = OpenAI(
    base_url='http://localhost:11434/v1/',
    api_key='ollama',  # placeholder value, not a real secret - TODO confirm the local server ignores it
)

def findscore(response):
    """Extract a 1-5 rating from the evaluator's free-text reply.

    An explicit "N/5" or "N out of 5" rating is preferred. Otherwise the
    first standalone integer between 1 and 5 is used. Integers outside that
    range (for example the "19" in "COVID-19") are skipped instead of ending
    the search.

    Parameters
    ----------
    response : str or None
        Raw text returned by the evaluation model.

    Returns
    -------
    int
        The rating (1-5), or 0 when no valid rating can be found.
    """
    if not response:
        return 0

    # an explicit rating such as "4/5" or "3 out of 5" is the most reliable signal;
    # the look-behind stops "4.5/5" from being read as "5/5"
    explicit = re.search(r'(?<![\d.])([1-5])\s*(?:/|out of)\s*5\b', response, re.IGNORECASE)
    if explicit:
        return int(explicit.group(1))

    # otherwise take the first standalone integer that is a valid rating
    for candidate in re.finditer(r'\b\d+\b', response):
        value = int(candidate.group(0))
        if 1 <= value <= 5:
            return value

    return 0
    
def chunktext(text, label, max_tokens=4000, overlap=100):
    """Split text into overlapping chunks of whitespace-separated words.

    Every chunk holds at most `max_tokens` words, and each chunk after the
    first starts with the last `overlap` words of the previous one. A new
    chunk is only started when there are words beyond the previous chunk.

    Parameters
    ----------
    text : str
        Text to split.
    label : str
        Name of the text (e.g. "Source"). Not used by the function; kept so
        existing callers keep working.
    max_tokens : int, optional
        Maximum number of words per chunk.
    overlap : int, optional
        Number of words repeated at the start of the next chunk.

    Returns
    -------
    list of str
        The chunks in order; an empty list when the text has no words.

    Raises
    ------
    ValueError
        If max_tokens < 1 or overlap is not in the range [0, max_tokens).
    """
    if max_tokens < 1:
        raise ValueError("max_tokens must be at least 1")
    if not 0 <= overlap < max_tokens:
        raise ValueError("overlap must satisfy 0 <= overlap < max_tokens")

    words = text.split()
    chunks = []
    start = 0

    while start < len(words):
        end = start + max_tokens
        chunks.append(" ".join(words[start:end]))
        if end >= len(words):
            break
        # begin the next chunk `overlap` words before this one ended
        start = end - overlap

    return chunks

def modeleval(summary, source_text):
    """Rate a summary against its source text on four metrics.

    Both texts are split with chunktext(). Every source chunk is paired with
    every summary chunk, and for each pair the evaluation model is asked for
    a relevance, coherence, accuracy and fluency rating. The ratings are
    parsed with findscore() and averaged per metric over all pairs; a reply
    that cannot be parsed counts as 0.

    Parameters
    ----------
    summary : str
        Summary text to evaluate.
    source_text : str
        Source text the summary was produced from.

    Returns
    -------
    tuple
        (avg relevance, avg coherence, avg accuracy, avg fluency,
        relevance reply, coherence reply, accuracy reply, fluency reply).
        The replies are the model's raw text for the last chunk pair.
        When either text yields no chunks, the averages are 0 and the
        replies are empty strings.
    """
    sourcechunks = chunktext(source_text, "Source")  # source text chunks
    summarychunks = chunktext(summary, "Summary")    # summary text chunks

    # (metric name used in the prompt and the log line, rating guidance), in reporting order
    metrics = (
        ("Relevance", RELEVANCY_CRITERIA),
        ("Coherence", COHERENCE_CRITERIA),
        ("Accuracy", ACCURACY_CRITERIA),
        ("Fluency", FLUENCY_CRITERIA),
    )
    scores = {name: [] for name, _ in metrics}
    # initialised up front so the return value is defined even without any chunk pair
    lastreply = {name: "" for name, _ in metrics}

    # for each combination of source and summary chunks
    for i, srchunks in enumerate(sourcechunks):
        for j, summchunk in enumerate(summarychunks):
            for name, criteria in metrics:
                prompt = PROMPTTEMP.format(
                    criteria=criteria,
                    document=srchunks,
                    summary=summchunk,
                    metric=name
                )
                response = client.chat.completions.create(
                    model='llama3',
                    messages=[{'role': 'user', 'content': prompt}],
                    max_tokens=150,
                )
                # the message content may be None; treat that as an empty reply
                replytext = response.choices[0].message.content or ""
                chunkscore = findscore(replytext)

                scores[name].append(chunkscore)
                lastreply[name] = replytext
                print(f"Chunk ({i+1}, {j+1}) {name} Score: {chunkscore}")

    def average(values):
        # mean of the collected ratings, 0 when nothing was rated
        return sum(values) / len(values) if values else 0

    return (average(scores["Relevance"]), average(scores["Coherence"]),
            average(scores["Accuracy"]), average(scores["Fluency"]),
            lastreply["Relevance"], lastreply["Coherence"],
            lastreply["Accuracy"], lastreply["Fluency"])

def evalsummary(summarydir, sourcedir, savecsvpath='ModelAverageScores/Results/',
                csvname='XwinLM.csv', rounds=3):
    """Score every summary in ``summarydir`` against its same-named source file.

    Each matching pair is passed to ``modeleval`` ``rounds`` times. The four
    averages returned per round are averaged again per file, and the per-file
    rows plus a final ``Average`` row are written to ``savecsvpath/csvname``.

    Args:
        summarydir: Directory holding the summary text files.
        sourcedir: Directory holding the source text files (matched by name).
        savecsvpath: Directory the CSV is written to; created when missing.
        csvname: File name of the CSV inside ``savecsvpath``.
        rounds: How many times each file is evaluated before averaging.

    Returns:
        None. Progress is printed and the results are saved to disk. When no
        summary has a matching source file nothing is written.
    """
    scorecolumns = ['relevance_score', 'coherence_score', 'accuracy_score', 'fluency_score']
    data = []

    sourcefile = set(os.listdir(sourcedir))

    # sorted() keeps the evaluation order and the CSV row order reproducible
    for filename in sorted(os.listdir(summarydir)):
        if filename not in sourcefile:
            print(f"Missing file {filename}")
            continue

        summaryfilepath = os.path.join(summarydir, filename)
        sourcefilepath = os.path.join(sourcedir, filename)

        # os.listdir also returns sub-directories, which cannot be opened as text
        if not (os.path.isfile(summaryfilepath) and os.path.isfile(sourcefilepath)):
            print(f"Skipping non-file entry {filename}")
            continue

        with open(summaryfilepath, 'r', encoding='utf-8') as file:
            summary = file.read()

        with open(sourcefilepath, 'r', encoding='utf-8') as file:
            source = file.read()

        relevancyscores = []
        coherencescores = []
        accuracyscores = []
        fluencyscores = []

        for i in range(1, rounds + 1):  # repeated evaluation rounds
            print(f"\nRunning evaluation round {i} for file: {filename}")

            (relevanceScore, coherenceScore, accuracyScore, fluencyScore,
             relevancyText, coherenceText, accuracyText, fluencyText) = modeleval(summary, source)

            # append scores to respective lists
            relevancyscores.append(relevanceScore)
            coherencescores.append(coherenceScore)
            accuracyscores.append(accuracyScore)
            fluencyscores.append(fluencyScore)

            # evaluation results statements and scores
            print(f"Evaluation for {filename} (Round {i}):")
            print(f"Relevance: {relevancyText}")
            print(f"Coherence: {coherenceText}")
            print(f"Accuracy: {accuracyText}")
            print(f"Fluency: {fluencyText}")

            print(f"Relevance Score (Round {i}): {relevanceScore}")
            print(f"Coherence Score (Round {i}): {coherenceScore}")
            print(f"Accuracy Score (Round {i}): {accuracyScore}")
            print(f"Fluency Score (Round {i}): {fluencyScore}")
            print("************************************************")

        # average over the rounds; 0 when no round was run
        data.append({
            'filename': filename,
            'relevance_score': sum(relevancyscores) / len(relevancyscores) if relevancyscores else 0,
            'coherence_score': sum(coherencescores) / len(coherencescores) if coherencescores else 0,
            'accuracy_score': sum(accuracyscores) / len(accuracyscores) if accuracyscores else 0,
            'fluency_score': sum(fluencyscores) / len(fluencyscores) if fluencyscores else 0
        })

    # an empty frame has no score columns, so selecting them below would raise KeyError
    if not data:
        print("No matching summary/source files were evaluated; nothing was saved.")
        return

    df = pd.DataFrame(data)

    # append one 'Average' row holding the mean of every score column
    avgdict = {'filename': 'Average', **df[scorecolumns].mean().to_dict()}
    df = pd.concat([df, pd.DataFrame([avgdict])], ignore_index=True)

    # saving the results (per-file rows followed by the Average row) to a CSV file
    os.makedirs(savecsvpath, exist_ok=True)
    csvpath = os.path.join(savecsvpath, csvname)
    df.to_csv(csvpath, index=False)

    print(f"\nEvaluation results saved to {csvpath}")


# directories where summaries and source files are stored
# (evalsummary pairs the two folders by identical file name)
summarydir = '../3.LLM Model/LLM Summaries/Selected_LLM_Models_PYPDF2/xwinlm/xwinlmfinal/'
sourcedir = '../2.PDFExtraction/PYPDF2/PYPDF2textclean/'

# run the full evaluation; evalsummary prints per-round results and writes the CSV
evalsummary(summarydir, sourcedir)

# printed unconditionally once evalsummary returns
print("All summaries have been evaluated successfully.")

# Runtime note: 216 minutes to execute


Running evaluation round 1 for file: INQ000350513.txt
Chunk (1, 1) Relevance Score: 4
Chunk (1, 1) Coherence Score: 4
Chunk (1, 1) Accuracy Score: 2
Chunk (1, 1) Fluency Score: 2
Evaluation for INQ000350513.txt (Round 1):
Relevance: Based on the evaluation criteria, I would give the summary a relevance score of **4** out of 5.

The summary is mostly relevant as it captures the core ideas and key points from the source text. It correctly identifies the UK government's strategy as focusing on three main goals: suppressing the virus, saving lives, and protecting the economy. It also mentions the SAGE twin-peak model and the idea of immediate containment within households.

However, there are a few areas where the summary could be improved to achieve full relevance:

1. The alternative approach advocated by Rob Orford is not fully captured in the summary. While it mentions the idea of closing non-essential businesses and reducing societal mixing, it does not provide as much detail as the 

## Llama3.1 Evaluation

In [4]:
import pandas as pd
import re
import os
from openai import OpenAI

# **********************************************************************
# Prompt template shared by all four metrics. modeleval fills {criteria},
# {document}, {summary} and {metric} with str.format for every
# (source chunk, summary chunk) pair before sending it to the model.
PROMPTTEMP = """
Summary and source text are given below. Please evaluate the summary based on the source text provided. And follow the criteria for the marking purposes.

Evaluation Criteria:
{criteria}

Source Text Chunk:
{document}

Summary Chunk:
{summary}

Evaluation Form (scores ONLY):
{metric} score from 1 to 5
"""

# Rating guidance: one criteria text per metric, substituted into {criteria}.
# Every text contains the scale itself ("1-5" / "1 to 5"), so a reply that
# echoes the criteria can contain digits that are not the rating.
RELEVANCY_CRITERIA = """
Relevance(1-5): Please rate the summary from the scale (1-5) where 1 is lowest and 5 is the highest score.
The source text and the summary passed must be relevant.
Assess how well the summary has captured the text from source text.
Please rate from 1 to 5:
"""
COHERENCE_CRITERIA = """
Coherence(1-5): Please rate the summary from the scale (1-5) where 1 is lowest and 5 is the highest score.
How well does the source text and summary fits.
Make sure that the summary is well structured and clear.
Please rate from 1 to 5:
"""
ACCURACY_CRITERIA = """
Accuracy (1-5): Please rate the summary from the scale (1-5) where 1 is lowest and 5 is the highest score.
How accurate is the summary with the source text file passed.
Also, find if the summary is slightly out of context. 
Please rate from 1 to 5:
"""
FLUENCY_CRITERIA = """
Fluency(1-5): Please rate the summary from the scale (1-5) where 1 is lowest and 5 is the highest score.
Check if the summary is well written in terms of spellings, grammars and ensure that the sentences are sensible.
Please rate the fluency from the scale 1 to 5, :
"""


# OpenAI-compatible client pointed at a server on localhost port 11434.
# NOTE(review): presumably a local Ollama instance, with 'ollama' acting as a
# placeholder API key — confirm against the deployment.
client = OpenAI(
    base_url='http://localhost:11434/v1/',
    api_key='ollama',  
)

def findscore(response):
    """Extract the 1-5 rating from a model reply.

    The reply is free text, so the rating is located in three steps:
    echoes of the rating scale ("1-5", "1 to 5") are removed, an explicit
    "N/5" or "N out of 5" is preferred, and otherwise the first standalone
    integer between 1 and 5 is used. Numbers outside that range (years,
    counts, ...) are skipped instead of ending the search.

    Args:
        response: Text returned by the model; anything that is not a string
            (for example ``None``) is treated as "no score".

    Returns:
        The rating as an int in 1..5, or 0 when no rating can be found.
    """
    if not isinstance(response, str):
        return 0

    # a restated scale would otherwise make the leading "1" look like the rating
    text = re.sub(r'\b1\s*(?:-|to)\s*5\b', ' ', response)

    # explicit forms such as "4/5" or "**4** out of 5"; the lookbehind keeps the
    # fractional part of a decimal ("4.5/5") from being read as the rating
    explicit = re.search(r'(?<![\d.])([1-5])\b[\s*]*(?:/|out\s+of)\s*5\b', text)
    if explicit:
        return int(explicit.group(1))

    # fall back to the first standalone integer inside the valid range
    fallback = re.search(r'\b[1-5]\b', text)
    if fallback:
        return int(fallback.group(0))

    return 0
    
def chunktext(text, label, max_tokens=4000, overlap=100):
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    "Tokens" here are the items produced by ``text.split()``, not model
    tokens. A chunk is closed once it holds ``max_tokens`` words; the next
    chunk starts with the last ``overlap`` words of the one just closed.

    Args:
        text: Text to split.
        label: Caller-supplied name for the text ("Source"/"Summary"). Not
            used by the chunking itself; kept so existing calls keep working.
        max_tokens: Maximum number of words per chunk.
        overlap: Number of trailing words repeated at the start of the next
            chunk; must be smaller than ``max_tokens``.

    Returns:
        List of chunk strings (words joined by single spaces); empty when
        ``text`` contains no words.

    Raises:
        ValueError: If ``max_tokens`` is not positive or ``overlap`` is not
            in ``0 <= overlap < max_tokens``.
    """
    if max_tokens <= 0:
        raise ValueError("max_tokens must be positive")
    if not 0 <= overlap < max_tokens:
        raise ValueError("overlap must satisfy 0 <= overlap < max_tokens")

    chunks = []
    chunk = []

    for word in text.split():
        if len(chunk) >= max_tokens:
            chunks.append(" ".join(chunk))
            # begin the next chunk where this one ended; chunk[-0:] would copy
            # the whole list, so a zero overlap is handled explicitly
            chunk = chunk[-overlap:] if overlap else []

        chunk.append(word)

    if chunk:
        chunks.append(" ".join(chunk))

    return chunks

def modeleval(summary, source_text):
    """Rate a summary against its source text on four metrics.

    Both texts are split with ``chunktext``. For every (source chunk, summary
    chunk) pair the model is asked, in this order, for a Relevance, Coherence,
    Accuracy and Fluency rating; each reply is reduced to a number with
    ``findscore`` and logged.

    Args:
        summary: Summary text to evaluate.
        source_text: Source text the summary was written from.

    Returns:
        An 8-tuple ``(relevance, coherence, accuracy, fluency, relevancetext,
        coherencetext, accuracytext, fluencytext)``: the four scores averaged
        over all chunk pairs (0 when there were none), followed by the raw
        model reply of the last chunk pair for each metric ("" when there
        were none).
    """
    sourcechunks = chunktext(source_text, "Source")  # source text chunks
    summarychunks = chunktext(summary, "Summary")    # summary text chunks

    # (label used in the prompt and in the log line, criteria text), in query order
    metrics = [
        ("Relevance", RELEVANCY_CRITERIA),
        ("Coherence", COHERENCE_CRITERIA),
        ("Accuracy", ACCURACY_CRITERIA),
        ("Fluency", FLUENCY_CRITERIA),
    ]
    scores = {label: [] for label, _ in metrics}
    # pre-filled so the return below is defined even when a text yields no chunks
    lasttext = {label: "" for label, _ in metrics}

    # for each combination of source and summary chunks
    for i, srchunks in enumerate(sourcechunks):
        for j, summchunk in enumerate(summarychunks):
            for label, criteria in metrics:
                prompt = PROMPTTEMP.format(
                    criteria=criteria,
                    document=srchunks,
                    summary=summchunk,
                    metric=label
                )
                response = client.chat.completions.create(
                    model='llama3',
                    messages=[{'role': 'user', 'content': prompt}],
                    max_tokens=150,
                )
                # content can be missing on a reply; treat that as empty text
                replytext = response.choices[0].message.content or ""
                chunkscore = findscore(replytext)

                scores[label].append(chunkscore)
                lasttext[label] = replytext
                print(f"Chunk ({i+1}, {j+1}) {label} Score: {chunkscore}")

    def average(values):
        # mean of the collected scores; 0 when nothing was scored
        return sum(values) / len(values) if values else 0

    return (average(scores["Relevance"]), average(scores["Coherence"]),
            average(scores["Accuracy"]), average(scores["Fluency"]),
            lasttext["Relevance"], lasttext["Coherence"],
            lasttext["Accuracy"], lasttext["Fluency"])

def evalsummary(summarydir, sourcedir, savecsvpath='ModelAverageScores/Results/',
                csvname='Llama3.1Results.csv', rounds=3):
    """Score every summary in ``summarydir`` against its same-named source file.

    Each matching pair is passed to ``modeleval`` ``rounds`` times. The four
    averages returned per round are averaged again per file, and the per-file
    rows plus a final ``Average`` row are written to ``savecsvpath/csvname``.

    Args:
        summarydir: Directory holding the summary text files.
        sourcedir: Directory holding the source text files (matched by name).
        savecsvpath: Directory the CSV is written to; created when missing.
        csvname: File name of the CSV inside ``savecsvpath``.
        rounds: How many times each file is evaluated before averaging.

    Returns:
        None. Progress is printed and the results are saved to disk. When no
        summary has a matching source file nothing is written.
    """
    scorecolumns = ['relevance_score', 'coherence_score', 'accuracy_score', 'fluency_score']
    data = []

    sourcefile = set(os.listdir(sourcedir))

    # sorted() keeps the evaluation order and the CSV row order reproducible
    for filename in sorted(os.listdir(summarydir)):
        if filename not in sourcefile:
            print(f"Missing file {filename}")
            continue

        summaryfilepath = os.path.join(summarydir, filename)
        sourcefilepath = os.path.join(sourcedir, filename)

        # os.listdir also returns sub-directories, which cannot be opened as text
        if not (os.path.isfile(summaryfilepath) and os.path.isfile(sourcefilepath)):
            print(f"Skipping non-file entry {filename}")
            continue

        with open(summaryfilepath, 'r', encoding='utf-8') as file:
            summary = file.read()

        with open(sourcefilepath, 'r', encoding='utf-8') as file:
            source = file.read()

        relevancyscores = []
        coherencescores = []
        accuracyscores = []
        fluencyscores = []

        for i in range(1, rounds + 1):  # repeated evaluation rounds
            print(f"\nRunning evaluation round {i} for file: {filename}")

            (relevanceScore, coherenceScore, accuracyScore, fluencyScore,
             relevancyText, coherenceText, accuracyText, fluencyText) = modeleval(summary, source)

            # append scores to respective lists
            relevancyscores.append(relevanceScore)
            coherencescores.append(coherenceScore)
            accuracyscores.append(accuracyScore)
            fluencyscores.append(fluencyScore)

            # evaluation results statements and scores
            print(f"Evaluation for {filename} (Round {i}):")
            print(f"Relevance: {relevancyText}")
            print(f"Coherence: {coherenceText}")
            print(f"Accuracy: {accuracyText}")
            print(f"Fluency: {fluencyText}")

            print(f"Relevance Score (Round {i}): {relevanceScore}")
            print(f"Coherence Score (Round {i}): {coherenceScore}")
            print(f"Accuracy Score (Round {i}): {accuracyScore}")
            print(f"Fluency Score (Round {i}): {fluencyScore}")
            print("************************************************")

        # average over the rounds; 0 when no round was run
        data.append({
            'filename': filename,
            'relevance_score': sum(relevancyscores) / len(relevancyscores) if relevancyscores else 0,
            'coherence_score': sum(coherencescores) / len(coherencescores) if coherencescores else 0,
            'accuracy_score': sum(accuracyscores) / len(accuracyscores) if accuracyscores else 0,
            'fluency_score': sum(fluencyscores) / len(fluencyscores) if fluencyscores else 0
        })

    # an empty frame has no score columns, so selecting them below would raise KeyError
    if not data:
        print("No matching summary/source files were evaluated; nothing was saved.")
        return

    df = pd.DataFrame(data)

    # append one 'Average' row holding the mean of every score column
    avgdict = {'filename': 'Average', **df[scorecolumns].mean().to_dict()}
    df = pd.concat([df, pd.DataFrame([avgdict])], ignore_index=True)

    # saving the results (per-file rows followed by the Average row) to a CSV file
    os.makedirs(savecsvpath, exist_ok=True)
    csvpath = os.path.join(savecsvpath, csvname)
    df.to_csv(csvpath, index=False)

    print(f"\nEvaluation results saved to {csvpath}")

# directories where summaries and source files are stored
# (evalsummary pairs the two folders by identical file name)
summarydir = '../3.LLM Model/LLM Summaries/Selected_LLM_Models_PYPDF2/llama3.1/llama3.1final/'
sourcedir = '../2.PDFExtraction/PYPDF2/PYPDF2textclean/'

# run the full evaluation; evalsummary prints per-round results and writes the CSV
evalsummary(summarydir, sourcedir)

# printed unconditionally once evalsummary returns
print("All summaries have been evaluated successfully.")

# Runtime note: 187 minutes to execute


Running evaluation round 1 for file: INQ000350513.txt
Chunk (1, 1) Relevance Score: 4
Chunk (1, 1) Coherence Score: 5
Chunk (1, 1) Accuracy Score: 4
Chunk (1, 1) Fluency Score: 4
Evaluation for INQ000350513.txt (Round 1):
Relevance: Based on the source text and summary provided, I evaluate the relevance of the summary as follows:

Relevance score: 4/5

The summary accurately captures the main points discussed in the email chain between Chris Whitty, Frank Atherton, Rob Orford, and colleagues. The summary concretely outlines the alternative approach proposed by the author, including "maximum short-term suppression" through severe lockdown measures, aggressive testing, isolation, and release strategy, as well as the need for community testing and control methods.

The only minor deviation from relevance is that the summary does not explicitly mention the Italian example of hospital overcrowding being used to warn the public about the potential consequences of failing to take action agai

## Comparing the Average Model Evaluated Scores

In [9]:
import os
import pandas as pd

# Per-model result CSVs; each is expected to contain a row whose 'filename' is 'Average'
file_paths = [
    'ModelAverageScores/Results/Deepseek-llmResults.csv',
    'ModelAverageScores/Results/Llama3Results.csv',
    'ModelAverageScores/Results/Llama3.1Results.csv',
    'ModelAverageScores/Results/GemmaResults.csv',
    'ModelAverageScores/Results/Gemma2Results.csv',
    'ModelAverageScores/Results/MistralResults.csv',
    'ModelAverageScores/Results/QwenResults.csv',
    'ModelAverageScores/Results/SolarResults.csv',
    'ModelAverageScores/Results/XwinLM.csv',
]

average_rows_with_filenames = []

# Loop through each file and extract the "Average" row
for file_path in file_paths:
    # a model that has not been evaluated yet has no CSV; skip it instead of
    # aborting the whole comparison on the first missing file
    if not os.path.exists(file_path):
        print(f"Skipping missing results file: {file_path}")
        continue

    df = pd.read_csv(file_path)

    # Extract the "Average" row
    average_row = df[df['filename'] == 'Average']
    if average_row.empty:
        print(f"No 'Average' row in {file_path}; skipped")
        continue

    # Copy so the assignments below do not write into a slice of df
    average_row = average_row.copy()

    # Identify the row by the results file it came from
    average_row['filename'] = file_path

    # Every column except 'filename' is a score column; their row-wise mean
    # becomes the overall score of the model
    score_columns = [col for col in df.columns if col not in ['filename']]
    average_row['Final average Score'] = average_row[score_columns].mean(axis=1)

    average_rows_with_filenames.append(average_row)

# pd.concat raises an opaque "No objects to concatenate" on an empty list
if not average_rows_with_filenames:
    raise RuntimeError("No 'Average' rows were found in any results file; run the evaluation cells first.")

# Combine all the average rows into a single DataFrame
combined_df = pd.concat(average_rows_with_filenames, ignore_index=True)

# Display the combined DataFrame
print(combined_df)

# Ensure the directory exists
output_dir = 'ModelAverageScores/Results'
os.makedirs(output_dir, exist_ok=True)

# Save the combined DataFrame to a new CSV file
combined_df.to_csv(os.path.join(output_dir, 'AllModelScores.csv'), index=False)


                                             filename  relevance_score  coherence_score  accuracy_score  fluency_score  Final average Score
0  ModelAverageScores/Results/Deepseek-llmResults.csv         3.692593         3.614815        3.722222       3.529630             3.639815
1        ModelAverageScores/Results/Llama3Results.csv         3.592593         3.729630        3.644444       3.781481             3.687037
2      ModelAverageScores/Results/Llama3.1Results.csv         3.214815         3.329630        3.325926       3.059259             3.232407
3         ModelAverageScores/Results/GemmaResults.csv         3.325926         3.559259        2.922222       3.459259             3.316667
4        ModelAverageScores/Results/Gemma2Results.csv         3.711111         3.748148        3.644444       3.485185             3.647222
5       ModelAverageScores/Results/MistralResults.csv         3.881481         3.911111        3.611111       3.414815             3.704630
6          ModelAver

In [None]:
import pandas as pd

# Read back the per-model averages saved as AllModelScores.csv
combined_df = pd.read_csv('ModelAverageScores/Results/AllModelScores.csv')

# For every metric column, keep the row that holds the column's maximum
best_scores = {
    column: combined_df.loc[combined_df[column].idxmax()]
    for column in ('relevance_score', 'coherence_score', 'accuracy_score', 'fluency_score')
}

# NOTE(review): the output stored with this cell lists scores above 5, which the
# 1-5 averages in AllModelScores.csv cannot produce — it looks stale; re-run to confirm.

# Report the winning results file and its score, one metric at a time
for metric, row in best_scores.items():
    heading = metric.replace('_', ' ')
    print(f"Best {heading}:", f"Filename: {row['filename']}", f"Score: {row[metric]}\n", sep="\n")


Best relevance score:
Filename: ModelAverageScores/Results/QwenResults.csv
Score: 4.0

Best coherence score:
Filename: ModelAverageScores/Results/Gemma2Results.csv
Score: 4.0

Best accuracy score:
Filename: ModelAverageScores/Results/Gemma2Results.csv
Score: 8.333333333333334

Best fluency score:
Filename: ModelAverageScores/Results/Gemma2Results.csv
Score: 9.0

