# Authors
Niklas Roslund 

Wiljam Wilmi

## Imports

In [None]:
from pdfminer.high_level import extract_text
import os
import re
from transformers import AutoTokenizer, AutoModel
import torch
import sqlite3
import pickle
from scipy.spatial.distance import cosine
import requests
import json
#import pandas as pd    #Used for extracting information from excel file
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

## Create embeddings database from PDF-text

In [None]:
def database_exists(db_path):
    """Report whether something already exists on disk at ``db_path``."""
    already_present = os.path.exists(db_path)
    return already_present

# TODO: Try other models for tokenizing?
def initialize_model(model_name='thenlper/gte-large'):
    """
    Load the embedding tokenizer and model and move the model to a device.

    Args:
    - model_name: Name handed to ``from_pretrained`` for both the tokenizer
      and the model.

    Returns:
    - tokenizer: Tokenizer loaded with ``AutoTokenizer.from_pretrained``.
    - model: Model loaded with ``AutoModel.from_pretrained``, moved to ``device``.
    - device: ``torch.device("cuda")`` when CUDA is available, otherwise
      ``torch.device("cpu")``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Prefer the GPU whenever torch reports one.
    if torch.cuda.is_available():
        device_name = "cuda"
    else:
        device_name = "cpu"
    device = torch.device(device_name)
    print("Using device:", device)

    model = AutoModel.from_pretrained(model_name)
    model = model.to(device)
    return tokenizer, model, device

def initialize_database(db_path):
    """Create the SQLite file at ``db_path`` and its ``sentence_embeddings`` table.

    The table has two columns: ``sentence`` (TEXT) and ``embedding`` (BLOB).
    Calling this on an existing database is harmless because the table is
    created with ``IF NOT EXISTS``.

    Args:
    - db_path: Path of the SQLite database file to create or open.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.execute('''CREATE TABLE IF NOT EXISTS sentence_embeddings (
                    sentence TEXT, 
                    embedding BLOB)''')
        conn.commit()
    finally:
        # Always release the file handle, even if table creation fails.
        conn.close()

def extract_text_from_pdf(pdf_path):
    """Return the text that pdfminer's ``extract_text`` produces for ``pdf_path``."""
    pdf_text = extract_text(pdf_path)
    return pdf_text

# This sliding window makes the chunks of text from the pdf.
# Change chunk_size and overlap to your preferred choice.
def sliding_window_tokenize(text, chunk_size=300, overlap=200):
    """Split ``text`` into overlapping chunks using a sliding window.

    Sizes are measured in elements of ``text`` (characters for a ``str``),
    not model tokens. Each window starts ``chunk_size - overlap`` positions
    after the previous one. The remainder of the text is always appended as
    a final chunk, so an empty ``text`` yields ``[""]``.

    Args:
    - text: The text to split.
    - chunk_size: Length of every chunk except possibly the last.
    - overlap: Number of positions shared by two consecutive chunks.

    Returns:
    - A list of chunks in document order.

    Raises:
    - ValueError: If ``overlap >= chunk_size``; the window would never
      advance and the loop would not terminate.
    """
    step = chunk_size - overlap
    if step <= 0:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    chunks = []
    i = 0
    while i < len(text) - chunk_size:
        chunks.append(text[i:i + chunk_size])
        i += step
    chunks.append(text[i:])  # Add the last (possibly shorter) chunk
    return chunks

# The relevant terms are supposed to be more than "sustainability" but other keywords have been omitted.
def clean_sentences(sentences, min_length=5, relevant_terms={'sustainability'}):
    """Normalize, filter and de-duplicate text chunks.

    Each chunk has runs of whitespace collapsed to a single space and is
    stripped. A chunk is kept only if it has at least ``min_length``
    whitespace-separated words and contains at least one of
    ``relevant_terms`` (case-insensitive substring match).

    Args:
    - sentences: Iterable of text chunks.
    - min_length: Minimum number of words a chunk must contain.
    - relevant_terms: Terms of which at least one must occur in the chunk.
      The default set is only read, never mutated.

    Returns:
    - List of unique cleaned chunks, in order of first appearance.
    """
    # Lower-case the terms once so the match is case-insensitive on both sides;
    # previously a term containing capitals could never match.
    lowered_terms = [term.lower() for term in relevant_terms]

    cleaned_sentences = []
    for sentence in sentences:
        # Remove non-textual elements and correct formatting
        sentence = re.sub(r'\s+', ' ', sentence).strip()

        # Filter out too short or non-informative sentences
        if len(sentence.split()) < min_length:
            continue

        # Semantic filtering based on key terms
        lowered_sentence = sentence.lower()
        if not any(term in lowered_sentence for term in lowered_terms):
            continue

        cleaned_sentences.append(sentence)

    # Deduplicate while keeping first-seen order; list(set(...)) returned the
    # chunks in an arbitrary order that could change between runs.
    return list(dict.fromkeys(cleaned_sentences))

def generate_and_store_embeddings(text, db_path, tokenizer, model, device):
    """Chunk ``text``, embed every relevant chunk and store it in SQLite.

    The text is split with ``sliding_window_tokenize`` and filtered with
    ``clean_sentences``. Each remaining chunk is tokenized (truncated to 512
    tokens), run through ``model`` without gradient tracking, and mean-pooled
    over dimension 1 of ``last_hidden_state``. The pooled array is pickled
    and inserted into the ``sentence_embeddings`` table together with the
    chunk text.

    All inserts are committed together at the end; if anything fails, nothing
    is committed and the connection is still closed.

    Args:
    - text: Full document text.
    - db_path: Path of the SQLite database holding ``sentence_embeddings``.
    - tokenizer: Tokenizer callable returning a mapping of tensors.
    - model: Model whose output exposes ``last_hidden_state``.
    - device: Device the input tensors are moved to.
    """
    sentences = sliding_window_tokenize(text)
    sentences = clean_sentences(sentences)

    conn = sqlite3.connect(db_path)
    try:
        c = conn.cursor()

        for sentence in sentences:
            inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=512)
            inputs = {k: v.to(device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = model(**inputs)

            # Mean over dimension 1 (presumably the token axis — confirm
            # against the model's output layout).
            sentence_embedding = outputs.last_hidden_state.mean(dim=1)
            emb_blob = pickle.dumps(sentence_embedding.cpu().numpy())

            c.execute("INSERT INTO sentence_embeddings (sentence, embedding) VALUES (?, ?)", (sentence, emb_blob))

        conn.commit()
    finally:
        # Release the database handle even when embedding or inserting fails.
        conn.close()

In [None]:
# Choose prefix for which document is to be read.
# Here you can choose which company you want to evaluate. 
# Preset to do all 50 included in the project

company_names = ("Afry", "Ageas", "Airbus", "AkerSolutions", "Amcor", "Arla", "Barclays", "Betsson", "BlackRock", "Carlsberg", "Colruyt", "Diageo", "Dufry", "Eaton", "Elanders", "Fortum", "Glencore", "Granges", "Hays", "Husqvarna", "Hydro", "Inditex", "JeronimoMartins", "Kerry", "Leonardo", "Lundbeck", "Maersk", "NationalGrid", "NatwestHoldings", "Norwegian", "Orange", "Peab", "Postnord", "Randstad", "Richemont", "Rockwool", "RollsRoyce", "RoyalMail", "Securitas", "Siemens", "Storebrand", "Telefonica", "Tesco", "Thyssenkrupp", "Vattenfall", "Viterra", "Vodafone", "Rewe", "Preem", "BASF")

# The embedding model is loaded at most once, and only if at least one
# company still needs embeddings; it used to be reloaded for every company.
_loaded_embedding_model = None

for company_name in company_names:

    # If you have other files these paths might need to change
    data_path = f"data/{company_name}_2022.pdf"
    db_path = f'embeddings/{company_name}_embeddings.db'

    if database_exists(db_path):
        print(f"Embeddings database for {company_name} already exists. Skipping embeddings generation.")
        continue

    if _loaded_embedding_model is None:
        _loaded_embedding_model = initialize_model()
    tokenizer, model, device = _loaded_embedding_model

    # Read the PDF before creating the database file: if extraction fails,
    # no empty database is left behind to be skipped on the next run.
    extracted_text = extract_text_from_pdf(data_path)

    # sqlite3 cannot create missing parent directories itself.
    os.makedirs(os.path.dirname(db_path), exist_ok=True)
    initialize_database(db_path)
    generate_and_store_embeddings(extracted_text, db_path, tokenizer, model, device)

In [None]:
# Here you add all the questions for the LLM.
# The real questions have been omitted.

# One entry per question; the order must match the order of `keywords`.
questions = [
    "does the report mention sustainability",
]

In [None]:
# Here you add all relevant keywords for the question.
# Make sure there are exactly as many keyword entries as questions: they are paired by position.
# Same here, all the real keywords have been omitted.

# One comma-separated keyword string per question, in the same order as `questions`.
keywords = [
    "sustainability, report, policy",
]

## Retrieve and Analyze embeddings

In [None]:
def cosine_similarity(v1, v2):
    """Return the cosine similarity of ``v1`` and ``v2``.

    Computed as one minus SciPy's cosine *distance*.
    """
    distance = cosine(v1, v2)
    return 1 - distance

def connect_to_db(company_name):
    """Open and return a SQLite connection to the company's embeddings database.

    The file is ``embeddings/<company_name>_embeddings.db``, relative to the
    current working directory. The caller is responsible for closing the
    connection.
    """
    db_file = f'embeddings/{company_name}_embeddings.db'
    return sqlite3.connect(db_file)

def fetch_embeddings(conn):
    """Return every ``(sentence, embedding)`` row of ``sentence_embeddings`` as a list."""
    query = "SELECT sentence, embedding FROM sentence_embeddings"
    cursor = conn.cursor()
    # execute() returns the cursor itself, so the fetch can be chained.
    return cursor.execute(query).fetchall()

def convert_query_to_embedding(query, tokenizer, model, device):
    """Embed ``query`` and return the result as a NumPy array.

    The query is tokenized (truncated to 512 tokens), moved to ``device``,
    run through ``model`` without gradient tracking, and mean-pooled over
    dimension 1 of ``last_hidden_state``.
    """
    encoded = tokenizer(query, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Move every input tensor to the same device as the model.
    model_inputs = {}
    for name, tensor in encoded.items():
        model_inputs[name] = tensor.to(device)

    with torch.no_grad():
        outputs = model(**model_inputs)

    pooled = outputs.last_hidden_state.mean(dim=1)
    return pooled.cpu().numpy()

def calculate_similarities_old(rows, query_embedding):
    """Score every stored sentence against a single query embedding.

    ``rows`` holds ``(sentence, pickled_embedding)`` pairs. Returns a list of
    ``(sentence, cosine_similarity)`` pairs in the same order.
    """
    def score(blob):
        # Stored embeddings are pickled arrays; flatten both sides to 1-D.
        stored = pickle.loads(blob).flatten()
        return cosine_similarity(query_embedding.flatten(), stored)

    return [(text, score(blob)) for text, blob in rows]

def calculate_similarities(rows, query_embeddings):
    """Score every stored sentence by its mean similarity to several query embeddings.

    ``rows`` holds ``(sentence, pickled_embedding)`` pairs. For each row, the
    cosine similarity to every item of ``query_embeddings`` is computed and
    averaged. Returns ``(sentence, average_similarity)`` pairs in row order.
    """
    def mean_score(blob):
        stored = pickle.loads(blob).flatten()
        scores = [cosine_similarity(candidate.flatten(), stored) for candidate in query_embeddings]
        return sum(scores) / len(scores)

    return [(text, mean_score(blob)) for text, blob in rows]


def select_top_matches(similarities, top_n):
    """Return the ``top_n`` highest-scoring ``(sentence, score)`` pairs.

    Note: ``similarities`` is sorted in place (descending by score) as a
    side effect; the returned list is a slice of it.
    """
    def score_of(pair):
        return pair[1]

    similarities.sort(key=score_of, reverse=True)
    best = similarities[:top_n]
    return best

def generate_corpus(top_matches):
    """Turn ``(sentence, score)`` pairs into ``"Sentence: ..."`` strings, dropping the scores."""
    corpus = []
    for sentence, _score in top_matches:
        corpus.append(f"Sentence: {sentence}")
    return corpus

In [None]:
# How many documents the model should look at
nr_of_documents = 5
documents_by_company = []
tokenizer, model, device = initialize_model()

# Questions and keywords are paired by position. zip() would silently drop
# the unmatched tail, so fail loudly instead.
if len(questions) != len(keywords):
    raise ValueError(
        f"Expected one keyword entry per question, got {len(questions)} questions "
        f"and {len(keywords)} keyword entries"
    )

# Keyword embeddings do not depend on the company, so compute them once
# instead of once per company.
keyword_embeddings = [
    convert_query_to_embedding(keyword, tokenizer, model, device)
    for keyword in keywords
]

for company_name in company_names:
    # All rows are loaded into memory, so the connection can be released
    # right away - and is released even if the query fails.
    conn = connect_to_db(company_name)
    try:
        rows = fetch_embeddings(conn)
    finally:
        conn.close()

    # One list of top-matching documents per question, in question order.
    corpus_of_documents = []
    for query_embedding in keyword_embeddings:
        similarities = calculate_similarities(rows, query_embedding)
        top_matches = select_top_matches(similarities, nr_of_documents)
        corpus_of_documents.append(generate_corpus(top_matches))
    documents_by_company.append(corpus_of_documents)


# Displaying the top matches
#for doc in corpus_of_documents:
#    print(doc)

In [None]:
# Concatenate each question's top documents (at most nr_of_documents) into a single string per question.
# Result shape: one list per company, holding one joined string per question.
company_relevant_documents_text = []
for company_documents in documents_by_company:
    relevant_documents_text = [
        "\n\n---\n\n".join(corpus[:nr_of_documents])
        for corpus in company_documents
    ]
    company_relevant_documents_text.append(relevant_documents_text)

## Prompts

In [None]:
# Here you create the prompt and it is possible to change this to alter the results.
# Some of the information has been removed from the prompt but this is the shell to make it work.

def getPrompts(relevant_documents_text, questions):
    """Build the list of prompts sent to the LLM.

    Only the first entry of ``relevant_documents_text`` and the first entry
    of ``questions`` are used, so the returned list holds exactly one prompt.
    """
    documents = relevant_documents_text[0]
    question = questions[0]
    prompt = f"""
    Context:
    Your task is to determine if the provided documents adequately address the posed query. 
                   
    Directly answer 'No' or 'Yes' at the beginning of your response. No introductory phrases are needed.
    If uncertain, answer 'No'.
    
    Documents provided:
    {documents}

    Question:
    {question}

    """
    return [prompt]

## Generating answers from the LLM

In [None]:
# Ollama API call (ollama needs to be running on computer with selected model downloaded)
# In this instance you need to download Ollama and mistral:instruct
# It is possible to change the model just by changing the name "model": "mistral:instruct"
# https://ollama.com/library/mistral:instruct

url = 'http://localhost:11434/api/generate'
headers = {'Content-Type': 'application/json'}
company_responses = []  # This will be a list of dictionaries


def _stream_ollama_response(data):
    """POST ``data`` to the Ollama generate endpoint and return the full streamed answer text."""
    response = requests.post(url, data=json.dumps(data), headers=headers, stream=True)
    try:
        # Fail with a clear HTTP error (e.g. model not downloaded) instead of
        # continuing with an empty answer.
        response.raise_for_status()
        pieces = []
        # The answer arrives as one JSON object per line, each carrying a
        # fragment of the text under the 'response' key.
        for line in response.iter_lines():
            if line:
                decoded_line = json.loads(line.decode('utf-8'))
                if 'response' in decoded_line:
                    pieces.append(decoded_line['response'])
    finally:
        response.close()
    return ''.join(pieces)


def _first_answer_line(text):
    """Return the first non-blank line of ``text``, lowercased, with '*' and '#' removed."""
    for line in text.splitlines():
        normalized = line.replace("*", "").replace("#", "").strip().lower()
        if normalized:
            return normalized
    return ""


def _starts_with_yes(normalized_line):
    """Return True only when the first word of ``normalized_line`` is exactly 'yes'."""
    # Match whole words only: a substring test would read "no" in "know" or
    # "not", and would misclassify "yes, there is no doubt".
    return re.match(r"\W*yes\b", normalized_line) is not None


# Use the joined per-question document text built above. Passing the raw
# lists from documents_by_company would embed a Python list repr in the prompt.
for company_name, relevant_documents_text in zip(company_names, company_relevant_documents_text):
    prompts = getPrompts(relevant_documents_text, questions)
    transformed_responses = []  # To store Yes/No as 1/0 for current company
    output_ = []
    for prompt in prompts:
        data = {
            # Here you change the model
            "model": "mistral:instruct",
            # The prompt is already fully rendered by getPrompts. Calling
            # str.format on it would raise on any '{' or '}' in the document text.
            "prompt": prompt
        }
        combined_response = _stream_ollama_response(data)

        # Judge the first non-blank line of the whole answer. The first
        # streamed fragment alone may be empty or only part of a word, and
        # is missing entirely when the model returns nothing.
        normalized_line = _first_answer_line(combined_response)
        output_.append(combined_response)

        # Models do not always start with yes or no (some open with
        # "Based on ..."). Anything that does not start with "yes" is counted
        # as "no" (0). How often this happens varies by model, so check the
        # printed lines when switching models.
        if _starts_with_yes(normalized_line):
            print("yes = " + normalized_line)
            answer = 1
        else:
            print("no = " + normalized_line)
            answer = 0

        transformed_responses.append(answer)
    print(company_name)
    for a in output_:
        print(a)
    # After collecting all responses for the current company, append to company_responses
    company_responses.append({'company_name': company_name, 'questions_answers': transformed_responses})


# The output below first shows the normalized yes/no line for each question,
# then, per company, the full answers including the model's rationale.

## Extraction of data from Excel
#### The evaluation code below compares the model's answers against previously labelled data. That data has been omitted, so the evaluation code is kept for reference but will not run as-is. New documents have no reference answers, so this evaluation is only useful for testing against labelled data.

## Evaluation

In [None]:
# This code will not run without the labelled data, but it is kept for anyone who is interested.
# It prints the confusion matrix, the classification report and the accuracy of the model.


# Map company name -> list of predicted answers (1 = yes, 0 = no).
company_responses_dict = {item['company_name']: item['questions_answers'] for item in company_responses}

results = []
all_true = []
all_pred = []

# Per-question buckets for the question-wise accuracy.
num_questions = len(structured_data[0]['questions_answers'])  # Assuming all entries have the same number of questions
all_true_by_question = [[] for _ in range(num_questions)]
all_pred_by_question = [[] for _ in range(num_questions)]

# Single pass over the labelled data: collect overall, per-company and
# per-question answers for every company that also has predictions.
for structured_item in structured_data:
    company_name = structured_item['company_name']
    if company_name not in company_responses_dict:
        continue

    true_answers = structured_item['questions_answers']
    # Handle None values by converting them to 0
    pred_answers = [0 if answer is None else answer for answer in company_responses_dict[company_name]]

    # Skip mismatching companies entirely. Adding them to the overall lists
    # would make all_true and all_pred differ in length and break the
    # overall metrics.
    if len(true_answers) != len(pred_answers):
        print(f"Length mismatch for {company_name}")
        continue

    # Answers for the overall metrics
    all_true.extend(true_answers)
    all_pred.extend(pred_answers)

    # Accuracy for this company
    results.append({
        'company_name': company_name,
        'accuracy': accuracy_score(true_answers, pred_answers)
    })

    # Answers per question
    for i in range(num_questions):
        all_true_by_question[i].append(true_answers[i])
        all_pred_by_question[i].append(pred_answers[i])

# Overall accuracy and other metrics
overall_accuracy = accuracy_score(all_true, all_pred)
print(f"Overall Accuracy: {overall_accuracy}")
print("Confusion Matrix:")
print(confusion_matrix(all_true, all_pred))
print("Classification Report:")
print(classification_report(all_true, all_pred))

# Output the results for each company
for result in results:
    print(result)

print("\n")
# Calculate and print accuracy for each question
for i in range(num_questions):
    question_accuracy = accuracy_score(all_true_by_question[i], all_pred_by_question[i])
    print(f"Accuracy for Question {i + 1}: {question_accuracy}")