<a href="https://colab.research.google.com/github/UG-Team-Data-Science/llm-workshop/blob/main/LLM_workshop.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Open-Source Large Language Models for Structured Information Extraction**

Open-source large language models can be used to extract structured information from unstructured text. This notebook demonstrates doing so "locally" with the `llama.cpp` library.

In [None]:
# @title Connect to Google Drive

# @markdown If you wish to connect to Google Drive, e.g. to load your own data from a folder, check this and follow the instructions of the pop-up.

connect_to_google_drive = False #@param {"type": "boolean"}

import os
import warnings
from pathlib import Path

# Choose `working_dir`, the directory that later cells download the model and
# the example data into.
try:
  from google.colab import drive
  if connect_to_google_drive:
    drive.mount("/content/gdrive")
  working_dir = Path("/content")
except ImportError:
  # special case for Michiel
  if os.path.exists("/nvme/storage_michiel/llm_workshop"):
    working_dir = Path("/nvme/storage_michiel/llm_workshop")
  else:
    # Later cells evaluate `working_dir / "<file>"` and `wget -P $working_dir`,
    # both of which break when `working_dir` is None, so fall back to the
    # current directory and tell the user how to override it.
    working_dir = Path.cwd()
    warnings.warn(
        "You're not running this on Google Colab; defaulting the working directory "
        f"(`working_dir`) to the current directory ({working_dir}). "
        "Configure it to something sensible for your machine if needed."
    )

In [None]:
# @title Install `llama-cpp` and download model
%%capture

from huggingface_hub import hf_hub_download

# install llama_cpp if not already
!python3 -c 'from llama_cpp import Llama' 2> /dev/null || (CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python'==0.2.53')
from llama_cpp import Llama

use_model = "openhermes-2.5-mistral-7b.Q5_K_M"
repo_id = "TheBloke/OpenHermes-2.5-Mistral-7B-GGUF"

model_filename = hf_hub_download(
    repo_id=repo_id,
    filename=use_model + ".gguf",
    repo_type="model",
    local_dir=working_dir,
    token=False,
)

In [None]:
# @title Instantiate the local LLM
%%capture

llm = Llama(
    model_path=model_filename,
    n_gpu_layers=-1,
    n_ctx=8000,
    random_seed=42,
)
llm.verbose=False

In [None]:
# @title Define helper functions
from pprint import pprint, pp, pformat

# Default chat prompt: a fixed system turn, a user turn whose `{prompt}`
# placeholder is filled in by `local_llm`, and an opened assistant turn for the
# model to complete. Turns are delimited by <|im_start|> / <|im_end|> markers.
template = """<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
{prompt}<|im_end|>
<|im_start|>assistant
"""

def local_llm(prompt, verbose=False, apply_template=True, temperature=0.7, max_tokens=None):
    """Run one completion on the notebook's global ``llm`` and return its text.

    Args:
        prompt: The user message, or a fully formatted prompt when
            ``apply_template`` is False.
        verbose: When True, print the exact prompt sent to the model.
        apply_template: When True, insert ``prompt`` into the global
            ``template`` before calling the model.
        temperature: Sampling temperature forwarded to ``llm``.
        max_tokens: Completion length limit forwarded to ``llm``.

    Returns:
        The ``"text"`` field of the first entry in the response's ``"choices"``.
    """
    final_prompt = template.format(prompt=prompt) if apply_template else prompt
    if verbose:
        print(f"Prompt:\n{final_prompt}")
    completion = llm(
        final_prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=0.95,
    )
    first_choice = completion["choices"][0]
    return first_choice["text"]


# Prompting basics

In [None]:
# First end-to-end call: `local_llm` applies the default chat template and its
# default temperature (0.7).
response = local_llm(
    "Write promotional material for a workshop demonstrating use cases of open-source large language models"
)
print(response)

## Chat templates

In [None]:
# verbose=True prints the fully templated prompt before the model's answer,
# so the effect of the chat template is visible.
response = local_llm(
    "In what city is Campus Fryslan located?",
    verbose=True,
)
print(response)

In [None]:
# Same question sent raw (apply_template=False), to compare against the
# templated call in the previous cell.
response = local_llm(
    "In what city is Campus Fryslan located?",
    apply_template=False,
    verbose=True,
    temperature=0.0
)
print(response)

In [None]:
# Hand-written chat prompt with a custom system message. The chat markers are
# already present, so `local_llm` is called with apply_template=False below.
formatted_prompt = """<|im_start|>system
You are a helpful assistant that answers in the style of a pirate.<|im_end|>
<|im_start|>user
In what city is Campus Fryslan located?<|im_end|>
<|im_start|>assistant
"""


response = local_llm(
    formatted_prompt,
    apply_template=False,
    verbose=True,
    temperature=0.0
)
print(response)

## Temperature

In [None]:
# Prompt reused by the following cells; this first run uses temperature=0.0.
prompt = """
I'm organizing a workshop on using LLMs to extract structured information from texts / corpora for non-technical researchers at a university.
Suggest a few catchy titles, free of jargon.
"""

response = local_llm(prompt, temperature=0.0)
print(response)

In [None]:
# Repeat the temperature=0.0 call to compare against the previous cell's output.
response = local_llm(prompt, temperature=0.0)
print(response)

In [None]:
# Same prompt with the temperature raised to 0.9.
response = local_llm(prompt, temperature=0.9)
print(response)

In [None]:
# Second temperature=0.9 run, to compare against the previous cell's output.
response = local_llm(prompt, temperature=0.9)
print(response)

## Number of input / output tokens

- What is a token?


In [None]:
# Limit the completion to 20 tokens; `prompt` is still the workshop-title
# prompt defined in the Temperature section.
response = local_llm(prompt, max_tokens=20)
print(response)

In [None]:
# Download Project Gutenberg e-book #100 (pg100.txt) into `working_dir` and
# read the whole file into memory as one string.
# (A `%%capture` magic was left disabled here, so wget's output is shown.)
!wget https://www.gutenberg.org/cache/epub/100/pg100.txt -P $working_dir
long_text = (working_dir / "pg100.txt").read_text(encoding="utf-8")


In [None]:
# Build a summarization prompt that embeds the entire downloaded text.
long_prompt = "Please summarize the following: \n" + long_text
# NOTE(review): the call below is left disabled — presumably because the text is
# far longer than the n_ctx=8000 context configured for `llm`; confirm before enabling.
# response = local_llm(long_prompt)

# Prompt Engineering 101


In [None]:
# @title Zero-shot prompting
# No worked examples are supplied: the prompt contains only the review to
# label and an open `Sentiment: ?` slot.
prompt = """
Review: I think the workshop was okay.
Sentiment: ?
"""
print(local_llm(prompt))

In [None]:
# @title Few-shot prompting
# Two labelled reviews precede the one to classify, so the model can pick up
# the expected `Review:` / `Sentiment:` format from the examples.
prompt = """
Review: The workshop was enlightening! Engaging speaker, loads of insights. Excited to apply learnings!
Sentiment: Positive

Review: LLM workshop disappointed. Speaker unprepared, content basic. Would not recommended
Sentiment: Negative

Review: LLM workshop was fantastic! Expert speaker, hands-on activities. Left feeling inspired!
Sentiment: ?
"""
# Print like the other cells do; a bare expression would show the escaped repr.
print(local_llm(prompt))

In [None]:
# @title Chain-of-thought prompting

# Baseline: the question is asked directly, with no step-by-step instruction.
prompt = """I need 10 eggs to make a cake.
I have one egg in my fridge.
I went to the market and bought two cartons with four eggs each.

Do I have enough eggs now?
"""
print(local_llm(prompt, temperature=0.0))

In [None]:
# Same puzzle, now ending with a `Think step by step.` instruction.
prompt =  """
I need 10 eggs to make a cake.
I have one egg in my fridge.
I went to the market and bought two cartons with four eggs each.
Do I have enough eggs now?

Think step by step.
"""
print(local_llm(prompt, temperature=0.0))

In [None]:
# Variant that asks for every intermediate step to be explained before the
# final answer is given.
prompt =  """
I need 10 eggs to make a cake.
I have one egg in my fridge.
I went to the market and bought two cartons with four eggs each.

Do I have enough eggs now?
Think step by step.
Explain each intermediate step.
Only when you are done with all your steps,
provide the answer based on your intermediate steps.
"""
print(local_llm(prompt, temperature=0.0))

In [None]:
# Variant that asks for the yes/no answer first and the explanation afterwards.
prompt = """
I need 10 eggs to make a cake.
I have one egg in my fridge.
I went to the market and bought two cartons with four eggs each.

Do I have enough eggs now?
Think step by step.
Provide the answer as a single yes/no answer first.
Then explain each intermediate step.
"""
print(local_llm(prompt, temperature=0.0))

Now try to experiment with your own prompt! Note that smaller language models work best on "interpolation" - analyzing within the context rather than generating completely new text.

Some tasks to consider
- Classifying a piece of text, supplying a list of possible labels.
- Extracting information from a piece of text, e.g. certain characteristics associated with a person in the text
- Summarizing

Some tips:
- Use delimiters to separate parts of your input
- Give specific instructions
- Rerun the prompt a few times to get an idea of the variance of the responses
- Investigate the effect of encouraging chain-of-thought


If you're unable to get good results, it might very well be due to the limitations of the model we're using here! As a sanity check, you can run your prompt on more powerful models here: https://chat.lmsys.org/?single&model=llama-2-70b-chat (this link loads a 70B Llama 2 model by default, but commercial, closed-source models are available as well).

In [None]:
# Scratch cell: write your own prompt between the triple quotes and run it.
prompt = """
"""
print(local_llm(prompt))


# Scaling up

- Prompt template
- Structure output
- Retry until structure is valid


In [None]:
# @title Fetch Data and Load Into Pandas
# @markdown Based on *Powell-Smith A., Centre for Public Data, Analysis of Unanswered Questions in the UK Parliament (2022), GitHub repository, https://github.com/centreforpublicdata/written-answers.*

%%capture

!wget "http://datascience.web.rug.nl/llm_parliamentary_sample.csv" -P $working_dir

import pandas as pd
df = pd.read_csv(working_dir / "llm_parliamentary_sample.csv")

In [None]:
# Preview the first rows whose `votes_diff` is positive.
df.query("votes_diff > 0").head()

In [None]:
# Keep the first row with a positive `votes_diff` as the worked example for the
# prompt-template cells that follow.
first_row = df.query("votes_diff > 0").iloc[0]

## Prompt templates, structuring outputs

- Obtain machine-parseable outputs by requesting a JSON object

In [None]:

# Reusable prompt: `{question}` and `{answer}` are filled in per row with
# str.format; the doubled braces render as literal `{` / `}` in the JSON
# example. The field is named `final_answer` throughout, so the instruction
# matches both the example object and the key the parsing helpers look up.
prompt_template = """
I will provide you a question and a response given in a parliamentary setting.

The question:
*********
{question}
*********

The answer:
*********
{answer}
*********

Does the response sufficiently answer the question?

Return your answer as a valid JSON object with a single field `final_answer` with
a boolean value with your final answer, like {{"final_answer": …}}.
"""

prompt = prompt_template.format(
    question=first_row["question_text"].strip(),
    answer=first_row["answer_text"].strip()
)

# Plain run: ask for the JSON verdict directly.
response = local_llm(prompt)
print("\nLLM answer: ")
print(response)

# Chain-of-thought run: append the instruction after a blank line ("\n\n").
response = local_llm(prompt + "\n\nThink step by step")
print("\nLLM answer (CoT): ")
print(response)

- Sometimes, chain-of-thought can be prompted more effectively by "jumpstarting" the agent's response

In [None]:
# Fully formatted chat prompt (system + user turns) that also opens the
# assistant turn with "Let's think step by step:", so the model continues from
# there. `{question}` / `{answer}` are filled in with str.format; the doubled
# braces render as literal braces. The field is named `final_answer`
# throughout, matching the example object and the key the parser expects.
formatted_prompt_template = """<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user

I will provide you a question and a response given in a parliamentary setting.

The question:
```
{question}
```

The answer:
```
{answer}
```

Does the response sufficiently answer the question?

Return your answer as a valid JSON object with a single field `final_answer` with
a boolean value with your final answer, like {{"final_answer": …}}.
<|im_end|>
<|im_start|>assistant
Let's think step by step:
"""

formatted_prompt = formatted_prompt_template.format(
    question=first_row["question_text"].strip(),
    answer=first_row["answer_text"].strip()
)
print(formatted_prompt)

In [None]:
# `formatted_prompt` already contains the chat markers, so the default template
# must not be applied a second time.
response = local_llm(formatted_prompt, apply_template=False)
print(response)

# Parsing the answer from the response

In [None]:
# @title Define helper functions


import re
import json
from json import JSONDecodeError

from tqdm import tqdm

# Shortest `{ ... }` span; DOTALL lets the span run across line breaks.
json_expression = re.compile(r"\{.+?\}", re.DOTALL)


def can_parse(model_output, output_arguments, output_types=None):
    """Check whether ``model_output`` holds exactly one usable JSON object.

    Args:
        model_output: Raw text produced by the model.
        output_arguments: Keys that must be present in the decoded object.
        output_types: Optional mapping from key to the type (or tuple of
            types) its value must be an instance of.

    Returns:
        True when exactly one ``{...}`` span is found, it decodes as JSON,
        every required key is present and every constrained value has the
        requested type; False otherwise.
    """
    type_constraints = dict() if output_types is None else output_types

    candidates = json_expression.findall(model_output)
    if len(candidates) != 1:
        # Zero or several brace spans: ambiguous, treat as unparseable.
        return False

    try:
        parsed = json.loads(candidates[0])
        for key in output_arguments:
            field = parsed[key]  # a missing key raises KeyError, handled below
            if key in type_constraints and not isinstance(field, type_constraints[key]):
                return False
    except (JSONDecodeError, KeyError):
        return False
    return True

def parse_output(model_output):
    """Decode the first ``{...}`` span found in ``model_output`` as JSON.

    Intended for outputs that already passed ``can_parse``; raises
    ``IndexError`` when no span is found and ``JSONDecodeError`` when the span
    is not valid JSON.
    """
    first_span = json_expression.findall(model_output)[0]
    return json.loads(first_span)


def annotation_loop(
    input_df, apply_template, expected_keys, expected_types=None, n_retries=10
):
    """Query the model for each row's prompt, retrying rows that fail to parse.

    Works on a copy of ``input_df``, which must contain a ``prompt`` column.
    Each pass re-queries only the rows whose latest response did not pass
    ``can_parse``; the loop stops early once every row is parseable.

    Args:
        input_df: DataFrame with a ``prompt`` column.
        apply_template: Forwarded to ``local_llm``.
        expected_keys: Keys that must be present in each JSON answer; each one
            becomes a column of the result.
        expected_types: Optional key-to-type mapping forwarded to ``can_parse``.
        n_retries: Maximum number of passes over the unparsed rows.

    Returns:
        The copy with added ``can_parse`` and ``response`` columns plus one
        column per expected key (filled only for parseable rows).
    """
    annotated = input_df.copy()
    annotated["can_parse"] = False

    for _attempt in range(n_retries):
        pending = ~annotated["can_parse"]
        pending_prompts = annotated.loc[pending, "prompt"]
        annotated.loc[pending, "response"] = [
            local_llm(text, apply_template=apply_template)
            for text in tqdm(pending_prompts)
        ]
        annotated.loc[pending, "can_parse"] = annotated.loc[pending, "response"].apply(
            can_parse, args=(expected_keys, expected_types)
        )
        if annotated["can_parse"].all():
            break

    # Decode the successful responses once, then fan the keys out into columns.
    succeeded = annotated["can_parse"]
    annotated.loc[succeeded, "json"] = annotated.loc[succeeded, "response"].apply(
        parse_output
    )
    for key in expected_keys:
        annotated.loc[succeeded, key] = annotated.loc[succeeded, "json"].apply(
            lambda parsed: parsed[key]
        )
    return annotated.drop(columns="json")


In [None]:
# Evaluation sample: the five rows with the lowest and the five rows with the
# highest `votes_diff`.
df_sampled = pd.concat(
    (
        df.sort_values("votes_diff").iloc[:5],
        df.sort_values("votes_diff", ascending=False).iloc[:5],
    )
).copy()


n_retries = 10

expected_keys = ["final_answer"]
expected_types = {"final_answer": bool}

# Build every prompt in one column assignment instead of writing row by row
# through `.loc` inside `iterrows()`; this also stays correct if the sample
# contains repeated index labels.
df_sampled["prompt"] = [
    formatted_prompt_template.format(question=question.strip(), answer=answer.strip())
    for question, answer in zip(df_sampled["question_text"], df_sampled["answer_text"])
]

print(df_sampled["prompt"].iloc[0])

In [None]:
# Prompts already carry the chat markers, hence apply_template=False. Pass the
# `n_retries` configured in the previous cell so that setting takes effect.
df_annotated = annotation_loop(
    df_sampled,
    apply_template=False,
    expected_keys=expected_keys,
    expected_types=expected_types,
    n_retries=n_retries,
)

In [None]:
# Show the parsed model verdict next to `votes_diff` for each sampled row.
df_annotated[["final_answer", "votes_diff"]]

In [None]:
# Inspect one annotated row: the model's response, then the question and the
# answer it was asked to judge, each followed by a blank line.
selected_index = 533

for column in ("response", "question_text", "answer_text"):
    print(df_annotated.loc[selected_index, column], end="\n\n")
