# ContextBased

In [1]:
from transformers import pipeline
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    AutoModelForTokenClassification,
)

import torch

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

import os
from dotenv import load_dotenv
from pathlib import Path
from tqdm import tqdm

# Load variables
load_dotenv()
# change dir root (one above)
access_key = os.getenv("HUGGING_FACE")
root_dir = os.getcwd()
model_dir = Path(root_dir, "models")
articles_dir = Path(root_dir, "Articles")


# Assuming you have the `access_key` for authentication with Hugging Face API

## NER
ner_tokenizer = AutoTokenizer.from_pretrained(
    "dslim/bert-base-NER", cache_dir=model_dir
)
ner_model = AutoModelForTokenClassification.from_pretrained(
    "dslim/bert-base-NER", cache_dir=model_dir
).to("cuda")

nlp = pipeline("ner", model=ner_model, tokenizer=ner_tokenizer)
example = "Give me a resume of Tom Holland's acting career"

ner_results = nlp(example)
print(ner_results)


def extract_entities(entities):
    """
    Extracts persons and locations from a list of entities recognized by an entity recognition system.

    Parameters:
    - entities (list of dicts): The output from an entity recognition system, where each dictionary
      contains details about the recognized entity, including its type (person or location).

    Returns:
    - tuple of two lists: (persons, locations) where each is a list of extracted entity names.
    """
    # Initialize lists to hold persons and locations
    persons = []
    locations = []

    # Temporary variables to construct full names and locations
    current_person = ""
    current_location = ""

    # Iterate over each entity to extract and construct full names and locations
    for entity in entities:
        if entity["entity"].startswith("B-PER"):
            # If there's a current person, append it to persons before starting a new one
            if current_person:
                persons.append(current_person.strip())
                current_person = ""
            current_person += entity["word"].lstrip("##")
        elif entity["entity"].startswith("I-PER") and current_person:
            # Handle token splitting for names correctly
            if entity["word"].startswith("##"):
                current_person += entity["word"].replace("##", "")
            else:
                current_person += " " + entity["word"]
        elif entity["entity"].startswith("B-ORG"):
            # Similarly, for locations
            if current_location:
                locations.append(current_location.strip())
                current_location = ""
            current_location += entity["word"].lstrip("##")
        elif entity["entity"].startswith("I-ORG") and current_location:
            if entity["word"].startswith("##"):
                current_location += entity["word"].replace("##", "")
            else:
                current_location += " " + entity["word"]

    # Append any remaining entities to their respective lists
    if current_person:
        persons.append(current_person.strip())
    if current_location:
        locations.append(current_location.strip())
    words_to_remove = ["like", "the", "and", "or", "but", "so", "for", "in", "at", "on"]
    cleaned_locations = []
    for location in locations:
        cleaned_location = " ".join(
            [word for word in location.split() if word.lower() not in words_to_remove]
        )
        cleaned_locations.append(cleaned_location)
    return persons, cleaned_locations


persons, locations = extract_entities(ner_results)

print(f"Persons: {persons}")
print(f"Locations: {locations}")


## Get context

from langchain_community.tools.wikidata.tool import WikidataAPIWrapper, WikidataQueryRun

wikidata = WikidataQueryRun(api_wrapper=WikidataAPIWrapper())


results = []
# Iterate through each person and location
for item in tqdm(persons + locations, desc=f"Querying Wikidata"):
    # Run the command and append the output to the results list
    result = wikidata.run(item)
    results.append(result)
print(results)
# Load your chosen model
model_name = "HuggingFaceH4/zephyr-7b-alpha"
question_tokenizer = AutoTokenizer.from_pretrained(
    model_name, token=access_key, cache_dir=model_dir
)
question_model = AutoModelForCausalLM.from_pretrained(
    model_name, token=access_key, cache_dir=model_dir
).to(device)

# Define the text generation pipeline
text_generation = pipeline(
    "text-generation",
    model=question_model,
    tokenizer=question_tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

# Define your prompt with the context
context = str(results)
query = example

prompt = f"""
You are an AI Assistant that follows instructions extremely well.
Please be truthful and give direct answers

{context}

{query}
"""

# Generate response
response = text_generation(prompt, max_length=1024, temperature=0.7)[0][
    "generated_text"
]

# Print or process the response as needed
print(response)


cuda


Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'entity': 'B-PER', 'score': 0.9997329, 'index': 6, 'word': 'Tom', 'start': 20, 'end': 23}, {'entity': 'I-PER', 'score': 0.9996735, 'index': 7, 'word': 'Holland', 'start': 24, 'end': 31}]
Persons: ['Tom Holland']
Locations: []


Querying Wikidata: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:11<00:00, 11.27s/it]


['Result Q2023710:\nLabel: Tom Holland\nDescription: British actor\nAliases: Thomas Stanley Holland\ninstance of: human\ncountry of citizenship: England, United Kingdom\noccupation: voice actor, stage actor, film actor, actor, dancer, television actor\nsex or gender: male\ndate of birth: 1996-06-01\nplace of birth: Kingston upon Thames\neducated at: BRIT School for Performing Arts and Technology, Wimbledon College, Richard Challoner School, Donhead Preparatory School\nfield of work: acting\nnotable work: The Impossible, Spider-Man, Avengers: Infinity War, In the Heart of the Sea, Avengers: Endgame, Spies in Disguise, Onward, Uncharted, Captain America: Civil War\nfather: Dominic Holland\n\nResult Q1340923:\nLabel: Tom Holland\nDescription: American scriptwriter and director (born 1943)\nAliases: Thomas Lee Holland, Thomas Holland\ninstance of: human\ncountry of citizenship: United States of America\noccupation: film director, lawyer, screenwriter, film actor, director, actor, film prod

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 224.00 MiB. GPU 0 has a total capacity of 31.73 GiB of which 40.69 MiB is free. Process 8596 has 9.98 GiB memory in use. Process 20423 has 754.00 MiB memory in use. Process 9094 has 528.00 MiB memory in use. Including non-PyTorch memory, this process has 20.46 GiB memory in use. Of the allocated memory 20.08 GiB is allocated by PyTorch, and 79.27 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

## RAG


In [2]:
from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms import HuggingFaceHub
from langchain.chains import RetrievalQA
from pathlib import Path
import os
import requests
import wikipediaapi

from dotenv import load_dotenv

# Load variables
load_dotenv()
access_key = os.getenv("HUGGING_FACE")
root_dir = os.path.abspath(os.getcwd())
model_dir = Path(root_dir, "models")
articles_dir = Path(root_dir, "Articles")


# Load text document from file system
def load_text_document(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        content = file.read()
    return content


# Function to get Wikipedia page title from Wikidata QID
def get_wikipedia_title(qid, user_agent):
    headers = {"User-Agent": user_agent}
    url = f"https://www.wikidata.org/w/api.php?action=wbgetentities&ids={qid}&props=sitelinks&format=json"
    response = requests.get(url, headers=headers)
    data = response.json()
    if "sitelinks" in data.get("entities", {}).get(qid, {}):
        sitelinks = data["entities"][qid]["sitelinks"]
        if "enwiki" in sitelinks:
            return sitelinks["enwiki"]["title"]
    return None


# Function to fetch Wikipedia article content based on page title
def fetch_wikipedia_article(title, num_words, user_agent):
    headers = {"User-Agent": user_agent}
    wiki_wiki = wikipediaapi.Wikipedia("en", headers=headers)
    page = wiki_wiki.page(title)
    if page.exists():
        article_content = page.text
        # Extract specified number of words
        words = article_content.split()[:num_words]
        return " ".join(words)
    return None


# Function to download Wikipedia articles
def download_wikipedia_article(qid, user_agent, num_words_to_save):
    title = get_wikipedia_title(qid, user_agent)
    if title:
        content = fetch_wikipedia_article(title, num_words_to_save, user_agent)
        if content:
            # Save article content to disk
            root_dir = Path(".")
            directory = root_dir / "Articles"
            os.makedirs(directory, exist_ok=True)
            filename = directory / f"{title}.txt"
            with open(filename, "w", encoding="utf-8") as file:
                file.write(content)
        else:
            print("Failed to fetch article content.")
    else:
        print(f"No Wikipedia article found for QID: {qid}")


# Define folder path
folder_path = "/work3/s174159/ET_LLM_RAG/Articles/"

# Encode articles in the folder
articles = []
for file_name in os.listdir(folder_path):
    if file_name.endswith(".txt"):
        file_path = os.path.join(folder_path, file_name)
        content = load_text_document(file_path)
        articles.append(content)

# Split text into chunks for each article
text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=1)
chunking = [text_splitter.split_text(article) for article in articles]


# Define a class to represent documents
class Document:
    def __init__(self, page_content, metadata=None):
        self.page_content = page_content
        self.metadata = metadata


# Convert dictionaries into objects with 'page_content' attribute
documents = [Document(chunk, metadata=None) for chunks in chunking for chunk in chunks]

# Embed chunks using the specified embedding model
access_key = os.getenv("HUGGING_FACE")
embeddings = HuggingFaceInferenceAPIEmbeddings(
    api_key=access_key, model_name="BAAI/bge-base-en-v1.5"
)
vectorstore = Chroma.from_documents(documents, embeddings)

# Define retriever
retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs={"k": 2})

# Perform retrieval-based question answering task
query = "Can you give a list of supervised students of Lars Kai Hansen"
docs_rel = retriever.get_relevant_documents(query)

# Generate response to a given query using augmented knowledge base
model = HuggingFaceHub(
    repo_id="HuggingFaceH4/zephyr-7b-alpha",
    model_kwargs={
        "temperature": 0.1,
        "max_new_tokens": 1024,
        "max_length": 512,
        "cache_dir": model_dir,
    },
    huggingfacehub_api_token=access_key,
)
qa = RetrievalQA.from_chain_type(llm=model, retriever=retriever)
prompt = f"""
You are an AI Assistant that follows instructions extremely well.
Please be truthful and give direct answers
</s>

{query}
</s>
"""
response = qa(prompt)
print(response["result"])


  warn_deprecated(
  warn_deprecated(


TypeError: Object of type PosixPath is not JSON serializable