# This notebook only runs on macOS with Apple Silicon (ARM64)

## Import llama3-8b

In [1]:
import time
from mlx_lm import load, generate
# Load the model and tokenizer that the helper functions below pass to `generate`.
model, tokenizer = load("mlx-community/Meta-Llama-3-8B-Instruct-4bit")

Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

## Example

In [2]:
# Few-shot system prompt for entity/relationship extraction.
# Every line ends with an explicit "\n": adjacent string literals are joined
# with no separator, so without it the examples run into each other
# (e.g. "...where is china?your answer: ...").
SYSTEM_MSG = ("You are an assistant that detects entities and their relationships in questions, for example:\n"
              "user question: where is china?\n"
              "your answer: [(china)(located in)(?)]\n"
              "user question: where is USA?\n"
              "Your answer: [(USA)(located in)(?)]\n"
              "user question: where is UK's capital?\n"
              "your answer: [(UK's capital)(located in)(?)]")

def generate_entity_response(promptStr, maxTokens=100):
    """Ask the model to extract entities and relationships from a question.

    The module-level ``SYSTEM_MSG`` is looked up when the function is called,
    so re-assigning ``SYSTEM_MSG`` later changes the system prompt used here.

    Args:
        promptStr: The user's question, sent as the "user" chat message.
        maxTokens: Upper bound on generated tokens, forwarded to ``generate``.

    Returns:
        Whatever ``generate`` returns for the rendered chat prompt.
    """
    messages = [
        {"role": "system", "content": SYSTEM_MSG},
        {"role": "user", "content": promptStr},
    ]
    # Render the chat template directly to text. Tokenising and then decoding
    # the ids is an unnecessary round trip that is not guaranteed to give back
    # exactly the template text.
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Generate the response
    return generate(model, tokenizer, prompt=prompt, max_tokens=maxTokens)


# Run the extractor on two sample questions and print each raw answer.
for user_question in ("Where is China?", "where is japan's capital"):
    response = generate_entity_response(user_question)
    print(response)

[(China)(located in)(?)]
[(Japan's capital)(located in)(?)]


## Train a llama which can learn rdf
#### First, initialise the database clients and fetch all RDF data

In [3]:
from jena.fuseki_client import JenaClient
from mongoDB.mongoDB_client import init_db, MongoDBInterface
# Open the local MongoDB connection and wrap the two handles returned by init_db.
db, fs = init_db("mongodb://localhost:27017")
db_interface = MongoDBInterface(db, fs)

# Client for the local Jena endpoint, bound to the "test" dataset.
jena_client = JenaClient(dataset='test', jena_url='http://127.0.0.1:3030')

In [4]:
import json
# Select every triple in the dataset; `code` is compared with 200 further down
# before `text` is used.
code, text = jena_client.execute_sparql_query_global(
    "SELECT * WHERE { ?sub ?pred ?obj .}"
)

def rdf_to_natural_language(rdf_data):
    """Render triples as one "subject predicate object." sentence per line.

    Each term is reduced to the text after its last '/'. Underscores in the
    predicate are shown as spaces.

    Args:
        rdf_data: Iterable of 3-item rows of strings. Rows are assumed to be
            ordered (subject, predicate, object), matching the
            ``?sub ?pred ?obj`` projection of the SPARQL query -- TODO confirm
            against the Jena client's return format.

    Returns:
        The sentences joined with newlines ("" when ``rdf_data`` is empty).
    """
    def local_name(term):
        # Keep only the part after the last '/', e.g. ".../resource/Paris" -> "Paris".
        return term.split('/')[-1]

    # Unpack as (s, p, o). The previous (s, o, p) unpacking used the third
    # element as the predicate and the second as the object.
    return "\n".join(
        f"{local_name(s)} {local_name(p).replace('_', ' ')} {local_name(o)}."
        for s, p, o in rdf_data
    )

# Build the text rendering only when the query's status code is 200;
# otherwise fall back to an empty string.
rdf_to_nl = rdf_to_natural_language(text) if code == 200 else ""
    

In [5]:
# Load the model and tokenizer (same checkpoint identifier as in the first cell).
model, tokenizer = load("mlx-community/Meta-Llama-3-8B-Instruct-4bit")

# Prepare the initial context: fixed instructions followed by the RDF text.
# Note: this rebinds SYSTEM_MSG, which generate_entity_response reads when it
# is called.
SYSTEM_MSG = (
    "You are a knowledgeable assistant who answers questions based on the provided data, "
    "If the user's question is out of scope for this dataset, you should only answer: Sorry, this question is out of scope."
    f"\n\nHere is the data:\n{rdf_to_nl}"
)


def generate_response(question, initial_context, max_tokens=150):
    """Answer a question using the given system context.

    Args:
        question: The user's question, sent as the "user" chat message.
        initial_context: Text sent as the "system" chat message.
        max_tokens: Upper bound on generated tokens, forwarded to ``generate``.

    Returns:
        Whatever ``generate`` returns for the rendered chat prompt.
    """
    messages = [
        {"role": "system", "content": initial_context},
        {"role": "user", "content": question},
    ]

    # Render the chat template directly to text. Tokenising and then decoding
    # the ids is an unnecessary round trip that is not guaranteed to give back
    # exactly the template text.
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Generate the response
    return generate(model, tokenizer, prompt=prompt, max_tokens=max_tokens)

# Example question.
user_question = "Where is country1?"
response = generate_response(question=user_question, initial_context=SYSTEM_MSG)
print(f"question: {user_question}\nLlama response: {response}\n")

# Example of a question that is out of scope for the data.
user_question = "Where is USA"
response = generate_response(question=user_question, initial_context=SYSTEM_MSG)
print(f"question: {user_question}\nLlama response: {response}")

Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

question: Where is country1?
Llama response: Sorry, this question is out of scope.

question: Where is USA
Llama response: According to the provided data, USA is mentioned in the following statements:

* Nicholas_J._Sinnott rdf-schema#comment: He was later appointed by President Calvin Coolidge to be a Judge on the Court of Claims, serving from 1928 to 1929.
* Redfish_Lake rdf-schema#comment: It is the largest lake within the Sawtooth National Recreation Area.
* Interchange_Cable_Network rdf-schema#comment: The first cable, ICN1 (Interchange Cable Network 1) links Fiji to Vanuatu and has been in service since 15 January 2014.


In [6]:
question = "i want some information of country 1"

# Apply the chat template to a user-only message list and decode the ids back
# to text.
# NOTE(review): `messages`, `input_ids` and `prompt` are not passed to
# generate_response below, which builds its own prompt from `question` and
# SYSTEM_MSG.
messages = [{"role": "user", "content": question}]
input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
prompt = tokenizer.decode(input_ids)

response = generate_response(question=question, initial_context=SYSTEM_MSG)
print(response)

Sorry, this question is out of scope.


## version 2

In [7]:
# import json
# from mlx_lm import load, generate
# 
# # Fetch the RDF data from the Jena client
# code, text = jena_client.execute_sparql_query_global("SELECT * WHERE { ?sub ?pred ?obj .}")
# 
# # Initialise the LLaMA model
# rdf_descriptions=""
# if code == 200:
#     json_object = json.loads(text)
#     result = json_object['results']['bindings']
#     rdf_descriptions = rdf_to_natural_language(result)
# 
#     model, tokenizer = load("mlx-community/Meta-Llama-3-8B-Instruct-4bit")
# 
# # Prepare the initial context
# def generate_system_message_chunk(chunk):
#     return (f"You are a knowledgeable assistant who answers questions based on the provided data. "
#             f"If the user's question is out of scope for this dataset, you should only answer: Sorry, this question is out of scope."
#             f"\n\nHere is the data:\n{chunk}")
# 
# # Generate the response chunk by chunk
# def generate_response_in_chunks(question, rdf_data, chunk_size=50, max_tokens=100):
#     responses = []
#     
#     
#     for i in range(0, len(rdf_data), chunk_size):
#         chunk = "\n".join(rdf_data[i:i + chunk_size])
#         system_message = generate_system_message_chunk(chunk)
# 
#         messages = [
#             {"role": "system", "content": system_message},
#             {"role": "user", "content": question},
#         ]
# 
#         input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
#         prompt = tokenizer.decode(input_ids)
# 
#         # response
#         response = generate(model, tokenizer, prompt=prompt, max_tokens=max_tokens)
#         responses.append(response['generated_text'])
#     
#     # combine response
#     final_response = " ".join(responses)
#     return final_response
# 
# # example 
# user_question = "Where is country1?"
# response = generate_response_in_chunks(user_question, rdf_descriptions)
# print(f"Question: {user_question}\nLlama response: {response}\n")
# 
# # example question that is out of scope
# user_question = "Where is USA?"
# response = generate_response_in_chunks(user_question, rdf_descriptions)
# print(f"Question: {user_question}\nLlama response: {response}")

## Evaluation

#### split dataset

In [8]:
import os
import psutil
from rdflib import Graph

def check_memory_usage():
    """Print the current process's RSS memory figure in MB (bytes / 1024**2)."""
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    print(f"Memory Usage: {rss_bytes / 1024 ** 2:.2f} MB")
    


def split_rdf_file(input_file_path, split_sizes, output_dir="rdf_data/test_dataset"):
    """Split a Turtle file into consecutive segments of the requested sizes.

    The file is parsed into a graph, its triples are listed in the graph's
    iteration order, and consecutive slices of that list are written to
    ``<output_dir>/segment_<k>.ttl`` with k starting at 1.

    Args:
        input_file_path: Path of the Turtle (``.ttl``) file to split.
        split_sizes: Number of triples for each segment, in order. If the
            sizes add up to more triples than the file holds, the trailing
            segments are shorter (possibly empty) because slicing is used.
        output_dir: Directory the segment files are written to. Defaults to
            the location that used to be hard-coded; it is created if missing.
    """
    from itertools import accumulate

    # Load the RDF data (the input is parsed as Turtle).
    g = Graph()
    g.parse(input_file_path, format="ttl")

    # Materialise the triples so they can be sliced by position.
    triples = list(g)

    # Create the destination up front so serialising does not depend on the
    # directory already existing.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    start_index = 0
    # The running totals of split_sizes are the exclusive end index of each segment.
    for segment_no, end_index in enumerate(accumulate(split_sizes), start=1):
        split_graph = Graph()
        for triple in triples[start_index:end_index]:
            split_graph.add(triple)

        # Save the segment to its own file.
        destination = os.path.join(output_dir, f"segment_{segment_no}.ttl")
        split_graph.serialize(destination=destination, format="ttl")

        # The next segment starts where this one ended.
        start_index = end_index

    print("Splitting complete. Segments saved to segment_*.ttl files.")


In [9]:
# Split the dataset. TODO: state where "rdf_data/short-abstracts_lang=en.ttl" is available.
# split_sizes = [10, 50, 100]  # Sizes for each segment
# split_rdf_file("rdf_data/short-abstracts_lang=en.ttl", split_sizes)

## Response Time Test

In [10]:
def transfer_graph_to_nl(testData_path):
    """Parse a Turtle file and render each triple as "subject predicate object.".

    Args:
        testData_path: Path of the Turtle (``.ttl``) file to read.

    Returns:
        One sentence per triple, joined with newlines ("" for an empty graph).
    """
    from rdflib import URIRef

    def label(term):
        # IRIs are shortened to their last path segment, with '_' shown as a
        # space. Any other term (e.g. a literal) is kept whole so that text
        # containing '/' is not cut off at the last slash.
        if isinstance(term, URIRef):
            return term.split('/')[-1].replace('_', ' ')
        return str(term)

    g = Graph()
    g.parse(testData_path, format="ttl")

    # Iterating an rdflib Graph yields (subject, predicate, object). The
    # previous (s, o, p) unpacking put the object text before the predicate.
    return "\n".join(f"{label(s)} {label(p)} {label(o)}." for s, p, o in g)
# Read the three test segments; the variables are named after the intended
# segment sizes (10, 50 and 100).
data_10, data_50, data_100 = map(
    transfer_graph_to_nl,
    (
        "rdf_data/test_dataset/segment_1.ttl",
        "rdf_data/test_dataset/segment_2.ttl",
        "rdf_data/test_dataset/segment_3.ttl",
    ),
)

In [11]:
# Response-time test 1: system message built from data_10.
SYSTEM_MSG1= (f"You are a knowledgeable assistant who answers questions based on the provided data, "
              f"If the user's question is out of scope for this dataset, you should only answer: Sorry, this question is out of scope."
              f"\n\nHere is the data:\n{data_10}")
user_question = "where is Redfish Lake?"
# perf_counter() is the clock intended for measuring short intervals;
# time.time() follows the wall clock, which can be adjusted while the model runs.
start_time = time.perf_counter()
response = generate_response(user_question, SYSTEM_MSG1)
end_time = time.perf_counter()
elapsed_time = end_time - start_time
print(f"Response Time: {elapsed_time:.4f} Second")
print(f"question: {user_question}\nLlama response: {response}\n")

Response Time: 4.8251 Second
question: where is Redfish Lake?
Llama response: Redfish Lake is located in Custer County, Idaho, just south of Stanley.



In [12]:
# Response-time test 2: system message built from data_50.
SYSTEM_MSG2= (f"You are a knowledgeable assistant who answers questions based on the provided data, "
              f"If the user's question is out of scope for this dataset, you should only answer: Sorry, this question is out of scope."
              f"\n\nHere is the data:\n{data_50}")
user_question = "where is Chamisso Wilderness?"
# perf_counter() is the clock intended for measuring short intervals;
# time.time() follows the wall clock, which can be adjusted while the model runs.
start_time = time.perf_counter()
response = generate_response(user_question, SYSTEM_MSG2)
end_time = time.perf_counter()
elapsed_time = end_time - start_time
print(f"Response Time: {elapsed_time:.4f} Second")
print(f"question: {user_question}\nLlama response: {response}\n")

Response Time: 16.5193 Second
question: where is Chamisso Wilderness?
Llama response: Sorry, this question is out of scope.



In [13]:
# Response-time test 3: system message built from data_100.
SYSTEM_MSG3= (f"You are a knowledgeable assistant who answers questions based on the provided data, "
              f"If the user's question is out of scope for this dataset, you should only answer: Sorry, this question is out of scope."
              f"\n\nHere is the data:\n{data_100}")
user_question = "what is the Utto?"
# perf_counter() is the clock intended for measuring short intervals;
# time.time() follows the wall clock, which can be adjusted while the model runs.
start_time = time.perf_counter()
response = generate_response(user_question, SYSTEM_MSG3)
end_time = time.perf_counter()
elapsed_time = end_time - start_time
print(f"Response Time: {elapsed_time:.4f} Second")
print(f"question: {user_question}\nLlama response: {response}\n")

Response Time: 43.8850 Second
question: what is the Utto?
Llama response: Sorry, also known as the Republic of Serbia.



## Generalisation Test

In [14]:
# System message for the generalisation questions.
# Note: this rebinds SYSTEM_MSG3 (previously built from data_100) to a message
# built from data_10.
SYSTEM_MSG3 = (
    "You are a knowledgeable assistant who answers questions based on the provided data, "
    "If the user's question is out of scope for this dataset, you should only answer: Sorry, this question is out of scope."
    f"\n\nHere is the data:\n{data_10}"
)

# Ask each question against the same context and print the answer.
for user_question in (
    "what is the Gonzalez Spur?",
    "what is the Beyond the Sixth Seal?",
    "what is the Knockgraffon?",
    "who is Nicholas J. Sinnott?",
    "where is the Redfish Lake?",
):
    response = generate_response(user_question, SYSTEM_MSG3)
    print(f"question: {user_question}\nLlama response: {response}\n")

question: what is the Gonzalez Spur?
Llama response: According to the provided data, the Gonzalez Spur is a prominent rock spur located in the Olympus Range of the McMurdo Dry Valleys, Antarctica. It is 2.5 nautical miles (5 km) long and extends east-southeast from the 1,700-meter (5,600 ft) high Goldich Crest.

question: what is the Beyond the Sixth Seal?
Llama response: Beyond the Sixth Seal is a side-project band of Mike McKenzie and Greg Weeks of The Red Chord. They were formed as a thrash metal band in the suburbs of Boston in 1998 and released a few demos.

question: what is the Knockgraffon?
Llama response: According to the provided data, Knockgraffon is a townland in County Tipperary, Ireland. It is also the name of a civil parish and a Roman Catholic ecclesiastical parish.

question: who is Nicholas J. Sinnott?
Llama response: Nicholas John Sinnott (December 6, 1870 – July 20, 1929) was an American lawyer and politician who served as a United States representative from Oregon 