In [None]:
# llama-2 summarization for JSON input

In [1]:
pip install openai==0.28

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.3.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip




In [2]:
import openai

In [10]:
def chat(Messages, model="ollama/llama2:70b"):
    """Send a conversation to the chat-completion endpoint and return the reply text.

    Args:
        Messages: List of ``{"role": ..., "content": ...}`` dicts; passed
            through unchanged as the ``messages`` argument.
        model: Model identifier forwarded to the endpoint. Defaults to the
            id this notebook was written against, so existing
            ``chat(messages)`` calls behave exactly as before.

    Returns:
        The ``content`` of the first choice in the response.
    """
    # NOTE(review): the "ollama/..." id suggests requests go through an
    # OpenAI-compatible proxy; no api_base / api_key is configured in the
    # visible cells -- confirm where that setup happens.
    response = openai.ChatCompletion.create(model=model, messages=Messages)
    return response.choices[0].message.content

In [11]:
import tiktoken 
from transformers import AutoTokenizer

# Tokenizers already loaded by num_tokens_from_string, keyed by model name.
# Loading a tokenizer is expensive and the counter is called once per node and
# once per edge, so each tokenizer is loaded a single time and then reused.
_TOKENIZER_CACHE = {}


# Returns the number of tokens from the input
def num_tokens_from_string(input_string: str,
                           model_name: str = "meta-llama/Llama-2-70b-chat-hf") -> int:
    """Count the tokens the given tokenizer produces for ``input_string``.

    Args:
        input_string: Text to tokenize.
        model_name: Name passed to ``AutoTokenizer.from_pretrained``. Defaults
            to the Llama 2 chat tokenizer this notebook has always used.

    Returns:
        ``len(tokenizer.encode(input_string))``, i.e. the length of the encoded
        id list exactly as the tokenizer returns it.
    """
    tokenizer = _TOKENIZER_CACHE.get(model_name)
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        _TOKENIZER_CACHE[model_name] = tokenizer

    return len(tokenizer.encode(input_string))

In [12]:
def make_chunk(start_node, nodes, edges, max_tokens, summary_obtained):
    """Build one chunk of the graph, starting at ``start_node``.

    Nodes are taken in list order from the first node equal to ``start_node``.
    After each accepted node, the first edge whose ``"source"`` equals the
    node's ``"id"`` (if any) is added to the chunk as well.

    Args:
        start_node: Node at which this chunk begins (matched with ``==``).
        nodes: Full list of node dicts; each must have an ``"id"`` key.
        edges: Full list of edge dicts; each must have a ``"source"`` key.
        max_tokens: Token budget checked before each node is added.
        summary_obtained: Unused; kept so existing callers keep working.

    Returns:
        ``(chunk, next_start)`` where ``chunk`` is
        ``{"nodes": [...], "edges": [...]}`` and ``next_start`` is the first
        node that did not fit, or ``None`` once the node list is exhausted.

    Notes:
        * Edge tokens are added to the running total without a limit check, so
          a chunk may overshoot ``max_tokens`` by the size of its last edge.
        * A node that exceeds ``max_tokens`` on its own is emitted as a
          single-node chunk. Rejecting it would return an empty chunk with
          ``next_start == start_node``, and a caller that loops until
          ``next_start`` is ``None`` would never terminate.
    """
    chunk = {"nodes": [], "edges": []}
    total_tokens = 0
    start_processing = False

    for node in nodes:
        if node == start_node:
            start_processing = True
        if not start_processing:
            continue

        node_tokens = num_tokens_from_string(str(node))
        fits = total_tokens + node_tokens <= max_tokens

        if not fits and chunk["nodes"]:
            # The chunk already has content: close it and resume from this node.
            print('limit breached')
            return chunk, node

        if not fits:
            # First node of the chunk is over budget by itself; accept it anyway
            # so the caller always makes progress.
            print('node exceeds the token limit on its own; emitting it as a single-node chunk')

        chunk["nodes"].append(node)
        total_tokens += node_tokens

        # Only the first edge leaving this node is attached; stop scanning at the first match.
        edge_value = next((edge for edge in edges if edge["source"] == node["id"]), None)
        if edge_value is not None:
            chunk["edges"].append(edge_value)
            total_tokens += num_tokens_from_string(str(edge_value))

    return chunk, None

In [13]:
def chunk_input(input, max_tokens):
    """Split an argument graph into token-bounded chunks and dump each to disk.

    Args:
        input: Dict with ``"nodes"`` and ``"edges"`` lists.
        max_tokens: Token budget forwarded to ``make_chunk`` for every chunk.

    Returns:
        List of chunk dicts in order. An empty node list yields ``[]``.

    Raises:
        ValueError: If ``make_chunk`` returns an empty chunk together with a
            node to resume from. The loop could then never advance, which
            happens when a single node cannot fit into ``max_tokens``.

    Side effects:
        Writes each chunk to ``chunk_<i>_filename.json`` (1-based) in the
        current working directory, overwriting files from earlier calls.
    """
    nodes = input["nodes"]
    edges = input["edges"]
    chunks = []

    # An empty graph has no first node to start from: nothing to chunk or dump.
    if not nodes:
        return chunks

    summary_obtained = ""  # placeholder argument expected by make_chunk

    # start node is the first node
    start_node = nodes[0]

    while start_node is not None:
        chunk, next_start_node = make_chunk(start_node, nodes, edges, max_tokens, summary_obtained)

        # No nodes consumed but more work requested: the next call would be
        # identical, so fail loudly instead of looping forever.
        if next_start_node is not None and not chunk["nodes"]:
            raise ValueError(
                f"make_chunk made no progress with max_tokens={max_tokens}; "
                "a single node does not fit into the token budget"
            )

        chunks.append(chunk)
        start_node = next_start_node

    for i, chunk in enumerate(chunks, start=1):
        chunk_filename = f'chunk_{i}_filename.json'
        with open(chunk_filename, 'w') as file:
            json.dump(chunk, file, indent=2)
    return chunks

In [14]:
def create_summary(chunks=None, chat_model="gpt-4"):
    """Summarise graph chunks one after another, carrying the summary forward.

    For every chunk the previous summary (empty for the first chunk) is
    concatenated directly with ``str(chunk)`` and sent to ``chat`` together
    with a fixed system prompt; the reply becomes the new running summary.

    Args:
        chunks: Iterable of chunk objects; ``None`` is treated as no chunks.
        chat_model: Currently unused -- the model is chosen inside ``chat``.
            Kept so existing callers keep working.

    Returns:
        The summary returned for the last chunk, or ``""`` when there are no
        chunks (previously this raised because ``summary`` was never bound).
    """
    system_prompt = "As a graph summarizer, summarize the debate's key points in 90 words from a JSON based argument graph. The JSON includes 'nodes' with node id and text, and 'edges' connect source and target nodes as 'Pro' or 'Con.'. Concentrate solely on the debate and summarize without mentioning about the graph structure, node id, or edges. Don't exceed the 90-word limit or add graph related information in the summary otherwise, you will be punished"

    if chunks is None:
        chunks = []

    summary = ""
    for chunk in chunks:
        # Combine previous summary and current chunk for summarization
        summary_chunk = summary + str(chunk)

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": summary_chunk},
        ]

        # The reply replaces the running summary used for the next chunk.
        summary = chat(messages)

    return summary

In [15]:
import os
import json

# function to process json input and create llama2 summaries
def create_llama2_summaries(input_folder, output_folder, max_tokens=3500):
    """Summarise every ``.json`` graph in ``input_folder`` into a ``.txt`` file.

    Each JSON file is loaded, chunked with ``chunk_input`` and summarised with
    ``create_summary``; the summary is written to
    ``<output_folder>/<input name without extension>.txt``.

    Args:
        input_folder: Directory scanned (non-recursively) for ``.json`` files.
            Entries with another extension are printed and skipped.
        output_folder: Directory for the summaries; created if missing.
        max_tokens: Chunk budget passed to ``chunk_input``. Defaults to 3500,
            the value previously hard-coded here.
    """
    # List the inputs first so a freshly created output folder never shows up
    # in the listing and a missing input folder fails before anything is created.
    filenames = os.listdir(input_folder)
    os.makedirs(output_folder, exist_ok=True)

    # Iterate through files in the input folder
    for filename in filenames:
        print(filename)
        if not filename.endswith(".json"):
            continue

        input_file_path = os.path.join(input_folder, filename)

        # read the content of the file
        with open(input_file_path, 'r', encoding='utf-8') as file:
            file_content = json.load(file)

        chunks = chunk_input(file_content, max_tokens)
        final_summary = create_summary(chunks=chunks)

        # write the final summary to a text file in the output folder
        output_filename = os.path.splitext(filename)[0] + '.txt'
        output_file_path = os.path.join(output_folder, output_filename)
        with open(output_file_path, 'w', encoding="utf8") as output_file:
            output_file.write(final_summary)

In [16]:
# Driver cell: choose the input/output folders and run the summarisation.
# The paths are absolute and machine-specific; adjust them before running elsewhere.

# Earlier "sample-test" folders, kept for reference:
#input_folder_path = "C:/Users/Nila/Documents/Project/benchmark/data/kialo-nilesc/sample-test/input"
#output_folder_path = "C:/Users/Nila/Documents/Project/benchmark/data/kialo-nilesc/sample-test/output"

## json resend folder

input_folder_path = "C:/Users/Nila/Documents/Main Scripts_25.12.2023/Jupyter Notebooks/complete testing/llm_output_updated/"
# Alternative input folder (presumably inputs missed by an earlier run -- confirm):
#input_folder_path = "C:/Users/Nila/Documents/Main Scripts_25.12.2023/Jupyter Notebooks/missed_input_json1"
output_folder_path = "C:/Users/Nila/Documents/Main Scripts_25.12.2023/Jupyter Notebooks/complete testing/llm_output_updated/LLAMA2/json_resend"

# Summarises each .json file in the input folder and writes one .txt per file to the output folder.
create_llama2_summaries(input_folder_path, output_folder_path)

are-constructed-languages-useful-and-do-we-need-more-of-them-16548.json


KeyboardInterrupt: 

In [71]:
# The context window of "llama-2" is 4096 tokens. At each step the preceding summary adds about 100 tokens
# to the input, and the resulting summary takes up about 100 tokens. The prompt itself uses 44 tokens, and
# some tokens are reserved for internal processing. A chunk size of about 3600 tokens is therefore suitable;
# the pipeline above uses 3500, which leaves a little extra margin.
