In [1]:
from langchain.schema import Document
import json


def load_docs_from_jsonl(file_path):
    """Load a JSON Lines file into a list of ``Document`` objects.

    Every non-blank line is parsed as one JSON value and unpacked as keyword
    arguments into ``Document``.

    Args:
        file_path: Path of the ``.jsonl`` file to read.

    Returns:
        list: One ``Document`` per non-blank line, in file order. An empty
        (or blank-only) file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    documents = []
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default text encoding.
    with open(file_path, 'r', encoding='utf-8') as jsonl_file:
        for line in jsonl_file:
            # Blank lines (e.g. extra trailing newlines) carry no record;
            # json.loads would raise on them, so skip them.
            if not line.strip():
                continue
            record = json.loads(line)
            documents.append(Document(**record))
    return documents


# Read the whole JSONL file into memory as a list of Document objects.
docs = load_docs_from_jsonl('data_finish.jsonl')
# Last expression of the cell: shows how many documents were loaded.
len(docs)

194725

In [2]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter configured with a chunk size of 4000 and an overlap of 200
# between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=4000,
)

# Break every loaded document into chunks.
split_docs = text_splitter.split_documents(docs)

# Last expression of the cell: shows how many chunks were produced.
len(split_docs)

305083

In [None]:
# Display the first chunk as a quick sanity check of the split output.
split_docs[0]

In [51]:
from collections.abc import Iterable
import ollama

def _is_qa_list(candidate):
    """Return True when ``candidate`` is a list of {'question', 'answer'} dicts."""
    if not isinstance(candidate, list):
        return False
    return all(
        isinstance(entry, dict) and set(entry.keys()) == {'answer', 'question'}
        for entry in candidate
    )


def create_qa(content):
    """Ask the local LLM to turn one document chunk into question/answer pairs.

    The chunk text (prefixed with the article title when the chunk's metadata
    has a ``"Title"`` key) is sent to ``ollama.chat`` in JSON mode. The reply
    is expected to be ``{"result": [{"question": ..., "answer": ...}, ...]}``.

    Args:
        content: Object exposing ``metadata`` (a mapping) and ``page_content``
            (a string), e.g. one element of ``split_docs``.

    Returns:
        list: The list of ``{"question", "answer"}`` dicts found under
        ``"result"``. An empty list is returned (and a diagnostic printed)
        when the reply is not valid JSON or does not have the expected shape,
        so callers can always ``extend`` with the result.

    Raises:
        Whatever ``ollama.chat`` raises; the model call itself is not guarded.
    """
    system_prompt = (
        'You are an API that converts bodies of text into a list of 1 to 5 different question and answer about technical info into a JSON format. Each item in this array '
        # Trailing space added: the pieces used to be glued together as "references.Questions".
        'contains a single question with a single answer. Only respond with the JSON and no additional text. Do not generate questions about specific info like names, institutions or references. '
        'Questions should include all context needed to understand. Do not create questions like this: "What is the purpose of this article/<article name>?" '
        'Answers must have as much possible of context, do not answer with just one phrase. '
        'Format exemple: {"result": [{"question": "a list of strings", "answer": "a list of strings"}, {"question": "a list of strings", "answer": "a list of strings"}, {"question": "a list of strings", "answer": "a list of strings"} ]}'
    )

    if "Title" in content.metadata:
        title_line = f'This is a part of article named: {content.metadata["Title"]}\n'
    else:
        title_line = ""
    user_prompt = title_line + content.page_content + '\n\nRespond only with valid JSON'

    response = ollama.chat(model='llama3.1:8b-instruct-q4_0', format='json', messages=[
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': user_prompt},
    ])
    raw_reply = response['message']['content']

    # Step 1: the reply must be parseable JSON at all.
    try:
        parsed = json.loads(raw_reply)
    except (TypeError, ValueError) as err:  # JSONDecodeError is a ValueError
        print('error while loading json: "', str(err), '" response: ', raw_reply)
        return []

    # Step 2: it must be an object whose "result" is a list of QA dicts.
    # Checked explicitly instead of relying on an AttributeError from
    # calling .get()/.keys() on the wrong type.
    created_qa = parsed.get('result') if isinstance(parsed, dict) else parsed
    if not isinstance(parsed, dict) or not _is_qa_list(created_qa):
        print('invalid format generated: "', created_qa, '"')
        return []

    return created_qa

# Collect the question/answer pairs produced for the first 100 chunks.
# Kept as an explicit loop so that pairs gathered so far remain in
# `generated_qas` if the run is interrupted part-way.
generated_qas = []
for chunk in split_docs[:100]:
    generated_qas += create_qa(chunk)

# Reshape each pair into an instruction / input / output record.
transformed_data = [
    dict(instruction=qa['question'], input='', output=qa['answer'])
    for qa in generated_qas
]

# Serialise first, then write the text out in one go.
serialized_records = json.dumps(transformed_data, indent=4)
with open('protein_articles.json', 'w') as out_file:
    out_file.write(serialized_records)

In [2]:
import json

# Parse the dataset file and show how many top-level entries it contains.
# NOTE(review): the generation cell writes 'protein_articles.json', while this
# cell reads 'dataset_protein_articles.json' — confirm which file is intended.
with open('dataset_protein_articles.json', 'r') as dataset_file:
    json_dataset = json.load(dataset_file)
len(json_dataset)

39840