In [1]:
# The recursive character splitter used below tries to keep paragraphs (then sentences, then words)
# together for as long as possible, since those tend to be the most strongly semantically related pieces of text.

In [37]:
import json
import os

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)

In [50]:
# Define chunk size, overlap and separators.
# Separators are listed from coarsest to finest: paragraph break, line break,
# end of sentence, space, and finally individual characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=10,
    # r'(?<=\. )' is a look-behind that matches right after ". ", so the full
    # stop stays attached to the sentence it ends.
    separators=['\n\n', '\n', r'(?<=\. )', ' ', ''],
    # Required for the regex above: when this flag is off the splitter escapes
    # every separator and matches it literally, so the pattern never fires.
    is_separator_regex=True,
)

In [51]:
# Create the open-source embedding function.
# NOTE(review): none of the visible cells below read `embedding_function`;
# process_json_file obtains its model through load_embedding_model instead —
# confirm whether this cell is still needed.
embedding_function = SentenceTransformerEmbeddings(model_name="allenai-specter") # this is specialised for scientific articles

In [63]:
# Read JSON files from a folder
folder_path = 'dataset/json'
# Store the JSON file paths in a list. The list is sorted because os.listdir
# returns entries in arbitrary order, and the order of this list is the order
# in which files are later indexed.
json_files = sorted(
    os.path.join(folder_path, file)
    for file in os.listdir(folder_path)
    if file.endswith('.json')
)

In [78]:
# Module-level handle to the FAISS index. It starts as None and is replaced
# by process_json_file (which declares it `global`) once an index is built.
vectorstore = None

In [None]:
# Function for loading the embedding model
def load_embedding_model(model_path, normalize_embedding=True, device='cpu'):
    """Build a ``HuggingFaceEmbeddings`` wrapper for the given model.

    Args:
        model_path: Model name or path, passed through as ``model_name``.
        normalize_embedding: Forwarded as ``normalize_embeddings`` in
            ``encode_kwargs``; keep True to compute cosine similarity.
        device: Device string placed in ``model_kwargs``. Defaults to
            ``'cpu'``, which was previously the hard-coded value.

    Returns:
        The configured ``HuggingFaceEmbeddings`` instance.
    """
    return HuggingFaceEmbeddings(
        model_name=model_path,
        model_kwargs={'device': device},
        encode_kwargs={'normalize_embeddings': normalize_embedding},
    )

In [None]:
# Function for creating embeddings using FAISS
def create_embeddings(chunks, embedding_model, storing_path="vectorstore"):
    """Embed ``chunks`` into a new FAISS index and save it to disk.

    Args:
        chunks: Texts handed to ``FAISS.from_texts``.
        embedding_model: Embedding object handed to ``FAISS.from_texts``.
        storing_path: Folder passed to ``save_local``; defaults to
            "vectorstore".

    Returns:
        The FAISS vector store that was just built and saved.
    """
    faiss_index = FAISS.from_texts(chunks, embedding_model)
    faiss_index.save_local(storing_path)  # persist before handing the store back
    return faiss_index

In [72]:
def process_json_file(json_file):
    """Chunk one article's abstract and add it to the global FAISS ``vectorstore``.

    Reads ``json_file``, splits its ``Abstract`` with the notebook-level
    ``text_splitter`` and stores every chunk together with the article's
    bibliographic fields as metadata. The first call that yields chunks
    creates the index; later calls append to it. The index is written to the
    "vectorstore" folder after every file that contributes chunks.

    Args:
        json_file: Path to a JSON file. The keys read are ``Title``,
            ``Authors``, ``Abstract``, ``Keywords`` and ``Pub Date``; all are
            optional. ``Authors`` and ``Keywords`` are joined with ", "
            (assumed to be lists of strings — TODO confirm against the dataset).

    Returns:
        None. Files whose abstract produces no chunks are skipped.
    """
    global vectorstore

    # JSON is UTF-8 by specification, so do not rely on the platform default.
    # The file is closed again before the (slow) embedding work starts.
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # `or` also covers keys that are present but null.
    abstract = data.get('Abstract') or ''
    chunks = text_splitter.split_text(abstract)
    if not chunks:
        # An index cannot be built from zero texts, and there is nothing to add.
        return

    metadata = {
        'source': json_file,
        'Title': data.get('Title') or '',
        'Authors': ', '.join(data.get('Authors') or []),
        'Keywords': ', '.join(data.get('Keywords') or []),
        'Pub Date': data.get('Pub Date') or '',
    }
    # One dict per chunk so the stored entries never alias each other.
    metadatas = [dict(metadata) for _ in chunks]

    if vectorstore is None:
        # The model is only needed here: the store keeps the embedding function
        # it was created with and reuses it for every later add_texts call.
        embedding_model = load_embedding_model(model_path='allenai-specter')  # specialised for scientific research
        # Built directly with FAISS.from_texts because create_embeddings has no
        # way to attach metadata to the initial chunks.
        vectorstore = FAISS.from_texts(chunks, embedding_model, metadatas=metadatas)
    else:
        # add_texts embeds with the store's own embedding function.
        vectorstore.add_texts(chunks, metadatas=metadatas)

    # Save after every file, as before, so progress is on disk if the loop
    # over files is interrupted.
    vectorstore.save_local("vectorstore")

In [73]:
# json_files is the list of JSON paths collected earlier in the notebook.
# Each call updates the global vectorstore and writes it to disk as it goes.
for json_file in json_files:
    process_json_file(json_file)