In [2]:
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

In [3]:
from langchain.document_loaders import JSONLoader

### Data Processing and Saving

In [4]:
import json
import os

def combine_json_files(folder_path):
    """Load every ``*.json`` file directly inside a folder.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        list: One parsed JSON value per file, ordered by filename. Empty if
        the folder contains no ``.json`` files.

    Raises:
        FileNotFoundError: If ``folder_path`` does not exist.
        json.JSONDecodeError: If a file does not contain valid JSON.
    """
    combined_data = []

    # os.listdir() returns entries in arbitrary order; sort so the combined
    # list is reproducible across runs and machines.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.json'):
            continue

        file_path = os.path.join(folder_path, filename)
        # A directory may also be named "*.json"; only regular files are read.
        if not os.path.isfile(file_path):
            continue

        # JSON text is UTF-8; do not depend on the platform default encoding.
        with open(file_path, 'r', encoding='utf-8') as file:
            combined_data.append(json.load(file))

    return combined_data

# # Specify the path to your folder containing JSON files
# folder_path = 'data'

# # Combine JSON files
# combined_json_data = combine_json_files(folder_path)

In [15]:
def read_json_file(folder_path):
    """Parse a single JSON file and return its content wrapped in a list.

    Args:
        folder_path: Path to the JSON *file* to read. The parameter name is
            kept for backward compatibility even though it is not a folder.

    Returns:
        list: A one-element list whose only item is the parsed JSON value.

    Raises:
        FileNotFoundError: If the file does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON text is UTF-8; do not depend on the platform default encoding.
    with open(folder_path, 'r', encoding='utf-8') as file:
        return [json.load(file)]

# NOTE: despite its name, `folder_path` holds the path to a single JSON file.
folder_path = 'processed_data/data.json'
# read_json_file returns the parsed file content wrapped in a one-element list.
json_data = read_json_file(folder_path)

In [16]:
def flatten(l):
    """Remove one level of nesting: concatenate the items of each element of ``l``."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat
# Strip the outer one-element list that read_json_file adds, then preview
# the first element.
data_flatten=flatten(json_data)
data_flatten[0]

{'url': 'URL: http://acd.iupui.edu/majors-by-name/general-studies/index.html#search',
 'body': "  ## General studies   ### School of Liberal Arts  Indiana University offers a bachelor's degree in general studies. The program was designed for adult students who want a degree program that combines high academic standards with convenience.   #### Degree map  One of the most difficult tasks of being a college student is figuring out what classes to take and when to take them. In order to make the process a little easier, each major at IUPUI has its own degree map outlining the path you should take to graduate as quickly as possible.  View degree map  [View degree map](https://sisjee.iu.edu/sisigps-prd/web/igps/dm/public/)   #### Major-career connections  General studies   #### Get admitted to the School of Liberal Arts   ##### Admission by certification  General studies—application link for current IUPUI\xa0students  [General studies—application link for current IUPUI\xa0students](https://

In [18]:
def save_cleaned_data(data, output_file):
    """Write ``data`` to ``output_file`` as JSON with a 4-space indent.

    Any existing file at ``output_file`` is overwritten.
    """
    with open(output_file, mode='w') as sink:
        json.dump(
            data,
            sink,
            indent=4,
        )

# Destination for the flattened records.
output_file = 'document_data.json'

In [19]:
# Persist the flattened records to `output_file` as JSON.
save_cleaned_data(data_flatten,output_file)

### Document Creation

In [21]:
def metadata_func(record: dict, metadata: dict) -> dict:
  """Copy the record's 'url' and 'context' values into ``metadata``.

  A key missing from ``record`` is stored as None. The same ``metadata``
  dict that was passed in is mutated and returned.
  """
  for field in ('url', 'context'):
    metadata[field] = record.get(field)
  return metadata
  

In [22]:
# Build a loader over the JSON file at `output_file`. The jq expression '.[]'
# selects each element of the top-level array, the 'body' field is used as the
# document content, and metadata_func adds 'url' / 'context' to the metadata.
loader = JSONLoader(
    file_path=output_file,
    jq_schema='.[]',
    content_key='body',
    metadata_func=metadata_func,
    # NOTE(review): presumably allows non-string 'body' values — confirm
    # against the JSONLoader documentation.
    text_content=False
)

In [23]:
# Run the loader and preview the first resulting document.
documents = loader.load()
documents[0]

Document(page_content="  ## General studies   ### School of Liberal Arts  Indiana University offers a bachelor's degree in general studies. The program was designed for adult students who want a degree program that combines high academic standards with convenience.   #### Degree map  One of the most difficult tasks of being a college student is figuring out what classes to take and when to take them. In order to make the process a little easier, each major at IUPUI has its own degree map outlining the path you should take to graduate as quickly as possible.  View degree map  [View degree map](https://sisjee.iu.edu/sisigps-prd/web/igps/dm/public/)   #### Major-career connections  General studies   #### Get admitted to the School of Liberal Arts   ##### Admission by certification  General studies—application link for current IUPUI\xa0students  [General studies—application link for current IUPUI\xa0students](https://liberalarts.iupui.edu/genstudies/pages/admissions-folder/index.php)   ###

#### Splitting the docs

In [24]:
# Split every document into overlapping chunks before embedding.
# chunk_size / chunk_overlap are presumably measured in characters (the
# splitter's default length function) — confirm.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
texts = text_splitter.split_documents(documents)

In [25]:
# Number of chunks produced by the splitter.
len(texts)

143368

### Embedding Creation


In [26]:
#token=''

In [27]:
import os
from getpass import getpass
from langchain.embeddings.openai import OpenAIEmbeddings

model_name = "text-embedding-ada-002"

# Get an OpenAI API key from platform.openai.com.
# The key is read from the OPENAI_API_KEY environment variable; when that is
# unset or empty, it is requested interactively without echoing the input.
# This avoids hardcoding the secret in the notebook and avoids depending on a
# kernel-global `token` variable that does not exist after Restart & Run All.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

OAIembeddings = OpenAIEmbeddings(
    model=model_name, openai_api_key=OPENAI_API_KEY, disallowed_special=()
)

### Vector Store

In [28]:
# Take only the first 5 chunks (presumably to keep the embedding request
# small while trying out the vector store — confirm).
sample = texts[:5]

In [None]:
# Embed the sample chunks with OAIembeddings and store them in Chroma.
# Each id is "<url metadata>-<position within `sample`>"; the position suffix
# keeps ids distinct when several chunks share the same url. Positions are
# relative to `sample`, so a different slice would reuse the same suffixes.
vector_store = Chroma.from_documents(
    sample, 
    OAIembeddings, 
    ids=[f"{item.metadata['url']}-{index}" for index, item in enumerate(sample)], 
    persist_directory="iupui_store_1"
    )