In [None]:
import datetime
import hashlib
import json
import os
import pickle
import re
import time

from dotenv import load_dotenv
load_dotenv()  # kept ahead of the client-library imports, as in the original ordering

import numpy as np
import openai
import pinecone
from PyPDF2 import PdfReader
from tqdm.notebook import tqdm

# Credentials are read from environment variables; load_dotenv() has already
# been called in the import section. os.getenv returns None when a variable is unset.
openai_api_key = os.getenv("OPENAI_API_KEY")
pinecone_api_key = os.getenv("PINECONE_API_KEY")

# NOTE(review): embedding_dimensions and model_engine are not referenced by any
# later cell visible here; presumably 1536 is the vector size of this model — confirm.
embedding_dimensions = 1536
model_engine = "text-embedding-ada-002"
pinecone_name = "dnd-rules-lawyer"
pinecone_region = "asia-southeast1-gcp" # passed to pinecone.init() below as its "environment" argument
pinecone_namespace = "rules"

# The Pinecone client is initialised before the index handle is created;
# `index` is the handle the final cell upserts into.
pinecone.init(api_key=pinecone_api_key, environment=pinecone_region)
index = pinecone.Index(pinecone_name)
openai.api_key = openai_api_key

In [None]:
# Load the previously saved rules from disk.
# NOTE(review): pickle.load can execute arbitrary code; only open files you produced yourself.
with open('rules_list.pickle', 'rb') as pickle_file:
    rules_list = pickle.load(pickle_file)

In [None]:
# Drop exact duplicates, keeping the first occurrence of each rule and the
# original order. Membership is checked by equality against the rules kept so far.
unique_rules = []
for candidate in rules_list:
    if candidate in unique_rules:
        continue
    unique_rules.append(candidate)
rules_list = unique_rules

In [None]:
# Remove entries that carry an "Error"/"error" key, printing each one that is dropped.
# The survivors are collected into a new list instead of deleting from
# `rules_list` while looping over it: deleting during iteration shifts the
# remaining items and makes the loop skip the entry right after each removal.
kept_rules = []
for rule in rules_list:
    if "Error" in rule or "error" in rule:
        print(rule)
    else:
        kept_rules.append(rule)
# Slice assignment keeps the same list object, matching the original in-place edit.
rules_list[:] = kept_rules

In [None]:
# Number of rules remaining at this point (after the de-duplication and
# error-filtering cells, when the notebook is run top to bottom).
len(rules_list)

In [None]:
def clean_rule(rule):
    """Normalise the text of a rule.

    Two clean-ups are applied:

    * A period squeezed directly between two word characters (e.g. ``"end.Start"``)
      gets two spaces inserted after it (``"end.  Start"``).
    * If the text starts with ``'/'``, all ``'/'`` characters are removed.

    Args:
        rule: The rule text as a string.

    Returns:
        The cleaned string.
    """
    # Lookbehind/lookahead match only the period itself, so every occurrence is
    # rewritten with its own neighbours left intact, and chained cases such as
    # "a.b.c" are handled in one pass.
    rule = re.sub(r'(?<=\w)\.(?=\w)', '.  ', rule)

    if rule.startswith('/'):
        # NOTE(review): this strips every '/' in the text, not only the leading
        # ones (unchanged from the original) — confirm that is intended.
        rule = rule.replace('/', '')

    return rule

In [None]:
# Clean the text of every rule in place. Failures are reported and skipped
# (best effort) so one malformed entry does not stop the whole pass.
for rule_index, rule_meta in enumerate(rules_list):
    try:
        rule_meta['rule text'] = clean_rule(rule_meta['rule text'])
    except Exception as e:
        # Report what went wrong, not just where, so the entry can be fixed.
        print(f'Could not clean rule {rule_index}: {e!r}')
        print(rule_meta)

In [None]:
# Spot-check a single rule after cleaning.
# NOTE(review): index 14 looks arbitrary and assumes at least 15 rules remain — confirm.
rules_list[14]

In [None]:
# Scan for the rule with the longest 'rule text'. Ties keep the earliest rule;
# `longest_rule` stays None when no rule has non-empty text.
longest_rule = None
length_of_rule = 0
for candidate in rules_list:
    candidate_length = len(candidate['rule text'])
    if candidate_length <= length_of_rule:
        continue
    longest_rule, length_of_rule = candidate, candidate_length

In [None]:
# Display the rule with the longest 'rule text' found by the scan in the previous cell.
longest_rule

In [None]:
def get_openai_embeddings(content, engine="text-embedding-ada-002"):
    """Request an embedding for ``content`` and return the vector.

    Args:
        content: Text to embed. Characters that cannot be encoded as ASCII are
            dropped before the request is made.
        engine: Engine name forwarded to ``openai.Embedding.create``.

    Returns:
        The ``embedding`` entry of the first item in the response's ``data`` list.
    """
    # Non-ASCII characters are silently discarded here (errors="ignore").
    ascii_only = content.encode("ASCII", "ignore").decode()
    reply = openai.Embedding.create(input=ascii_only, engine=engine)
    return reply['data'][0]['embedding']

In [None]:
def craft_content_from_rule(rule):
    """Build the text block that gets embedded for a rule.

    The result has one line per item, in this order: ``TITLE``, one
    ``PARENT TITLE`` line per tree link that precedes the page reference, a
    ``CORE RULEBOOK PAGE`` line for the first tree link containing ``"pg."``,
    and finally ``RULE`` with the rule text. Links after the page reference are
    ignored. When no link contains ``"pg."``, every link is a ``PARENT TITLE``
    and no page line is written.

    Args:
        rule: Dict with ``'title'``, ``'rule text'`` and ``'tree_links'``; each
            tree link is a dict whose first key is the link label.

    Returns:
        The assembled multi-line string.
    """
    parent_titles = []
    page_link = None
    for link in rule['tree_links']:
        label = list(link)[0]
        if re.search(r'pg\.', label) is not None:
            # The first page reference ends the breadcrumb trail.
            page_link = label
            break
        parent_titles.append(label)

    lines = [f'TITLE: {rule["title"]}']
    lines.extend(f'PARENT TITLE: {title}' for title in parent_titles)
    # Only label a link as the page when it really is one; the last parent
    # title must not be reported as a rulebook page.
    if page_link is not None:
        lines.append(f'CORE RULEBOOK PAGE: {page_link}')
    lines.append(f'RULE: {rule["rule text"]}')
    return '\n'.join(lines)

In [None]:
# Embed every rule and store both the embedded text and its vector on the rule.
for rule in tqdm(rules_list):
    content = craft_content_from_rule(rule)
    # Skip rules that already hold a vector for exactly this content, so that
    # re-running the cell (e.g. after a failed request part-way through) does
    # not pay for the same API calls again.
    # NOTE(review): delete the stored 'vector' keys to force a full re-embed,
    # for instance after switching the embedding engine.
    if rule.get('content') == content and rule.get('vector') is not None:
        continue
    vector = get_openai_embeddings(content)
    rule['vector'] = vector
    rule['content'] = content

In [None]:
# Write the rules (now carrying 'vector' and 'content') back to the same
# pickle file they were loaded from, overwriting it.
with open('rules_list.pickle', 'wb') as pickle_file:
    pickle.dump(rules_list, pickle_file)

In [None]:
from typing import Iterator


class PineconeIterator:
    """Split a list of rules into consecutive batches for upserting.

    Args:
        rules_list: The sequence of rules to batch.
        batch_size: Maximum number of rules per batch (default 100).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """

    def __init__(self, rules_list, batch_size=100):
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")
        self.rules_list = rules_list
        self.batch_size = batch_size

    def splits_num(self, elements: int) -> int:
        """Return how many batches of at most ``batch_size`` cover ``elements`` items."""
        # Ceiling division: rounding to nearest would under-count (e.g. 140
        # items with a batch size of 100 would end up in a single batch).
        return -(-elements // self.batch_size)

    def get_iterator(self) -> Iterator[list]:
        """Yield the rules in order, ``batch_size`` at a time.

        The last batch may be shorter; nothing is yielded for an empty list.
        """
        for split in range(self.splits_num(len(self.rules_list))):
            start = split * self.batch_size
            yield self.rules_list[start:start + self.batch_size]
                
def generate_pinecone_rule_upsert(rule):
    """Build the record for one rule in the shape handed to ``index.upsert``.

    Args:
        rule: Dict with ``'rule text'``, ``'vector'``, ``'title'``,
            ``'rule_url'`` and ``'tree_links'``; each tree link is a dict whose
            first key is the link label.

    Returns:
        A dict with:

        * ``"id"``: SHA-256 hex digest of the rule text. Unlike the built-in
          ``hash()``, this is identical across interpreter runs, so upserting
          the same rule again targets the same id.
        * ``"values"``: the rule's vector.
        * ``"metadata"``: ``title`` and ``url``, plus ``book`` and ``page``
          taken from the first tree link of the form ``"<book> pg. <page>"``
          (surrounding whitespace stripped) when such a link exists.
    """
    metadata = {}
    for link in rule['tree_links']:
        label = list(link)[0]
        parts = re.split(r'pg\.', label)
        # Exactly one "pg." separator means the label splits cleanly into book and page.
        if len(parts) == 2:
            metadata['book'] = parts[0].strip()
            metadata['page'] = parts[1].strip()
            # Stop at the first page reference, the same link
            # craft_content_from_rule reports as the rulebook page.
            break
    metadata['title'] = rule['title']
    metadata['url'] = rule['rule_url']

    return {
        "id": hashlib.sha256(rule['rule text'].encode('utf-8')).hexdigest(),
        "values": rule['vector'],
        "metadata": metadata,
    }

In [None]:
# Number of rules at this point, i.e. how many records the upsert cell below will send.
len(rules_list)

In [None]:
# Send the rules to the Pinecone index one batch at a time, all under the
# configured namespace.
p = PineconeIterator(rules_list)
for batch in tqdm(p.get_iterator()):
    rules_upsert = [generate_pinecone_rule_upsert(entry) for entry in batch]
    index.upsert(vectors=rules_upsert, namespace=pinecone_namespace)