In [2]:
import os
import json
import string

from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, SpacyTextSplitter,\
TokenTextSplitter, CharacterTextSplitter
from transformers import GPT2TokenizerFast
from langchain.embeddings.openai import OpenAIEmbeddings
import streamlit as st
from bs4 import BeautifulSoup
from src.ScopusRetriever import ScopusRetriever
from typing import List

## Clean Court Cases

In [3]:
from striprtf.striprtf import rtf_to_text

# Folder holding the raw RTF court-case files and the folder that receives
# the extracted plain-text excerpt of each one.
data_dir = "./data/raw2"
save_dir = "./data/backgrounds2"

# The excerpt kept from each case starts right after START_MARKER and stops
# at the first END_MARKER that follows it. Both are matched case-insensitively.
START_MARKER = "core terms\n\n"
END_MARKER = "conclusion"

# Create the output folder up front so the first write cannot fail.
os.makedirs(save_dir, exist_ok=True)

for fname in sorted(os.listdir(data_dir)):
    stem, ext = os.path.splitext(fname)
    if ext.lower() != ".rtf":
        # Skip anything that is not an RTF file (sub-folders, OS metadata, ...).
        continue

    fpath = os.path.join(data_dir, fname)
    # read the rtf (read-only: the source file is never modified)
    with open(fpath, 'r', encoding='utf-8') as f:
        text = rtf_to_text(f.read())

    lowered = text.lower()

    # str.find returns -1 when the marker is absent; handle that explicitly
    # instead of letting it turn into a bogus slice bound.
    start_pos = lowered.find(START_MARKER)
    if start_pos == -1:
        print(f"  start marker not found in {fname}; keeping text from the beginning")
        start_idx = 0
    else:
        start_idx = start_pos + len(START_MARKER)

    # Only look for the end marker after the start so the slice is never
    # inverted by an earlier occurrence of the word.
    # NOTE(review): this matches the first "conclusion" anywhere after the
    # start, not necessarily a section heading -- confirm this is intended.
    end_idx = lowered.find(END_MARKER, start_idx)
    if end_idx == -1:
        print(f"  end marker not found in {fname}; keeping text to the end")
        end_idx = len(text)

    # save to txt
    save_path = os.path.join(save_dir, stem + ".txt")
    with open(save_path, 'w', encoding='utf-8') as f_write:
        f_write.write(text[start_idx:end_idx])

    print(f"Read and wrote {fname}")

Read and wrote A.K. v. A.K., 2020 Del. Fam. Ct. LEXIS 29.rtf
Read and wrote Acushnet Co. v. Amoco Oil Co., 1998 Mass. Super. LEXIS 325.rtf
Read and wrote Admiral Ins. Co. v. Fire-Dex, LLC, 2022 U.S. Dist. LEXIS 198034.rtf
Read and wrote All Is. Credit Corp. v Popular Brokerage Corp., 2020 N.Y. Misc. LEXIS 3589.rtf
Read and wrote All Island Credit Corp. v. Popular Brokerage Corp., Popular Brokerage Corp., J.J. Farber-Lottman Co.rtf
Read and wrote Altland v. Diehl, 2022 Pa. Super. Unpub. LEXIS 548.rtf
Read and wrote Am. Apparel & Footwear Ass'n v. Allen, 608 F. Supp. 3d 1005.rtf
Read and wrote American Petroleum Institute v. New Jersey Dept. of Environmental Protection, 230 N.J. Super. 563.rtf
Read and wrote Amplex Mfg. Co. v. A.B.C. Plastic Fabricators, Inc., 184 F. Supp. 285.rtf
Read and wrote Anderson v. Ga. Gulf Lake Charles, LLC, 2008 U.S. Dist. LEXIS 136412.rtf
Read and wrote Andrews v. P&G, 2019 U.S. Dist. LEXIS 211567.rtf
Read and wrote Anthony Ferreiro v. Elite Home Prods., 2020

In [4]:
import re
import string

# Folder that receives the cleaned files; created up front so the first
# write cannot fail.
cleaned_dir = './data/cleaned2'
os.makedirs(cleaned_dir, exist_ok=True)

# One bracketed annotation plus any whitespace that follows it. The inner
# quantifier is lazy so that each "[...]" is removed on its own; a greedy
# ".*" would delete everything between the first "[" and the last "]" on a
# line, including the real text in between.
bracket_pattern = re.compile(r"\[\s*.*?\s*\]\s*")

for fname in os.listdir(save_dir):
    fpath = os.path.join(save_dir, fname)
    # read the txt (read-only: the input file is not modified)
    with open(fpath, 'r', encoding='utf-8') as f:
        content = f.read()

    # keep only alphanumerics, spaces, newlines and ASCII punctuation
    alnum_s = ''.join(
        ch for ch in content
        if ch.isalnum() or ch == " " or ch == "\n" or ch in string.punctuation
    )
    # remove all square-bracketed annotations
    cleaned_s = bracket_pattern.sub("", alnum_s)

    # Drop everything up to and including the first occurrence of the word
    # "background" (case-insensitive) plus the one character that follows it.
    idx = cleaned_s.lower().find("background")
    if idx != -1:
        cleaned_s = cleaned_s[idx+len("background")+1:]

    # write the cleaned text under the same file name
    save_fpath = os.path.join(cleaned_dir, fname)
    with open(save_fpath, 'w', encoding='utf-8') as f:
        f.write(cleaned_s)

In [13]:
def load_docs(directory:str)-> List:
  """
  Loads the documents stored under [directory] through a DirectoryLoader.
  :@param directory: folder to read from. Must be an existing directory
  :@return: whatever DirectoryLoader(directory).load() produces
  :@raises AssertionError: when [directory] is not an existing directory
  """
  # refuse to go any further if the folder is missing
  dir_exists = os.path.isdir(directory)
  assert dir_exists, f"{directory} not found."

  # delegate the actual loading to langchain's DirectoryLoader
  return DirectoryLoader(directory).load()

# Load the cleaned corpus and report how many documents were found.
# NOTE(review): the cleaning cell writes to './data/cleaned2' while this cell
# reads './data/cleaned' -- confirm which folder is the intended input.
documents = load_docs('./data/cleaned')
num_documents = len(documents)
print(f"number of documents:{num_documents}")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\KAI\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


number of documents:39


In [14]:
class UnknownSplitterType(Exception):
  """Raised by split_docs when [splitter_type] is not a supported name."""

def split_docs(documents:List,chunk_size:int=500,chunk_overlap:int=100,
               splitter_type:str='RecurChar')->List:
  """
  Cuts [documents] into chunks using the splitter selected by
  [splitter_type]; [chunk_size] and [chunk_overlap] are forwarded unchanged
  to that splitter.
  :@param documents: documents to split
  :@param chunk_size: chunk size handed to the splitter
  :@param chunk_overlap: overlap between adjacent chunks handed to the splitter
  :@param splitter_type: one of 'RecurChar' (RecursiveCharacterTextSplitter),
  'Spacy' (SpacyTextSplitter), 'TikToken' (TokenTextSplitter),
  'GPT2TokenizerFast' (uses Hugging Face's GPT2TokenizerFast)
  :@return: the result of the splitter's split_documents([documents])
  :@raises UnknownSplitterType: for any other [splitter_type]
  """
  # both sizing options are passed identically to every splitter
  sizing = dict(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

  if splitter_type == 'RecurChar':
    text_splitter = RecursiveCharacterTextSplitter(**sizing)
    loaded_msg = "Loaded RecursiveChatacterTextSplitter"
  elif splitter_type == 'Spacy':
    text_splitter = SpacyTextSplitter(**sizing)
    loaded_msg = "Loaded SpacyTextSplitter"
  elif splitter_type == 'TikToken':
    text_splitter = TokenTextSplitter(**sizing)
    loaded_msg = "Loaded TikToken"
  elif splitter_type == 'GPT2TokenizerFast':
    # length is measured with the pretrained "gpt2" tokenizer
    gpt2_tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
    text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(
        gpt2_tokenizer, **sizing)
    loaded_msg = "Loaded GPT2TokenizerFast"
  else:
    raise UnknownSplitterType("needs to be one of {'RecurChar', 'Spacy','TikToken','GPT2TokenizerFast'}")

  # announce the splitter only once it has been built successfully
  print(loaded_msg)
  return text_splitter.split_documents(documents)


# Chunk the loaded documents with the spaCy-based splitter and report how
# many chunks were produced.
docs = split_docs(
    documents,
    chunk_size=200,
    chunk_overlap=100,
    splitter_type='Spacy',
)
num_chunks = len(docs)
print(f"Number of chunks: {num_chunks}")



Loaded SpacyTextSplitter




Number of chunks: 916


In [15]:
from langchain.embeddings import HuggingFaceEmbeddings, SentenceTransformerEmbeddings

class UnknownEmbeddingModelType(Exception):
  """Raised by get_embeddings when [embedding_model] is not a supported name."""

def get_embeddings(embedding_model:str, api_key:str="")->"OpenAIEmbeddings | SentenceTransformerEmbeddings":
  """
  Creates the embedder selected by [embedding_model].
  :@param embedding_model: model name, needs to be one of
    'OPENAI': uses OpenAIEmbeddings (requires [api_key])
    'sentence_transformers': uses SentenceTransformerEmbeddings with the
      "all-MiniLM-L6-v2" model ([api_key] is ignored)
  :@param api_key: API key passed to OpenAIEmbeddings; only used for 'OPENAI'
  :@return: the embeddings object that was constructed
  :@raises AssertionError: if 'OPENAI' is requested with an empty [api_key]
  :@raises UnknownEmbeddingModelType: for any other [embedding_model]
  """
  if embedding_model == 'OPENAI':
    assert api_key != "", "OPENAI API key must not be an empty string"
    embeddings = OpenAIEmbeddings(openai_api_key=api_key)
  elif embedding_model == 'sentence_transformers':
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
  else:
    # Use the exception dedicated to this function; the splitter exception
    # belongs to split_docs and is unrelated to embedding selection.
    raise UnknownEmbeddingModelType(
        "needs to be one of {'OPENAI', 'sentence_transformers'}")

  return embeddings

# Build the sentence-transformers embedder and embed a probe string to see
# the length of the vectors it returns.
embeddings = get_embeddings('sentence_transformers')
query_result = embeddings.embed_query("Test")
embedding_length = len(query_result)
print(f"embedding length: {embedding_length}")

2023-08-03 14:17:28.227 INFO    sentence_transformers.SentenceTransformer: Load pretrained SentenceTransformer: all-MiniLM-L6-v2


Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

2023-08-03 14:17:43.357 INFO    sentence_transformers.SentenceTransformer: Use pytorch device: cpu


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

embedding length: 384


In [17]:
import pinecone
from langchain.vectorstores import Pinecone

# Credentials are read from the environment so that no secret is stored in
# the notebook: PINECONE_API_KEY is required (a KeyError is raised when it is
# missing); PINECONE_ENVIRONMENT is optional and falls back to "us-west4-gcp".
# NOTE(review): a literal API key used to be committed here -- it should be
# treated as exposed and rotated.
pinecone.init(
    api_key=os.environ["PINECONE_API_KEY"],
    environment=os.environ.get("PINECONE_ENVIRONMENT", "us-west4-gcp"),
)

# Embed the chunks with the selected embedder and upload them to the
# "plastic-cases-index" index.
index = Pinecone.from_documents(docs, embeddings, index_name="plastic-cases-index")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]