In [1]:
import pandas as pd
from pprint import pprint 
import json
import openai
import re
import os
import pinecone
import time


import nltk
from nltk.tokenize import word_tokenize


import langchain
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import Pinecone
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import SystemMessage, HumanMessage, AIMessage


from dotenv import load_dotenv, find_dotenv

  from tqdm.autonotebook import tqdm


In [2]:
load_dotenv(find_dotenv())

True

In [3]:
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME")
PINECONE_ENVIRONMENT = os.getenv("PINECONE_ENVIRONMENT")

In [4]:
file_path = "./Data/jira-conversations2.json"

with open(file_path, "r", encoding='utf-8') as f:
    data = json.load(f)

In [5]:
json_data = data[0:9]

In [6]:
def clean_text(text):
    text = ' '.join(text.split())  # Remove extra whitespaces and newlines
    
    # Remove different kinds of unwanted patterns from text
    # Remove unwanted patterns
    text = re.sub(r'~+\+~+', '', text)  # Remove ~+~+
    text = re.sub(r'\+\~', '', text)  # Remove +~
    text = re.sub(r'----', '', text)  # Remove ----
    text = re.sub(r'\+\+', '', text)    # Remove ++
    

    text = re.sub(r'~accountid:[a-zA-Z0-9]+', '', text)
    text = re.sub(r'\{"type".*?\}\]', '', text)
    text = re.sub(r'\{adf\}.*?\{adf\}', '', text)  # Remove "{adf} ... {adf}" and content within
    text = re.sub(r'\[~~email~~-?\d+~~\]', '', text)  # Remove `[~~email~~-numbers~~]`
    text = re.sub(r'http[s]?://\S+', '', text)  # Remove URLs
    text = re.sub(r'<\[\[~~email~~-?\d+~~\]\]', '', text)  # Remove `<[[~~email~~-numbers~~]]>`
    text = re.sub(r'\|mailto:\[~~email~~-?\d+~~\]', '', text)  # Remove email tags
    text = re.sub(r'—-—-—-—', '', text)  # Remove "—-—-—-—"
    text = re.sub(r'\w{3}, \d{1,2} \w{3} \d{4}, \d{1,2}:\d{2} [apmAPM]{2}', '', text)  # Remove timestamps
    text = re.sub(r'\|\s+\|\s+You don\'t often get email from', '', text)  # Remove headers
    text = re.sub(r'\[Powered by Service Management.*?\]', '', text)  # Remove "[Powered by...]"
    text = re.sub(r'\[View request.*?&reserved=0\]', '', text)  # Remove "[View request...]"
    text = re.sub(r'\*\*\*Please reply above this line\*\*\*', '', text)  # Remove reply line
    text = re.sub(r'\|', '', text)  # Remove "|"
    text = re.sub(r'_', '', text)  # Remove "_"
    text = re.sub(r'\[mailto:\]', '', text)  # Remove "[mailto:]"
    text = re.sub(r'\[|\]', '', text)  # Remove "[" and "]"
    text = re.sub(r'<|>', '', text)  # Remove "<" and ">"
    text = re.sub(r'\*', '', text)  # Remove "*"
    text = re.sub(r'!jira[-a-zA-Z0-9 ()]+!', '', text)  # Remove Jira text
    
    return text

# Clean text in conversations
for conv in json_data:
    for key, value in conv.items():
        if isinstance(value, str):
            try:
                conv[key] = clean_text(value)
            except Exception as e:
                print(f"Error cleaning text in '{key}': {e}")
        else:
            print(f"Skipped cleaning text for key '{key}' as it's not a string.")

In [7]:
texts = [', '.join(f"'{k}': '{v}'" for k, v in item.items()) for item in json_data]

In [8]:
texts

["'question000001': 'Hii This is  gadipally UID-U6331114. In process of submitting DS160. I need to send address and phone number of point of contact(school official).So can you please send address and phone number of IRIS BRITO(School official to contact uon arrival).Can i also know the first and last name of IRIS BRITO. Thank you. ', 'response000002': 'Thank you for reaching out. You are able to add  at . Best, '",
 "'question000001': 'Hello, I am  with ID - U20309912. I have not received any mail regarding academic integrity course . Can you please send the link for academic integrity course and details regarding it?! Thanks in advance. ', 'response000002': 'Hello, provide me with your USF email.', 'question000003': 'Hello, USF mail Id is  Thanks in advance   ', 'response000004': 'Canvas invite has been sent via email.', 'question000005': 'Hello , I didn’t get any canvas invite to my mail. Can you please send it again?! Thanks in advance   ', 'response000006': 'Invite has been sent 

In [9]:
class Document:
    def __init__(self, page_content, metadata):
        self.page_content = page_content
        self.metadata = metadata

    def __repr__(self):
        return f"Document(page_content='{self.page_content}', metadata={self.metadata})"


# Define a simple text splitting function
def split_text(text, chunk_size=1000):
    chunks = []
    for i in range(0, len(text), chunk_size):
        chunk = text[i:i+chunk_size]
        chunks.append(chunk)
    return chunks    


# Instantiate RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

# Assuming the 'texts' list from previous code
documents_list = [Document(page_content=text, metadata={'text': text}) for text in texts]

In [10]:
pinecone.init(
    api_key=os.getenv('PINECONE_API_KEY'),  
    environment=os.getenv('PINECONE_ENV')  
)

In [11]:
if PINECONE_INDEX_NAME not in pinecone.list_indexes():
    pinecone.create_index(
        PINECONE_INDEX_NAME,
        dimension=1536,
        metric='cosine'
    )
    # wait for index to finish initialization
    while not pinecone.describe_index(PINECONE_INDEX_NAME).status['ready']:
        time.sleep(1)

index = pinecone.Index(PINECONE_INDEX_NAME)

In [12]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.00024,
 'namespaces': {'': {'vector_count': 24}},
 'total_vector_count': 24}

In [13]:
# pinecone.delete_index("langchain-index")

In [14]:
pinecone.create_index("langchain-index", dimension=1536) 

In [15]:
index_name = "langchain-index"

In [16]:
embeddings = OpenAIEmbeddings(deployment="text-embedding-ada-002") #EXPENSIVE - - - USE CAREFULLY

In [17]:
print(f"Total number of Documents is: {len(documents_list)}")

for i in range(len(documents_list)):
    try:
        document = documents_list[i]
        print(f"Processing document {i + 1}")

        # Split the document content using the split_text function
        chunks = split_text(document.page_content)

        for chunk in chunks:
            chunk_document = Document(page_content=chunk, metadata={'text': chunk})
            search = Pinecone.from_documents([chunk_document], embeddings, index_name=index_name)

    except Exception as e:
        print(f"Error processing document {i + 1}: {e}")

Total number of Documents is: 9
Processing document 1
Processing document 2
Processing document 3
Processing document 4
Processing document 5
Processing document 6
Processing document 7
Processing document 8
Processing document 9


In [18]:
# Do a simple vector similarity search
query = "I am having a doubt"

result = search.similarity_search(query)
result

[]

In [20]:
# text_field = "text"  # the metadata field that contains our text

# # Initialize the vector store object
# vectorstore = Pinecone(
#     index, embeddings.embed_query, text_field
# )



In [21]:
# scrapped_pages_dir = "./Data/Scrapped Pages"

In [22]:
# # Traverse the directory and get all .txt files
# txt_files = [f for f in os.listdir(scrapped_pages_dir) if f.endswith('.txt')]

In [23]:
# # Read the contents of each file and store in a list
# documents_list = []
# for file in txt_files:
#     with open(os.path.join(scrapped_pages_dir, file), 'r', encoding='utf-8') as f:
#         content = f.read()
#         documents_list.append(Document(page_content=content, metadata={'text': content}))

In [24]:
# print(f"Total number of Documents is: {len(documents_list)}")

# for i in range(len(documents_list)):
#     try:
#         document = documents_list[i]
#         print(f"Processing document {i + 1}")

#         # Split the document into chunks
#         chunks = split_text(document.page_content)

#         for chunk in chunks:
#             chunk_document = Document(page_content=chunk, metadata={'text': chunk})
#             # Vectorize the chunk
#             search = Pinecone.from_documents([chunk_document], embeddings, index_name=index_name)

#     except Exception as e:
#         print(f"Error processing document {i + 1}: {e}")

Total number of Documents is: 374
Processing document 1
Processing document 2
Processing document 3
Processing document 4
Processing document 5
Processing document 6
Processing document 7
Processing document 8
Processing document 9
Processing document 10
Processing document 11
Processing document 12
Processing document 13
Processing document 14
Processing document 15
Processing document 16
Processing document 17
Processing document 18
Processing document 19
Processing document 20
Processing document 21
Processing document 22
Processing document 23
Processing document 24
Processing document 25
Processing document 26
Processing document 27
Processing document 28
Processing document 29
Processing document 30
Processing document 31
Processing document 32
Processing document 33
Processing document 34
Processing document 35
Processing document 36
Processing document 37
Processing document 38
Processing document 39
Processing document 40
Processing document 41
Processing document 42
Processin

Processing document 346
Processing document 347
Processing document 348
Processing document 349
Processing document 350
Processing document 351
Processing document 352
Processing document 353
Processing document 354
Processing document 355
Processing document 356
Processing document 357
Processing document 358
Processing document 359
Processing document 360
Processing document 361
Processing document 362
Processing document 363
Processing document 364
Processing document 365
Processing document 366
Processing document 367
Processing document 368
Processing document 369
Processing document 370
Processing document 371
Processing document 372
Processing document 373
Processing document 374


In [25]:
# # Initialize the vector store object
# vectorstore = Pinecone(
#     index, embeddings.embed_query, text_field
# )

# query = "Michelle Jahn"
# vectorstore.similarity_search(query, k=3)

[Document(page_content="Michelle JahnResearch Administratormmjahn@usf.eduCampus:\xa0TampaRoom:\xa0BSN 3111Phone:\xa0813-974-1512Michelle Jahn is the Muma College of Business research administrator, a role in which\n               she will assist faculty in ferreting out opportunities for funded research, the preparation\n               and submission of competitive proposals and the management of awarded grants.She currently is enrolled in the MBA program with the Muma College of Business and\n               received a bachelor's degree in marketing from USF in 2005. She has been with USF\n               since 2007 and has served as a fiscal and business specialist, a research administrator\n               and administrative specialist with USF Research and Innovation."),
 Document(page_content='ffice Unit Research Administrator:\nMichelle Jahn\nmmjahn@usf.edu\n813-974-1512\nTampa Campus\nBSN 3111\nStaff Spotlight: Michelle Jahn is a one-person team helping faculty reach their research

In [26]:
# text_folder_path = "./Data/Syllabus 2023-selected/Text"

# txt_files = [f for f in os.listdir(text_folder_path) if f.endswith('.txt')]

# # Read the contents of each file and store in a list
# documents_list = []
# for file in txt_files:
#     with open(os.path.join(text_folder_path, file), 'r', encoding='utf-8') as f:
#         content = f.read()
#         documents_list.append(Document(page_content=content, metadata={'text': content}))

In [27]:
# print(f"Total number of Documents is: {len(documents_list)}")

# for i in range(len(documents_list)):
#     try:
#         document = documents_list[i]
#         print(f"Processing document {i + 1}")

#         # Split the document into chunks using the split_text function
#         chunks = split_text(document.page_content)

#         for chunk in chunks:
#             chunk_document = Document(page_content=chunk, metadata={'text': chunk})
#             # Vectorize the chunk using Pinecone
#             search = Pinecone.from_documents([chunk_document], embeddings, index_name=index_name)

#     except Exception as e:
#         print(f"Error processing document {i + 1}: {e}")

Total number of Documents is: 282
Processing document 1
Processing document 2
Processing document 3
Processing document 4
Processing document 5
Processing document 6
Processing document 7
Processing document 8
Error processing document 8: (500)
Reason: Internal Server Error
HTTP response headers: HTTPHeaderDict({'content-type': 'application/json', 'Content-Length': '150', 'date': 'Sun, 22 Oct 2023 20:43:35 GMT', 'x-envoy-upstream-service-time': '34', 'server': 'envoy', 'Via': '1.1 google', 'Alt-Svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000'})
HTTP response body: {"code":13,"message":"We were unable to process your request. If the problem persists, please contact us at https://support.pinecone.io","details":[]}

Processing document 9
Processing document 10
Processing document 11
Processing document 12
Processing document 13
Processing document 14
Processing document 15
Processing document 16
Processing document 17
Processing document 18
Processing document 19
Processing document

In [None]:
# vectorstore = Pinecone(
#     index, embeddings.embed_query, text_field
# )

# query = "USF Student Conduct Code"
# sample = vectorstore.similarity_search(query, k=3)
# sample