# ChatGPT - Custom Knowledge Base

### Installation

In [None]:
%%capture
!pip install langchain
!pip install openai
!pip install chromadb
# !pip install Cython
!pip install tiktoken
!pip install pypdf
!pip install unstructured
!pip install PyPDF2
!pip install pdf2image

### Load Required Packages

In [None]:
from langchain.document_loaders import PyPDFLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.llms.openai import OpenAI

### OpenAI API Key

In [None]:
# Expose the OpenAI key to the libraries through the OPENAI_API_KEY environment variable.
# Create a key at https://platform.openai.com/api-keys (an account is required;
# billing is managed at https://platform.openai.com/account/billing/overview).
# The key is prompted for rather than typed into the cell, so it is never saved
# in the notebook file or leaked when the notebook is shared.
import os
from getpass import getpass

# Respect a key that is already present in the environment; prompt only when missing.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

### Connect Google Drive

In [None]:
# Mount Google Drive in the Colab runtime so the data folders become reachable.
from google.colab import drive

# Base folder of the user's Drive once the mount below has succeeded.
root_dir = "/content/gdrive/MyDrive/"

# force_remount=True requests a fresh mount even when Drive is already mounted.
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


### Load Multiple PDF and Web Page Files

In [None]:
# Input folders, built from `root_dir` (set when Drive was mounted) so the
# location of the Drive is defined in one place only.
pdf_folder_path = os.path.join(root_dir, 'data/pdfs/')
webpages_folder_path = os.path.join(root_dir, 'data/webpages/')

# Sorted so the listing is identical on every run (os.listdir order is arbitrary).
sorted(os.listdir(pdf_folder_path))

['coming-to-ucsd-guide.pdf', 'ispo-welcome-guide.pdf']

In [None]:
from langchain.document_loaders import UnstructuredFileLoader

In [None]:
def _files_in(folder, suffix=None):
    """Return the sorted full paths of the regular files directly inside `folder`.

    Sub-directories are skipped. When `suffix` is given, only files whose name
    ends with it (compared case-insensitively) are kept.
    """
    paths = []
    for fn in sorted(os.listdir(folder)):
        path = os.path.join(folder, fn)
        # os.listdir also returns directories; a loader cannot open those.
        if not os.path.isfile(path):
            continue
        if suffix is not None and not fn.lower().endswith(suffix.lower()):
            continue
        paths.append(path)
    return paths


# One loader per input file: PDFs from the PDF folder, every file from the web pages folder.
pdf_loaders = [PyPDFLoader(path) for path in _files_in(pdf_folder_path, '.pdf')]
txt_loaders = [UnstructuredFileLoader(path) for path in _files_in(webpages_folder_path)]

In [None]:
# Select the document set to index. The web-page loaders are active;
# switch to the commented-out line below to index the PDFs instead.
loaders = txt_loaders
# loaders = pdf_loaders

### Vector Store
Chroma is used as the vector store to index and search the embeddings.


There are three main steps going on after the documents are loaded:

- Splitting documents into chunks

- Creating embeddings for each document

- Storing documents and embeddings in a vectorstore


In [None]:
# Build the vector index from whichever set of loaders was selected.
index_creator = VectorstoreIndexCreator()
index = index_creator.from_loaders(loaders)
# Display the resulting index wrapper.
index

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


VectorStoreIndexWrapper(vectorstore=<langchain.vectorstores.chroma.Chroma object at 0x7fec38680fd0>)

In [None]:
import pandas as pd

In [None]:
# Load the evaluation questions. Later cells need both `Q` (the table the answers
# are written back to) and `questions` (the text of each question), so this cell
# must run for the notebook to work from a fresh kernel.
Q = pd.read_csv('/content/Questions - Sheet1.csv')
questions = Q['Question'].tolist()

In [None]:
# Ask every question against the index and keep only the answer text.
# The LLM object is configured identically for every query, so it is created
# once and reused instead of being rebuilt on each iteration.
llm = OpenAI(temperature=0.3)
answers = []
for question in questions:
    response = index.query_with_sources(question, llm=llm)
    answers.append(response['answer'])

In [None]:
# Display the collected answers (one entry per question, in the order they were asked).
answers

[' To apply for a social security number, you can call customer service at 800-375-5283 or make an InfoPass appointment with USCIS.\n',
 ' Yes, you need a job letter to get a social security number.\n',
 ' To know if you are eligible to get an SSN, contact the Social Security Administration.\n',
 ' The processing time for an F-1 CPT application is 10 business days.\n',
 ' The International Students & Programs Office (ISPO) requires proof of funding for the standardized estimated rent costs for all F-1/J-1 students when requesting program extensions.\n',
 ' The tuition for the MAS Climate Science and Policy at SIO is $37,930.\n',
 ' No, President Biden has not lifted restrictions for fully vaccinated travellers from Southern Africa as yet. \n',
 ' The average cost of living in San Diego is not specified in the given sources.\n',
 ' Documents needed for a visa application include a valid passport, I-94 Non-immigrant Arrival/Departure Record, I-20 or DS-2019 form, and Employment Authoriza

In [None]:
# Store the answers as a column of the questions table (this modifies `Q` in place),
# then display the table.
# NOTE(review): the column label says "webpages", which matches `loaders = txt_loaders`;
# presumably it should be changed when the index is built from the PDFs — confirm.
Q['Chat-GPT: knowledge base - webpages'] = answers
Q

Unnamed: 0,Number,Question,GPT Website,GPT PDF,Chat-GPT: knowledge base,Model Answer,Chat-GPT: knowledge base - pdfs,Chat-GPT: knowledge base - webpages
0,1,How can I apply for a social security number?,### You can apply for a social security numbe...,### You can apply for a social security numbe...,,- Submit a request for a support letter at fro...,"To apply for a social security number, you ne...","To apply for a social security number, you ca..."
1,2,Do I need a job letter to get a social securit...,"### No, you do not need a job letter to get a...","No, you do not need a job letter to get a soci...",,Yes,"Yes, you need a job letter to get a social se...","Yes, you need a job letter to get a social se..."
2,3,How do I know if I am eligible to get an SSN?,### You must meet the eligibility requirement...,### You can check if you are eligible to get ...,,- A Social Security Number (SSN) is a 9-digit ...,"To be eligible to get an SSN, you need to hav...","To know if you are eligible to get an SSN, co..."
3,4,What's the processing time for an F-1 CPT appl...,### The processing time for an F-1 CPT applic...,### The F-1 CPT application process can take ...,,,I don't know.\n,The processing time for an F-1 CPT applicatio...
4,5,How much funding do you need to show when ente...,### You need to show at least $500 to enter t...,### You need to show at least $30 US dollars ...,,,"You need to have a valid I-20 Form, I-94 Reco...",The International Students & Programs Office ...
5,6,What's the tuition for for the MAS Climate Sci...,### The tuition for for the MAS Climate Scien...,### The tuition for the MAS Climate Science a...,,,I don't know.\n,The tuition for the MAS Climate Science and P...
6,7,Has President Biden lifted restrictions for fu...,"### No, President Biden has not yet lifted re...","### No, President Biden has not yet lifted re...",,,I don't know.\n,"No, President Biden has not lifted restrictio..."
7,8,What is the average cost of living in SD?,"### The average cost of living in SD is $29,0...","### The average cost of living in SD is $4,50...",,,I don't know.\n,The average cost of living in San Diego is no...
8,9,What documents are needed for a visa application?,"### A valid passport, I-94 Non-immigrant Arri...",### A passport and two forms of government-is...,,,Documents needed for a visa application inclu...,Documents needed for a visa application inclu...
9,10,What should I pack when coming to SD?,### You should pack your clothing in Ziploc b...,"### Coming to SD? You should pack light, as m...",,,"When coming to SD, you should pack a towel, s...","When coming to San Diego, it is recommended t..."


In [None]:
# Save the table. index=False keeps pandas' row index out of the file; otherwise
# it comes back as an extra "Unnamed: 0" column the next time the CSV is read.
Q.to_csv('/content/questions.csv', index=False)

### Query Individual Questions

In [None]:
# Ask one ad-hoc question and display the full response returned by the index.
single_question = 'What is the purpose of the Analytical Writing Program?'
llm = OpenAI(temperature=0.3)
response = index.query_with_sources(single_question, llm=llm)
response

{'question': 'What is the purpose of the Analytical Writing Program?',
 'answer': ' The purpose of the Analytical Writing Program is to help students master critical thinking, reading, and writing skills, and to provide courses that fulfill the UC Entry Level Writing Requirement (ELWR).\n',
 'sources': '/content/gdrive/MyDrive/data/webpages/ispo.ucsd.edu_campus-partners_advising-international-students_iresource.html#main-content.txt, /content/gdrive/MyDrive/data/webpages/ispo.ucsd.edu_advising_academic-english.html#main-content.txt'}

In [None]:
# Print only the answer text taken from the response.
print(response['answer'])

 The purpose of the Analytical Writing Program is to help students master critical thinking, reading, and writing skills, and to provide courses that fulfill the UC Entry Level Writing Requirement (ELWR).



In [None]:
def _source_to_url(source):
    """Rewrite a saved web-page file path as a URL; any other source is returned unchanged."""
    # Only entries containing 'html' are treated as saved web pages.
    if 'html' not in source:
        return source
    url = source.replace('/content/gdrive/MyDrive/data/webpages/', 'https://')
    url = url.replace('.txt', '')
    # Underscores in the file name are turned into '/' path separators.
    return url.replace('_', '/')


# The sources arrive as one ', '-separated string; print one entry per line.
for src in response['sources'].split(', '):
    print(_source_to_url(src))

https://ispo.ucsd.edu/campus-partners/advising-international-students/iresource.html#main-content
https://ispo.ucsd.edu/advising/academic-english.html#main-content
