### Installation

In [1]:
!pip install langchain
!pip install unstructured
!pip install openai
!pip install chromadb
!pip install Cython
!pip install tiktoken

Collecting langchain
  Downloading langchain-0.0.265-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.6.0,>=0.5.7 (from langchain)
  Downloading dataclasses_json-0.5.14-py3-none-any.whl (26 kB)
Collecting langsmith<0.1.0,>=0.0.11 (from langchain)
  Downloading langsmith-0.0.22-py3-none-any.whl (32 kB)
Collecting openapi-schema-pydantic<2.0,>=1.2 (from langchain)
  Downloading openapi_schema_pydantic-1.2.4-py3-none-any.whl (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.0/90.0 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pydantic<2,>=1 (from langchain)
  Downloading pydantic-1.10.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasse

In [2]:
!pip install pdf2image

Collecting pdf2image
  Downloading pdf2image-1.16.3-py3-none-any.whl (11 kB)
Installing collected packages: pdf2image
Successfully installed pdf2image-1.16.3


In [13]:
!pip install pdfminer-six

Collecting pdfminer-six
  Downloading pdfminer.six-20221105-py3-none-any.whl (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pdfminer-six
Successfully installed pdfminer-six-20221105


### Load Required Packages

In [14]:
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.indexes import VectorstoreIndexCreator

### OpenAI API Key

In [4]:
# Get your API keys from openai, you will need to create an account.
# Here is the link to get the keys: https://platform.openai.com/account/billing/overview
import os
os.environ["OPENAI_API_KEY"] = "<your-own-api-key>"

### Connect Google Drive

In [5]:
# connect your Google Drive
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)
root_dir = "/content/gdrive/My Drive/"

Mounted at /content/gdrive


In [6]:
pdf_folder_path = f'{root_dir}/Colab Notebooks/Thesis/'
os.listdir(pdf_folder_path)

['5033458416.pdf',
 'TheHappyFarmer.pdf',
 'IPCC_AR6_SYR_LongerReport.pdf',
 '219089_f3a09c11-5f21-4e5d-bcff-58c7235a7186.pdf',
 '1709_00029.pdf',
 '2304_00116.pdf',
 'AIComputingEmitsCO2.pdf',
 'AllIslandClimateandBiodiversityResearchNetworkFinal.pdf',
 'nationalLandCoverMap_v3_2.pdf',
 's10113_021_01798_8.pdf']

### Load Multiple PDF files

In [7]:
# location of the pdf file/files.
loaders = [UnstructuredPDFLoader(os.path.join(pdf_folder_path, fn)) for fn in os.listdir(pdf_folder_path)]

In [7]:
loaders

[<langchain.document_loaders.pdf.UnstructuredPDFLoader at 0x79191509a590>,
 <langchain.document_loaders.pdf.UnstructuredPDFLoader at 0x7919153217e0>,
 <langchain.document_loaders.pdf.UnstructuredPDFLoader at 0x791915321810>,
 <langchain.document_loaders.pdf.UnstructuredPDFLoader at 0x79191509ac80>,
 <langchain.document_loaders.pdf.UnstructuredPDFLoader at 0x79191509a620>,
 <langchain.document_loaders.pdf.UnstructuredPDFLoader at 0x791914ee4220>,
 <langchain.document_loaders.pdf.UnstructuredPDFLoader at 0x791914ee4280>,
 <langchain.document_loaders.pdf.UnstructuredPDFLoader at 0x791914ee42e0>,
 <langchain.document_loaders.pdf.UnstructuredPDFLoader at 0x791914ee4340>,
 <langchain.document_loaders.pdf.UnstructuredPDFLoader at 0x791914ee43a0>]

### Vector Store
Chroma as vectorstore to index and search embeddings


There are three main steps going on after the documents are loaded:

- Splitting documents into chunks

- Creating embeddings for each document

- Storing documents and embeddings in a vectorstore


In [15]:
index = VectorstoreIndexCreator().from_loaders(loaders)
index

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


VectorStoreIndexWrapper(vectorstore=<langchain.vectorstores.chroma.Chroma object at 0x7a07e0d43a30>)

In [16]:
index.query('What is the impact of climate change on biodiversity?')

' Climate change has caused substantial damages and increasingly irreversible losses in terrestrial, freshwater, cryospheric and coastal and open ocean ecosystems. Approximately half of the species assessed globally have shifted polewards or, on land, also to higher elevations. Biological responses including changes in geographic placement and shifting seasonal timing are often not sufficient to cope with recent climate change. Hundreds of local losses of species have been driven by increases in the magnitude of heat extremes and mass mortality events on land and in the ocean. Impacts on some ecosystems are approaching irreversibility such as the impacts of hydrological changes resulting from the retreat of glaciers, or the changes in some mountain ecosystems. The likelihood of abrupt and irreversible changes and their impacts increase with higher global warming levels. As warming levels increase, so do the risks of species extinction or irreversible loss of biodiversity in ecosystems 

In [17]:
index.query('how many Level 2 categories are in the national land cover map of ireland?')

' There are 36 Level 2 categories in the National Land Cover Map of Ireland.'

In [18]:
index.query('What makes a happy farmer?')

' Factors that make a happy farmer include social interaction with other farmers, farm work, and nonpecuniary benefits from farming.'

In [19]:
index.query('Why does a farmer work?')

' A farmer works to produce food and other agricultural products, to manage their land and resources, and to engage in social interaction with other farmers.'

In [20]:
index.query('What are other names for hedgerows?')

' Other names for hedgerows include shelterbelts, windbreaks, bocage (in France), and Knick (in Germany).'

In [21]:
index.query_with_sources('What are other names for hedgerows??')

{'question': 'What are other names for hedgerows??',
 'answer': ' Other names for hedgerows include shelterbelts, windbreaks, bocage, Knick, and live fences.\n',
 'sources': '/content/gdrive/My Drive//Colab Notebooks/Thesis/s10113_021_01798_8.pdf'}

## Disclaimer:
Note: OpenAI provides a free API key for initial testing. Once you move to a paid subscription, calling the API in the way demonstrated in this example will incur monetary charges. Refer to OpenAI's pricing information for details.

Be aware that information, such as files to train OpenAI's LLM can become public if applied in the way this demo demonstrates. Refer to OpenAI's usage policy for details.

Do not use for actual tax filing purposes. This demo is for educational purposes only and for demonstrating machine learning methods. The author makes no claims that the outcomes shown here or any outcomes that could be produced by this method are accurate or reliable.