 Libraries
  - langchain
  - openai
  - tqdm: library to show the progress of an action (downloading, training, ...)
  - jq: lightweight and flexible JSON processor
  - unstructured: A library that prepares raw documents for downstream ML tasks
  - pypdf: A pure-python PDF library capable of splitting, merging, cropping, and transforming PDF files
  - tiktoken: a fast open-source tokenizer by OpenAI.

In [None]:
!pip install langchain openai tqdm jq unstructured pypdf tiktoken

Collecting langchain
  Downloading langchain-0.0.348-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting openai
  Downloading openai-1.3.8-py3-none-any.whl (221 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m221.5/221.5 kB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0m
Collecting jq
  Downloading jq-1.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (656 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m656.0/656.0 kB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting unstructured
  Downloading unstructured-0.11.2-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m30.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pypdf
  Downloading pypdf-3.17.2-py3-none-any.whl (277 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m277.9/277.9 kB[0m [31m2

In [None]:
# Fetch the secret named 'OPENAI_API_KEY' through google.colab.userdata so the key
# is not hardcoded in the notebook. This import only works on a Colab runtime.
from google.colab import userdata
OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')

# Loading Documents

In [None]:
from langchain.document_loaders.csv_loader import CSVLoader

from langchain.document_loaders import DirectoryLoader

from langchain.document_loaders import UnstructuredHTMLLoader

from langchain.document_loaders import JSONLoader

from langchain.document_loaders import UnstructuredMarkdownLoader

from langchain.document_loaders import PyPDFLoader

# CSV Loader

In [None]:
# Load the cities CSV. The result is a list of Document objects, one per CSV row;
# each carries the source path and row index as metadata (see the output below).
loader = CSVLoader(file_path='/content/cities.csv')
data = loader.load()

In [None]:
# Plain-text dump of the whole list (long: one Document per CSV row).
print(data)

[Document(page_content='station_id: 41515\ncity_name: Asadabad\ncountry: Afghanistan\nstate: Kunar\niso2: AF\niso3: AFG\nlatitude: 34.8660000397\nlongitude: 71.1500045859', metadata={'source': '/content/cities.csv', 'row': 0}), Document(page_content='station_id: 38954\ncity_name: Fayzabad\ncountry: Afghanistan\nstate: Badakhshan\niso2: AF\niso3: AFG\nlatitude: 37.1297607616\nlongitude: 70.5792471913', metadata={'source': '/content/cities.csv', 'row': 1}), Document(page_content='station_id: 41560\ncity_name: Jalalabad\ncountry: Afghanistan\nstate: Nangarhar\niso2: AF\niso3: AFG\nlatitude: 34.4415269155\nlongitude: 70.4361034738', metadata={'source': '/content/cities.csv', 'row': 2}), Document(page_content='station_id: 38947\ncity_name: Kunduz\ncountry: Afghanistan\nstate: Kunduz\niso2: AF\niso3: AFG\nlatitude: 36.7279506623\nlongitude: 68.8725296619', metadata={'source': '/content/cities.csv', 'row': 3}), Document(page_content='station_id: 38987\ncity_name: Qala i Naw\ncountry: Afghanis

In [None]:
# Same list, displayed as the cell's last expression instead of via print().
data

[Document(page_content='station_id: 41515\ncity_name: Asadabad\ncountry: Afghanistan\nstate: Kunar\niso2: AF\niso3: AFG\nlatitude: 34.8660000397\nlongitude: 71.1500045859', metadata={'source': '/content/cities.csv', 'row': 0}), Document(page_content='station_id: 38954\ncity_name: Fayzabad\ncountry: Afghanistan\nstate: Badakhshan\niso2: AF\niso3: AFG\nlatitude: 37.1297607616\nlongitude: 70.5792471913', metadata={'source': '/content/cities.csv', 'row': 1}), Document(page_content='station_id: 41560\ncity_name: Jalalabad\ncountry: Afghanistan\nstate: Nangarhar\niso2: AF\niso3: AFG\nlatitude: 34.4415269155\nlongitude: 70.4361034738', metadata={'source': '/content/cities.csv', 'row': 2}), Document(page_content='station_id: 38947\ncity_name: Kunduz\ncountry: Afghanistan\nstate: Kunduz\niso2: AF\niso3: AFG\nlatitude: 36.7279506623\nlongitude: 68.8725296619', metadata={'source': '/content/cities.csv', 'row': 3}), Document(page_content='station_id: 38987\ncity_name: Qala i Naw\ncountry: Afghanis

In [None]:
# First Document: its text content plus its metadata.
data[0]

Document(page_content='station_id: 41515\ncity_name: Asadabad\ncountry: Afghanistan\nstate: Kunar\niso2: AF\niso3: AFG\nlatitude: 34.8660000397\nlongitude: 71.1500045859', metadata={'source': '/content/cities.csv', 'row': 0})

In [None]:
# page_content: the row rendered as newline-separated "column: value" pairs.
data[0].page_content

'station_id: 41515\ncity_name: Asadabad\ncountry: Afghanistan\nstate: Kunar\niso2: AF\niso3: AFG\nlatitude: 34.8660000397\nlongitude: 71.1500045859'

In [None]:
# metadata: the source file path and the row index (the first row is row 0).
data[0].metadata

{'source': '/content/cities.csv', 'row': 0}

# The same CSV read with pandas (for comparison)

In [None]:
import pandas as pd

In [None]:
# Read the same file with pandas, to compare with the CSVLoader result above.
df = pd.read_csv('/content/cities.csv')

In [None]:
# Preview the first five rows (head() returns 5 rows by default).
df.head()

Unnamed: 0,station_id,city_name,country,state,iso2,iso3,latitude,longitude
0,41515,Asadabad,Afghanistan,Kunar,AF,AFG,34.866,71.150005
1,38954,Fayzabad,Afghanistan,Badakhshan,AF,AFG,37.129761,70.579247
2,41560,Jalalabad,Afghanistan,Nangarhar,AF,AFG,34.441527,70.436103
3,38947,Kunduz,Afghanistan,Kunduz,AF,AFG,36.727951,68.87253
4,38987,Qala i Naw,Afghanistan,Badghis,AF,AFG,34.983,63.1333


# PDF Loader

In [None]:
# Repeated import: PyPDFLoader is already imported in the imports cell above.
from langchain.document_loaders import PyPDFLoader

# Relative path, so book.pdf must be in the kernel's working directory.
# Note that this rebinds `loader`, which previously held the CSVLoader.
loader = PyPDFLoader("book.pdf")

# No text_splitter is passed here, so the loader's default splitting applies.
pages = loader.load_and_split()

In [None]:
# First chunk of the PDF. The garbled text in the output below presumably comes from
# text extraction of a scanned cover page — TODO confirm.
pages[0]

Document(page_content=".ELEIIIENTS OF\nPROORA[ll[llI[|O\nI]{TERUIEUJSIN\nputhon'\nADilAil ATIZ\nTStlilO.HSIEilI LEE\nAMIT PRAKASH", metadata={'source': 'book.pdf', 'page': 0})

# Text Splitter

Text splitters work as follows:

  1. Split the text up into small, semantically meaningful chunks (often sentences).
  2. Start combining these small chunks into a larger chunk until you reach a certain size (as measured by some function).
  3. Once you reach that size, make that chunk its own piece of text and then start creating a new chunk of text with some overlap (to keep context between chunks).

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
# Separators noted by the author: ["\n\n", "\n", " ", ""]
# NOTE(review): presumably RecursiveCharacterTextSplitter's default separator order — confirm in the docs.
text_splitter_1 = RecursiveCharacterTextSplitter(
    chunk_size=100,        # deliberately tiny, just so the splitting is easy to see
    chunk_overlap=20,      # overlap between neighbouring chunks, in the same units as chunk_size
    length_function=len,   # measure chunk size in characters
    add_start_index=True,  # records 'start_index' in each chunk's metadata
)

In [None]:
# Re-split the PDF with the small-chunk splitter. `loader` must still be the PyPDFLoader
# created above; it is rebound to a DirectoryLoader later in the notebook.
pages_1 = loader.load_and_split(text_splitter=text_splitter_1)


In [None]:
# pages_1

In [None]:
# First chunk; its metadata now also contains 'start_index' (from add_start_index=True).
pages_1[0]

Document(page_content=".ELEIIIENTS OF\nPROORA[ll[llI[|O\nI]{TERUIEUJSIN\nputhon'\nADilAil ATIZ\nTStlilO.HSIEilI LEE\nAMIT PRAKASH", metadata={'source': 'book.pdf', 'page': 0, 'start_index': 0})

In [None]:
# Length of the first chunk in characters, to compare with chunk_size=100.
len(pages_1[0].page_content)

100

In [None]:
from langchain.text_splitter import CharacterTextSplitter

In [None]:
# CharacterTextSplitter splits on a single separator string; here it is set
# explicitly to a blank line ("\n\n"), which is also the default.
text_splitter_2 = CharacterTextSplitter(
    separator="\n\n",
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,  # measure chunk size in characters
)

In [None]:
# Split the same PDF with the separator-based splitter. `loader` must still be the
# PyPDFLoader here; it is rebound to a DirectoryLoader later in the notebook.
pages_2 = loader.load_and_split(text_splitter=text_splitter_2)

In [None]:
# First chunk from the separator-based splitter (no 'start_index' in its metadata).
pages_2[0]

Document(page_content=".ELEIIIENTS OF\nPROORA[ll[llI[|O\nI]{TERUIEUJSIN\nputhon'\nADilAil ATIZ\nTStlilO.HSIEilI LEE\nAMIT PRAKASH", metadata={'source': 'book.pdf', 'page': 0})

In [None]:
# Length of the first chunk in characters, to compare with chunk_size=1000.
len(pages_2[0].page_content)

100

# Directory loader

In [None]:
# NOTE(review): this replaces the unstructured release installed by the first cell
# (0.11.2 in the recorded log) with 0.7.12 — presumably needed for DirectoryLoader; confirm the reason.
!pip install unstructured==0.7.12

Collecting unstructured==0.7.12
  Downloading unstructured-0.7.12-py3-none-any.whl (1.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting argilla (from unstructured==0.7.12)
  Downloading argilla-1.20.0-py3-none-any.whl (3.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m58.2 MB/s[0m eta [36m0:00:00[0m
Collecting msg-parser (from unstructured==0.7.12)
  Downloading msg_parser-1.2.0-py2.py3-none-any.whl (101 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.8/101.8 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
Collecting pdf2image (from unstructured==0.7.12)
  Downloading pdf2image-1.16.3-py3-none-any.whl (11 kB)
Collecting pdfminer.six (from unstructured==0.7.12)
  Downloading pdfminer.six-20221105-py3-none-any.whl (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m76.1 MB/s[0m eta [36m0:

In [None]:
import os

# Create the folder that DirectoryLoader reads from.
# exist_ok=True makes the cell safe to re-run: a plain os.mkdir raises
# FileExistsError once the directory already exists.
os.makedirs('docs', exist_ok=True)

In [None]:
from langchain.document_loaders import DirectoryLoader

In [None]:
# Directory handed to DirectoryLoader (the one created in the previous cell).
folder = 'docs'

In [None]:
# Rebinds `loader`: from here on it is a DirectoryLoader, no longer the PyPDFLoader.
loader = DirectoryLoader(folder)

In [None]:
# Load the documents from `folder`.
# NOTE(review): the notebook only creates an empty docs/ directory, yet the recorded
# count below is 3 — the files were presumably added by hand before running; confirm.
docs = loader.load()

In [None]:
# Number of Documents loaded from the folder.
len(docs)

3

# Summarizer

https://python.langchain.com/docs/use_cases/summarization

Documents:
  https://python.langchain.com/docs/modules/chains/document/

1. Stuff
2. Refine
3. Map Reduce
4. Map re-rank

# Stuff
  It takes a list of documents, inserts them all into a prompt and passes that prompt to an LLM.

In [None]:
from langchain.chains.summarize import load_summarize_chain
from langchain.chat_models import ChatOpenAI

In [None]:
# Chat model used by the summarization chain; temperature=0 selects the least-random sampling.
llm = ChatOpenAI(
    openai_api_key=OPENAI_API_KEY,
    temperature=0,
    model_name="gpt-3.5-turbo-1106",
)

In [None]:
# Build the ready-made summarization chain using the "stuff" strategy.
chain = load_summarize_chain(llm, chain_type="stuff")

In [None]:
# chain.run(docs)

# Print the prompt template

In [None]:
# Show the prompt the "stuff" chain sends to the model. The inner LLMChain is
# exposed as `llm_chain`; the previous `ll_chain` was a typo that raised AttributeError.
print(chain.llm_chain.prompt.template)

In [None]:
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains.llm import LLMChain
from langchain.prompts import PromptTemplate

# Prompt: {text} is the placeholder that receives the documents.
prompt_template = """Write a concise summary of the following:
"{text}"
CONCISE SUMMARY:"""
prompt = PromptTemplate.from_template(prompt_template)

# Chat model and the single-prompt chain that wraps it
llm = ChatOpenAI(
    openai_api_key=OPENAI_API_KEY,
    temperature=0,
    model_name="gpt-3.5-turbo-1106",
)
llm_chain = LLMChain(llm=llm, prompt=prompt)

# StuffDocumentsChain: document_variable_name names the prompt placeholder ({text})
# into which the documents are inserted.
stuff_chain = StuffDocumentsChain(
    llm_chain=llm_chain,
    document_variable_name="text",
)

# Load the documents again through the current `loader`
docs = loader.load()
# print(stuff_chain.run(docs))