In [1]:
#############################################################
# # Document Loading
#############################################################

In [2]:
#############################################################
# ## Note to students.
# - During periods of high load you may find the 
#   notebook unresponsive. 
# - It may appear to execute a cell, update the completion 
#   number in brackets [#] at the left of the cell but you 
#   may find the cell has not executed. 
# - This is particularly obvious on print statements 
#   when there is no output. If this happens, restart 
#   the kernel using the command under the Kernel tab.
#############################################################

In [3]:
#############################################################
# Retrieval Rugmented Generation
#  
# - In Retrieval Augmented Generation (RAG), an 
#   LLM retrieves contextual documents from an external 
#   dataset as part of its execution. 
#   + This is useful if we want to ask question about specific 
#     documents 
#
#       + PDF
#       + Youtube
#       + URL (web pages)
#       + Notion (database structured data)
#
# - The purpose of document loaders is to take this variety 
#   of data sources and load them into a standard document
#   object, which consists of 
#   + content 
#   + associated metadata.
#############################################################

In [4]:
# ![overview.jpeg](attachment:overview.jpeg)

In [5]:
! pip install langchain



In [7]:
import os
from openai import OpenAI
# read the api key from environment variable
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [8]:
! pip install --upgrade langchain
! pip install langchain-community



In [9]:
#############################################################
# 1. PDFs
# 
# Let's load a PDF (SFBU catalogue)
#############################################################

In [11]:

from langchain_community.document_loaders.pdf import PyPDFLoader
from langchain.document_loaders.parsers.pdf import PyPDFParser

# Import other parsers as needed

loader = PyPDFLoader('2023Catalog.pdf')
pages = loader.load_and_split()

In [12]:
# Each page is a `Document`.
# 
# A `Document` contains 
# - text (`page_content`) 
# - `metadata`.

In [13]:
len(pages)
page = pages[0]
print(page.page_content[0:500])

Catalog 202 3 i ver. 202 3.09.24 
161 Mission Falls Lane, Fremont, CA 94539  
Tel: (510) 803-SFBU ( 7328); e -mail: admissions@sfbu.edu  
 
 
2023 CATALOG                           JAN 1 - DEC 31, 2023


In [14]:
page.metadata

{'source': '2023Catalog.pdf', 'page': 0}

In [15]:
#############################################################
# 2. YouTube
#############################################################

In [16]:
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers import OpenAIWhisperParser
from langchain_community.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLoader

In [None]:
! pip install yt_dlp
! pip install pydub

In [None]:
!pip install ffmpeg
! pip install ffprobe

In [17]:
url="https://www.youtube.com/watch?v=kuZNIvdwnMc"

save_dir="docs/youtube/"

loader = GenericLoader(
    YoutubeAudioLoader([url],save_dir),
    OpenAIWhisperParser()
)

In [18]:
docs = loader.load()
docs[0].page_content[0:500]


[youtube] Extracting URL: https://www.youtube.com/watch?v=kuZNIvdwnMc
[youtube] kuZNIvdwnMc: Downloading webpage
[youtube] kuZNIvdwnMc: Downloading ios player API JSON
[youtube] kuZNIvdwnMc: Downloading android player API JSON
[youtube] kuZNIvdwnMc: Downloading m3u8 information
[info] kuZNIvdwnMc: Downloading 1 format(s): 140
[download] docs/youtube//San Francisco Bay University MBA Student Spotlight： John Odebode.m4a has already been downloaded
[download] 100% of   10.20MiB
[ExtractAudio] Not converting audio docs/youtube//San Francisco Bay University MBA Student Spotlight： John Odebode.m4a; file is already in target format m4a
Transcribing part 1!


"My name is John, John Odebode. I am studying for an MBA program here at SFBU. It's my final trimester at SFBU and I will be graduating in two weeks. I am from Nigeria. I studied at the University of Lagos for my first degree in philosophy. I also studied for my first master's degree in philosophy as well at the same university. I have been practicing within the supply chain industry for the past six years. I have spent the most part of my career at ExxonMobil and I recently completed a six-month"

In [None]:
#############################################################
# 3. URLs
#############################################################

In [22]:
from langchain_community.document_loaders.web_base import WebBaseLoader

loader = WebBaseLoader("https://www.sfbu.edu/admissions/student-health-insurance")


docs = loader.load()


print(docs[0].page_content[:500])






















Student Health Insurance








































      Skip to main content
    


 















Main navigation


About Us


Overview


University Leadership


Strategic Plan


Accreditation


Policies


Careers




Admissions


Requirements for Degree Programs


Requirements for Intensive English Program


Scholarships


Tuition & Costs


Articulation & Transfer Agreements


Contact Admissions Team


Admitted Students




Academics


School of Business




In [23]:
#############################################################
# 4. Notion
#
# - Notion is a really popular store of both personal 
#   and company data, and a lot of people have created 
#   chatbots talking to their Notion databases.
# - Follow steps [here](https://python.langchain.com/docs/modules/
# data_connection/document_loaders/integrations/notion) for an 
# example Notion site such as [this one](https://yolospace.notion.
# site/Blendle-s-Employee-Handbook-
# e31bff7da17346ee99f531087d8b133f):
# 
# - Duplicate the page into your own Notion space and 
#   export as `Markdown / CSV`.
# - Unzip it and save it as a folder that contains the 
#   markdown file for the Notion page.
#############################################################

# ![image.png](./img/image.png)

# In[ ]:



In [34]:
from langchain_community.document_loaders.notion import NotionDirectoryLoader
loader = NotionDirectoryLoader("2f1f7b2e-7138-4af0-8891-50e04ad56454_Export-326004c6-1ce0-4c4f-86d9-3149769bd3ba/Blendle's Employee Handbook c47cde56a3924610833bb8a8ff4dacb8")
docs = loader.load()

print(docs[0].page_content[0:200])

# Your 1st month

Hey you! Welcome to Blendle. Buckle up, you're in for one hell of a ride :). 

The faster you get settled in the better, so we came up with a structure for your first month to make s


In [35]:
docs[0].metadata

{'source': "2f1f7b2e-7138-4af0-8891-50e04ad56454_Export-326004c6-1ce0-4c4f-86d9-3149769bd3ba/Blendle's Employee Handbook c47cde56a3924610833bb8a8ff4dacb8/Your 1st month 55783656b7d24e219627b9ed43ef74cd.md"}