In [1]:
import os

from dotenv import find_dotenv, load_dotenv

# Locate a .env file with find_dotenv() and hand its path to load_dotenv();
# the return value is intentionally discarded into `_`.
_ = load_dotenv(dotenv_path=find_dotenv())

# Indexing os.environ raises KeyError when OPENAI_API_KEY is not set.
openai_api_key = os.environ["OPENAI_API_KEY"]

## Document Loaders
LangChain can load data from many sources:
* Websites.
* Databases.
* YouTube, Twitter.
* Excel, Pandas, Notion, Figma, HuggingFace, GitHub, etc.

LangChain can load data of many types:
* PDF.
* HTML.
* JSON.
* Word, Powerpoint, etc.

**Sometimes you will have to clean or prepare the data you load before you can use it.**
<br>
This is something data scientists are used to doing.

## Loading PDF documents

In [2]:
# !pip install pypdf

In [3]:
# Import the loader from langchain_community, the same package the other
# loader imports in this notebook use, instead of the deprecated
# langchain.document_loaders path.
from langchain_community.document_loaders import PyPDFLoader

# Load the sample PDF; `pages` holds whatever loader.load() returns.
loader = PyPDFLoader("data/5pages.pdf")
pages = loader.load()

In [4]:
# Count the items returned by loader.load() for the PDF.
len(pages)

4

In [5]:
# Keep the first loaded item for inspection in the following cells.
page = pages[0]

In [6]:
# Preview only the first 500 characters of the page text.
print(page.page_content[:500])

Page 1 of 4 PDF Files 
Scan – Create – Reduce File Size  
 
 
It is recommended that you purchase an Adobe Acrobat product that 
allows you to read, create and manipulate PDF documents.  Go to http://www.adobe.com/products/acrobat/matrix.html
 to compare 
Adobe products and features –Adobe  Acrobat Standard is sufficient. 
 
 
Scanning Documents 
 
You should only have to scan docu ments that are not electronic, and 
when you are unable to create a PDF using PDFMaker or the Print 
Command from t


In [7]:
# Show the metadata attached to the first loaded page.
page.metadata

{'source': 'data/5pages.pdf', 'page': 0}

## Loading YouTube Audio

In [8]:
#from langchain.document_loaders.generic import GenericLoader
#from langchain.document_loaders.parsers import OpenAIWhisperParser
#from langchain.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLoader
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers import OpenAIWhisperParser
from langchain_community.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLoader

# !pip install yt_dlp
# !pip install pydub

**The following code stopped working with the original imports; see the updated imports above, which now use the `langchain_community` module. Many thanks to Isabel González for the updates; you are on the right track to becoming an Honor Student!**

In [9]:
# url="https://www.youtube.com/watch?v=Rb9Bpw8yvTg"
# save_dir="data/youtube/"
# loader = GenericLoader(
#     YoutubeAudioLoader([url],save_dir),
#     OpenAIWhisperParser()
# )
# docs = loader.load()

In [10]:
# docs[0].page_content[0:500]

## Loading websites

**Option 1: Web Base Loader**

In [11]:
# Import WebBaseLoader from langchain_community, the same package the other
# loader imports in this notebook use, instead of the deprecated
# langchain.document_loaders path.
from langchain_community.document_loaders import WebBaseLoader

# Configure the loader with the URL of the article to load.
loader = WebBaseLoader("https://aiaccelera.com/100-ai-startups-100-llm-apps-that-have-earned-500000-before-their-first-year-of-existence/")

In [12]:
# Run the WebBaseLoader configured above and keep the result in `docs`.
docs = loader.load()

In [13]:
# Preview the first 2000 characters of the first loaded document's text.
print(docs[0].page_content[:2000])











"100 AI Startups": 100 LLM Apps that have earned $500,000 before their first year of existence
























































 




















                                    info@aiaccelera.com                                





Spanish:
  



















0








Services  

Learn to create LLM Applications
Development of Enterprise LLM Apps
Development of LLM Apps for Startups
AI Consulting for Businesses
In-Company AI Training
Building your Team of AI Professionals
We are looking for Entrepreneurs
AI Sandbox: Developers + Entrepreneurs


Use Cases
Resources
Community  

Management Team
AI Hall of Fame
AI Experts Council
AI Investors Council


Partners
Jobs
Contact
 














 



“100 AI Startups”: 100 LLM Apps that have earned $500,000

Homeblog“100 AI Startups”: 100 LLM Apps that have earned $500,000 











 




Posted on October 5, 2023
 

AI Accelera


No Comments













The new book “100 AI Startups” by Julio 

**Option 2: Unstructured HTML Loader**

In [23]:
# !pip install unstructured

In [15]:
# Import from langchain_community, the same package the other loader imports
# in this notebook use, instead of the deprecated langchain.document_loaders path.
from langchain_community.document_loaders import UnstructuredHTMLLoader

In [16]:
# Build a loader for the HTML file saved under data/.
loader = UnstructuredHTMLLoader("data/_100 AI Startups__ 100 LLM Apps that have earned $500,000 before their first year of existence.html")

In [17]:
# Run the UnstructuredHTMLLoader configured above and keep the result in `data`.
data = loader.load()

In [18]:
# Display the loaded result as the cell output.
data

[Document(page_content='Posted on October 5, 2023\n\nAI Accelera\n\nNo Comments\n\nThe new book “100 AI Startups” by Julio Colomer presents 100 new Artificial Intelligence companies that have earned at least $500,000 in their first year of existence and are radically changing how things are done in over 30 sectors, including banking, insurance, health, education, legal, logistics, marketing, sales, customer service, and even in public administration.\n\nThese 100 startups from Silicon Valley show how the fastest-growing area among the new Artificial Intelligence companies are the new LLM Apps (LLM Apps), which are changing every sector and areas of the company.\n\nIt’s not science fiction. There are already startups with LLM Applications that have earned more than $500,000 before their first year of existence by doing this:\n\nImproving the effectiveness of surgeons.\n\nCustomer service in banks.\n\nTutoring for students.\n\nLegal recommendations.\n\nStock analysis.\n\nDelivering physi

**Option 3: Beautiful Soup**

In [None]:
#!pip install beautifulsoup4

In [24]:
from langchain_community.document_loaders import WebBaseLoader

In [25]:
# Load the consulting page with WebBaseLoader and display the result
# as the cell output.
loader = WebBaseLoader(
    web_path="https://aiaccelera.com/ai-consulting-for-businesses/",
)
data = loader.load()
data

[Document(page_content="\n\n\n\n\n\n\n\n\n\nAI Consulting for businesses\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n                                    info@aiaccelera.com                                \n\n\n\n\n\nSpanish:\n  \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n0\n\n\n\n\n\n\n\n\nServices  \n\nLearn to create LLM Applications\nDevelopment of Enterprise LLM Apps\nDevelopment of LLM Apps for Startups\nAI Consulting for Businesses\nIn-Company AI Training\nBuilding your Team of AI Professionals\nWe are looking for Entrepreneurs\nAI Sandbox: Developers + Entrepreneurs\n\n\nUse Cases\nResources\nCommunity  \n\nManagement Team\nAI Hall of Fame\nAI Experts Council\nAI Investors Council\n\n\nPartners\nJobs\nContact\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n \n\n\n\nAI Consulting for Businesses\n\nHomeAI Consulting for Businesses \n\n\n \n\n\n\n\n\n\n\n\n\n\n\nAI Consulting for Businesses \n\n\n\n\n\n\n