In [1]:
import os,re
import yt_dlp
import json
import time
import math 
import httplib2
import requests
import pinecone 
import pandas as pd
import numpy as np
import urllib.request
from bs4 import BeautifulSoup
from bs4.element import Comment
from youtubesearchpython import *
from langchain.llms import OpenAIChat
from bs4 import BeautifulSoup, SoupStrainer
from langchain.vectorstores import Pinecone
from langchain.chains import VectorDBQAWithSourcesChain
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

  from tqdm.autonotebook import tqdm


## Lex GPT

`Here, we will prepare the VectorDB index for the Lex Fridman podcast:`

* Scrape source data from: https://karpathy.ai/lexicap/
* Use Whisper to transcribe episodes that Karpathy has not already done
* Chunk data
* Embed it to Pinecone
* Test VectorDBQA chain on it 
* The app (https://lex-gpt.vercel.app/) will read from the same Pinecone DB
 
`1. Get video urls -` 

In [None]:
# Videos
channel_id = "UCSHZKyawb77ixDdsGog4iWA" # Get ID from ChannelsSearch
playlist = Playlist(playlist_from_channel_id(channel_id))

# Episode data: one row per video whose title ends in "#<episode number>".
# Rows are collected in a dict keyed by title and turned into a DataFrame once,
# so the frame is indexed by title exactly as before.
episode_rows = {}
for v in playlist.videos:
    try:
        video_title = v['title']
        ep_number = int(video_title.split("|")[-1].split("#")[-1])
        video_link = v['link']
    except (KeyError, TypeError, AttributeError, ValueError):
        # Title/link missing, or the title does not end in "#<number>": skip the video.
        print("Failed on %s" % (v.get('title') if isinstance(v, dict) else v))
        continue

    # The download cell reads an 'img' column to save the episode thumbnail.
    # NOTE(review): assumes each video dict carries a 'thumbnails' list of
    # {'url': ...} entries, taking the last entry -- confirm against the
    # youtube-search-python result format.
    try:
        img_url = v['thumbnails'][-1]['url']
    except (KeyError, IndexError, TypeError):
        img_url = None

    episode_rows[video_title] = {
        'number': ep_number,
        'link': video_link,
        'title': video_title,
        'img': img_url,
    }

stor_metadata = pd.DataFrame.from_dict(
    episode_rows, orient='index', columns=['number', 'link', 'title', 'img']
)

# Filter for newer videos (Karpathy transcribed 1-325)
new_ep = stor_metadata[stor_metadata.number > 325]

`2. Get audio -` 

In [None]:
# Make sure every directory written to below exists.
for out_dir in ("img", "audio", "audio_transcription"):
    os.makedirs(out_dir, exist_ok=True)

# Iterate through episodes
for ix in new_ep.index:

    ep_number=int(new_ep.loc[ix,'number'])
    print("EPISODE: %s"%ep_number)
    ep_link=new_ep.loc[ix,'link']

    # Write img. The thumbnail is fetched *before* the file is opened so that a
    # failed request does not leave an empty .jpg behind, and a missing 'img'
    # value skips the thumbnail instead of aborting the audio download.
    img_url = new_ep.loc[ix,'img'] if 'img' in new_ep.columns else None
    if isinstance(img_url, str) and img_url:
        response = requests.get(img_url)
        if response.ok:
            with open("img/%s.jpg"%str(ep_number), 'wb') as f:
                f.write(response.content)
        else:
            print("Thumbnail request failed (%s) for episode %s" % (response.status_code, ep_number))
    else:
        print("No thumbnail URL for episode %s" % ep_number)

    # Write audio
    ydl_opts = {
        'format': 'm4a/bestaudio/best',
        'outtmpl': 'audio/%s.m4a'%str(ep_number),
        'noplaylist': True,
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'm4a',
        }],
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        # download() takes a list of URLs.
        error_code = ydl.download([ep_link])
    if error_code:
        print("yt_dlp returned error code %s for episode %s" % (error_code, ep_number))

# Persist the episode table; the "Get newer transcripts" cell reads it back.
new_ep.reset_index().to_csv("audio_transcription/episodes.csv")

`3. Run Whisper -`
 
* On GPU, ideally: 10-20 min / video on 2080Ti with `medium` model

In [None]:
# Shell escape: runs the external script run_whisper.py with `python`.
# NOTE(review): the script is not shown in this notebook; presumably it
# transcribes the downloaded audio into audio_transcription/<id>.txt -- confirm.
! python run_whisper.py

`4. Scrape Karpathy transcriptions -`

In [None]:
# Get text -
def tag_visible(element):
    """Return True when a text node should count as visible page text.

    A node is hidden when its parent tag is one of the non-rendered
    containers listed below, or when the node itself is an HTML comment.
    """
    hidden_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')
    return element.parent.name not in hidden_parents and not isinstance(element, Comment)

def text_from_html(body):
    """Extract the visible text of an HTML document as one string.

    Args:
        body: HTML markup (str or bytes) to parse with the 'html.parser' backend.

    Returns:
        All text nodes accepted by ``tag_visible``, each stripped of
        surrounding whitespace and joined with single spaces.
    """
    soup = BeautifulSoup(body, 'html.parser')
    # find_all is the current spelling (also used by get_img); findAll is a legacy alias.
    text_nodes = soup.find_all(string=True)
    return " ".join(node.strip() for node in text_nodes if tag_visible(node))

def get_text_and_title(url):
    """Download a page and return its visible text and the episode title.

    Args:
        url: Address of the transcript page to fetch.

    Returns:
        Tuple ``(text, title)``: ``text`` is the output of ``text_from_html``;
        ``title`` is whatever sits between the marker "back to index" and the
        first "|" in that text, stripped.

    Raises:
        IndexError: if the text before the first "|" does not contain
            "back to index".
    """
    # Use the response as a context manager so the connection is closed
    # promptly instead of being left for the garbage collector.
    with urllib.request.urlopen(url) as response:
        html = response.read()
    t = text_from_html(html)
    title = t.split("|")[0].split("back to index")[1].strip()
    return t, title

# Get links -
def get_links(URL):
    """Fetch ``URL`` and return the href of every anchor that contains "https".

    Only ``<a>`` tags are parsed; anchors without an ``href`` attribute are
    ignored. Document order is preserved.
    """
    _, page = httplib2.Http().request(URL)
    anchors = BeautifulSoup(page, 'html.parser', parse_only=SoupStrainer('a'))
    hrefs = [anchor['href'] for anchor in anchors if anchor.has_attr('href')]
    return [href for href in hrefs if "https" in href]

# Get image -
def get_img(URL,title,episode_id):
    """Download the image(s) referenced by a page into ``../public/0<episode_id>.jpg``.

    Every ``<img>`` with a ``src`` is fetched and written to the same path,
    so when a page has several images the last one is what remains on disk.

    Args:
        URL: Page whose ``<img>`` tags are scanned.
        title: Unused; kept so existing callers keep working.
        episode_id: Identifier interpolated into the output file name.

    Returns:
        The output path. It is returned even when the page has no image, in
        which case no file is written.
    """
    from urllib.parse import urljoin  # local import keeps this cell self-contained

    # Defined up front so the return value exists even for pages without images.
    imgpath = "../public/0%s.jpg" % episode_id

    page = requests.get(URL)
    soup = BeautifulSoup(page.text, 'html.parser')
    for img in soup.find_all('img'):
        src = img.get('src')
        if not src:
            continue  # <img> without a usable src: nothing to download
        # Resolve relative sources against the page URL *before* requesting
        # them, and fetch each image only once.
        response = requests.get(urljoin(URL, src))
        with open(imgpath, 'wb') as f:
            f.write(response.content)
    return imgpath

# Full pipeline - 
def pre_process(URL,episode_id):
    """Scrape one transcript page and return its cleaned text, links and title.

    The page text is cut at every "link |" marker, each piece is reduced to
    ASCII letters and spaces, and the pieces are re-joined with single spaces.
    The page image is saved as a side effect of ``get_img``.

    Returns:
        Tuple ``(all_text, links, title)``.
    """
    page_text, title = get_text_and_title(URL)
    links = get_links(URL)
    img = get_img(URL, title, episode_id)  # called for its side effect; the path is not used here

    raw_chunks = page_text.split("link |")
    stor_chunk = pd.DataFrame({'chunks': raw_chunks})
    stor_chunk['clean_chunks'] = [
        re.sub(r"[^a-zA-Z ]+", '', piece).strip() for piece in raw_chunks
    ]
    # Column assignment requires one link per chunk; pandas raises on a length mismatch.
    stor_chunk['links'] = links

    all_text = ' '.join(stor_chunk['clean_chunks'])
    return all_text, links, title

# Make splits - 
def make_splits(chunks,URL):
    """Split one transcript page into text chunks with per-chunk metadata.

    Args:
        chunks: Chunk size handed to ``RecursiveCharacterTextSplitter``.
        URL: Transcript page address; the episode id is the part of its last
            path segment before the first "-".

    Returns:
        Tuple ``(splits, metadatas, title, episode_id)`` where ``metadatas``
        holds one dict per split.
    """
    # ID
    episode_id = URL.split("/")[-1].split("-")[0]

    # Pre-processing
    texts, links, title = pre_process(URL, episode_id)

    # Splits
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunks, chunk_overlap=50)
    text_chunks = splitter.split_text(texts)
    print(len(text_chunks))

    # Metadata: pick len(text_chunks) evenly spaced link positions so that
    # every split is paired with a link.
    positions = np.linspace(0, len(links)-1, len(text_chunks), dtype=int)
    sampled_links = [links[pos] for pos in positions]
    # Here we can add "link", "title", etc that can be fetched in the app
    metadatas = []
    for link in sampled_links:
        metadatas.append({
            "source": title + " " + link,
            "id": episode_id,
            "link": link,
            "title": title,
        })
    print(len(metadatas))
    return text_chunks, metadatas, title, episode_id

In [None]:
# Get all pages: collect every href on the index page, then keep the ones
# containing "0" and turn them into absolute transcript URLs.
http = httplib2.Http()
status, response = http.request("https://karpathy.ai/lexicap/")
anchors = BeautifulSoup(response, 'html.parser', parse_only=SoupStrainer('a'))
links = [anchor['href'] for anchor in anchors if anchor.has_attr('href')]
links_tx = ["https://karpathy.ai/lexicap/" + href for href in links if "0" in href]

In [None]:
# *** Chunk size: key parameter ***
chunks = 1500
# *** Chunk size: key parameter ***
splits_scrape = [ ]
metadatas_scrape = [ ]

# The loop writes one file per episode into these directories.
os.makedirs('docs', exist_ok=True)
os.makedirs('metadatas', exist_ok=True)

# Iterate
stor=pd.DataFrame()
for page in links_tx:
    try:
        print("Writing: %s"%page)
        # Make splits
        splits,metadatas,title,episode_id=make_splits(chunks,page)
        stor.loc[episode_id,'title']=title
        # `with` closes each file on exit, so no explicit close() is needed.
        with open('docs/%s.txt'%episode_id, "w") as f:
            for string in splits:
                f.write(string + "\n")
        with open('metadatas/%s.json'%episode_id, "w") as f:
            json.dump(metadatas, f)
        splits_scrape.append(splits)
        metadatas_scrape.append(metadatas)
    except Exception as err:
        # Best effort: skip the page but say why. Catching Exception (not a bare
        # except) lets KeyboardInterrupt still stop the loop.
        print("Error on page: %s"%page)
        print("  %s: %s" % (type(err).__name__, err))

`5. Get newer transcripts -`

In [None]:
# *** Chunk size: key parameter ***
chunks = 1500
# *** Chunk size: key parameter ***
splits_new = [ ]
metadatas_new = [ ]

# Read the csv file
new_ep = pd.read_csv("audio_transcription/episodes.csv", index_col=None)

for ix in new_ep.index:

    # Get data
    title = new_ep.loc[ix, 'title']
    ep_number = int(new_ep.loc[ix, 'number'])

    # Consistency w/ convention used in Karpathy transcription
    episode_id = "0" + str(ep_number)
    file_path = 'audio_transcription/%s.txt' % str(episode_id)

    # Tab-separated transcript without a header row; name the three columns.
    transcript = pd.read_csv(file_path, sep='\t', header=None)
    transcript.columns = ['links', 'time', 'chunks']

    # Clean text chunks
    transcript['clean_chunks'] = transcript['chunks'].astype(str).str.strip()
    links = list(transcript['links'])
    texts = ' '.join(transcript['clean_chunks'])

    # Splits
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunks, chunk_overlap=50)
    splits = text_splitter.split_text(texts)
    print(len(splits))

    # Metadata: one evenly spaced link position per split
    N = len(splits)
    bins = np.linspace(0, len(links)-1, N, dtype=int)

    # Here we can add "link", "title", etc that can be fetched in the app
    metadatas = []
    for pos in bins:
        link = links[pos]
        metadatas.append({
            "source": title + " " + link,
            "id": episode_id,
            "link": link,
            "title": title,
        })
    print(len(metadatas))

    # Append to output
    splits_new.append(splits)
    metadatas_new.append(metadatas)

`6. Assemble final list -`

In [None]:
# Join the list of lists.
# splits_scrape / splits_new hold one list of chunks per episode (same for the
# metadata), so flatten them into one list of strings and one list of dicts.
# Iterating over `[a + b]` would only loop once over the concatenated
# list and leave the per-episode lists nested.
splits_all = [
    chunk
    for episode_splits in splits_scrape + splits_new
    for chunk in episode_splits
]
metadatas_all = [
    metadata
    for episode_metadatas in metadatas_scrape + metadatas_new
    for metadata in episode_metadatas
]
assert len(splits_all) == len(metadatas_all), "every text chunk needs exactly one metadata dict"

`7. Embed full dataset in Pinecone VectorDB -`

In [None]:
# Pinecone
pinecone.init(
    api_key=os.environ.get('PINECONE_API_KEY'),
    environment="us-east1-gcp"
)
# Write to the same index that the "Read in VectorDB for testing" cell opens;
# this cell previously targeted "tax-gpt", which that cell never reads.
# NOTE(review): confirm "lex-gpt-new" is the intended target index.
index_name = "lex-gpt-new"
embeddings = OpenAIEmbeddings()

# Initialize with small set of data
p = Pinecone.from_texts(splits_all[0:2],
                        embeddings,
                        index_name=index_name,
                        metadatas=metadatas_all[0:2])

In [None]:
# Add data in chunk to avoid data ingest errors.
# Uses splits_all / metadatas_all from the "Assemble final list" cell; the
# names splits_combined / metadatas_combined are not defined in this notebook.
# NOTE(review): the first two items were already sent by Pinecone.from_texts
# in the previous cell, so they may be stored twice -- confirm whether that matters.
chunk_size = 100
last_chunk = 0  # set to the last printed chunk number to resume an interrupted run
num_chunks = math.ceil(len(splits_all) / chunk_size)
for i in range(last_chunk,num_chunks):

    print(i)
    start_time = time.time()
    start_idx = i * chunk_size
    end_idx = min(start_idx + chunk_size, len(splits_all))

    # Extract the current chunk
    current_splits = splits_all[start_idx:end_idx]
    current_metadatas = metadatas_all[start_idx:end_idx]

    # Add the current chunk to the vector database
    p.add_texts(texts = current_splits, metadatas=current_metadatas)
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Elapsed time: {elapsed_time} seconds")
    print("--------")

`8. Read in VectorDB for testing` 

In [None]:
# Pinecone: connect, then open the existing index as a vector store.
pinecone.init(
    environment="us-east1-gcp",
    api_key=os.getenv('PINECONE_API_KEY'),
)
index_name = "lex-gpt-new"
embeddings = OpenAIEmbeddings()
p = Pinecone.from_existing_index(embedding=embeddings, index_name=index_name)

`9. Run VectorDBQAWithSourcesChain`

In [None]:
def run_vectordb_sources_chain(llm,query,docstore):
    """Answer ``query`` with a "stuff" VectorDBQAWithSourcesChain and print the result.

    Prints the answer, then the sources, then the wall-clock time the call
    took, then a separator line. Nothing is returned.

    Args:
        llm: Language model handed to the chain.
        query: Question to ask.
        docstore: Vector store the chain retrieves from.
    """
    started = time.time()
    qa_chain = VectorDBQAWithSourcesChain.from_chain_type(
        llm, chain_type="stuff", vectorstore=docstore
    )
    result = qa_chain({"question": query}, return_only_outputs=True)
    for field in ("answer", "sources"):
        print(result[field])
    elapsed_time = time.time() - started
    print(f"Elapsed time: {elapsed_time} seconds")
    print("--------")

# Smoke test: ask one question against the index opened above.
llm = OpenAIChat(temperature=0)
q = "What is the future path for AGI?"
run_vectordb_sources_chain(llm=llm, query=q, docstore=p)
