In [2]:
import os
import re
import requests
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
import plotly.graph_objs as go
import numpy as np
import pandas as pd
from dash import Dash, dcc, html, Input, Output, State
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import euclidean_distances

In [3]:
url = "https://www.paulgraham.com/worked.html"
response = requests.get(url)

In [15]:
essay_text = re.search(r'<font.*?>(.*?)</font>', response.text, re.DOTALL).group(1)
essay_text = re.sub(r'<.*?>', '', essay_text)

In [16]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=100)
chunks = text_splitter.split_text(essay_text)

In [17]:
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_texts(chunks, embeddings)

In [18]:
vectorstore.save_local("faiss_index")

In [19]:
llm = OpenAI(temperature=0)
qa_chain = RetrievalQA.from_chain_type(llm, retriever=vectorstore.as_retriever())

all_embeddings = vectorstore.embedding_function.embed_documents(chunks)

  warn_deprecated(


In [20]:
pca = PCA(n_components=2)
all_embeddings_2d = pca.fit_transform(all_embeddings)

In [25]:
import pickle

In [26]:
all_embeddings

[[0.0029853239405316485,
  0.007341928245316849,
  0.005255482443420023,
  -0.016625958461539556,
  -0.005590101074617716,
  0.010215712331675057,
  -0.03902572976736575,
  -0.01498567021369124,
  -0.00920529527018942,
  -0.018922359773352902,
  0.035640174585388235,
  0.043146132028018966,
  0.002222590035667348,
  -0.0031001441458207092,
  0.005681956959452178,
  0.0065381875067853906,
  0.032805757506385355,
  -0.013778418115428945,
  0.0005761511003734082,
  -0.01150825961254057,
  -0.003664402954765869,
  -0.005773813309947951,
  -0.00453703642183047,
  -0.006229813081496605,
  -0.0003381043389521099,
  -0.001992949625089226,
  0.019972144773516495,
  -0.026953210647388055,
  0.025063598019622636,
  -0.006994187278334044,
  0.018174388496246852,
  -0.02662515225276029,
  0.002496518096689562,
  -0.02705818933366894,
  -0.021179395940456164,
  -0.02323959893342802,
  -0.021835512729711692,
  0.00020964929394782857,
  0.0022964029416279387,
  -0.016507857439473562,
  0.0149463032063

In [33]:
with open("chunks.pickle", "wb") as f:
    pickle.dump(chunks, f)


In [34]:
with open("chunks.pickle", "rb") as f:
    chunks1 = pickle.load(f)


In [35]:
chunks1

["February 2021Before college the two main things I worked on, outside of school,\nwere writing and programming. I didn't write essays. I wrote what\nbeginning writers were supposed to write then, and probably still\nare: short stories. My stories were awful. They had hardly any plot,",
 'are: short stories. My stories were awful. They had hardly any plot,\njust characters with strong feelings, which I imagined made them\ndeep.The first programs I tried writing were on the IBM 1401 that our\nschool district used for what was then called "data processing."',
 'school district used for what was then called "data processing."\nThis was in 9th grade, so I was 13 or 14. The school district\'s\n1401 happened to be in the basement of our junior high school, and\nmy friend Rich Draves and I got permission to use it. It was like',
 "my friend Rich Draves and I got permission to use it. It was like\na mini Bond villain's lair down there, with all these alien-looking\nmachines \x97 CPU, disk driv

In [36]:
chunks

["February 2021Before college the two main things I worked on, outside of school,\nwere writing and programming. I didn't write essays. I wrote what\nbeginning writers were supposed to write then, and probably still\nare: short stories. My stories were awful. They had hardly any plot,",
 'are: short stories. My stories were awful. They had hardly any plot,\njust characters with strong feelings, which I imagined made them\ndeep.The first programs I tried writing were on the IBM 1401 that our\nschool district used for what was then called "data processing."',
 'school district used for what was then called "data processing."\nThis was in 9th grade, so I was 13 or 14. The school district\'s\n1401 happened to be in the basement of our junior high school, and\nmy friend Rich Draves and I got permission to use it. It was like',
 "my friend Rich Draves and I got permission to use it. It was like\na mini Bond villain's lair down there, with all these alien-looking\nmachines \x97 CPU, disk driv

In [37]:
question = "what did paul worked on when he was young?"
answer = qa_chain.run(question)
        
docs = vectorstore.similarity_search(question, k=3)
doc_texts = [doc.page_content for doc in docs]

question_embedding = embeddings.embed_documents([question])[0]
answer_embedding = embeddings.embed_documents([answer])[0]

qa_embeddings_2d = pca.transform([question_embedding, answer_embedding])

distances = euclidean_distances([question_embedding], all_embeddings)[0]
print(distances)

  warn_deprecated(


[0.65407089 0.7162729  0.69283558 0.72391553 0.71764676 0.75430794
 0.75770951 0.73771951 0.7560863  0.78855169 0.75260172]


In [44]:
all_embeddings_2d

array([[ 0.39521756, -0.10991936],
       [ 0.24340232, -0.06134303],
       [ 0.16895833,  0.25359957],
       [-0.0075801 ,  0.32382845],
       [-0.08740072,  0.12179391],
       [-0.16809493, -0.15183698],
       [-0.12854557, -0.13961104],
       [-0.01013657, -0.2172082 ],
       [-0.07823244, -0.20048703],
       [-0.13274349,  0.04141237],
       [-0.19484438,  0.13977132]])

In [40]:
max_distance = np.max(distances)
normalized_sizes = 1 - (distances / max_distance)
point_sizes = normalized_sizes * 18 + 7

In [41]:
df = pd.DataFrame(all_embeddings_2d, columns=['x', 'y'])
df['type'] = 'Corpus'
df['size'] = point_sizes
df['text'] = chunks

In [43]:
df.head()

Unnamed: 0,x,y,type,size,text
0,0.395218,-0.109919,Corpus,10.069747,February 2021Before college the two main thing...
1,0.243402,-0.061343,Corpus,8.649883,are: short stories. My stories were awful. The...
2,0.168958,0.2536,Corpus,9.184879,school district used for what was then called ...
3,-0.00758,0.323828,Corpus,8.475427,my friend Rich Draves and I got permission to ...
4,-0.087401,0.121794,Corpus,8.618523,"machines  CPU, disk drives, printer, card rea..."


In [45]:
qa_df = pd.DataFrame(qa_embeddings_2d, columns=['x', 'y'])
qa_df['type'] = ['Question', 'Answer']
qa_df['size'] = 10
qa_df['text'] = [question, answer]
df = pd.concat([df, qa_df], ignore_index=True)

In [47]:
df.head(20)

Unnamed: 0,x,y,type,size,text
0,0.395218,-0.109919,Corpus,10.069747,February 2021Before college the two main thing...
1,0.243402,-0.061343,Corpus,8.649883,are: short stories. My stories were awful. The...
2,0.168958,0.2536,Corpus,9.184879,school district used for what was then called ...
3,-0.00758,0.323828,Corpus,8.475427,my friend Rich Draves and I got permission to ...
4,-0.087401,0.121794,Corpus,8.618523,"machines  CPU, disk drives, printer, card rea..."
5,-0.168095,-0.151837,Corpus,7.78167,"type programs on punch cards, then stack them ..."
6,-0.128546,-0.139611,Corpus,7.704024,loud printer.I was puzzled by the 1401. I coul...
7,-0.010137,-0.217208,Corpus,8.160329,and I didn't have any data stored on punched c...
8,-0.078232,-0.200487,Corpus,7.741076,interesting of that type. So I'm not surprised...
9,-0.132743,0.041412,Corpus,7.0,"to terminate, when one of mine didn't. On a ma..."


In [51]:
retrieved_indices = [chunks.index(doc) for doc in doc_texts]
df.loc[retrieved_indices, 'type'] = 'Retrieved'

2