In [16]:
from dotenv import load_dotenv,find_dotenv
import os
import sys
import certifi
from langchain import HuggingFaceHub, PromptTemplate
from langchain.chains import LLMChain, SimpleSequentialChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS


In [2]:
load_dotenv(find_dotenv())

True

In [3]:
# repo_id = "meta-llama/Llama-2-7b"
repo_id = "distilgpt2" # use as fast, in the future you can sub out for another model

In [4]:
# bypass VPN to connect to hugging face hub
os.environ['CURL_CA_BUNDLE'] = ''

llm = HuggingFaceHub(repo_id=repo_id)
llm("explain large language models in one sentence")



' [20] in which the word is defined and can be substituted for another [20] in which the word is defined and can be substituted for another [20] in which the word is defined and can be'

In [5]:
template = """
You are an expert data scientist with an expertise in building deep learning models. 
Explain the concept of {concept} in a couple of lines
"""

prompt = PromptTemplate(
    input_variables=["concept"],
    template=template,
)

llm(prompt.format(concept="autoencoder"))



'We can understand what data is expected to turn into, or actually, what type of'

In [6]:
chain = LLMChain(llm=llm, prompt=prompt)

# Run the chain only specifying the input variable.
print(chain.run("autoencoder"))



We can understand what data is expected to turn into, or actually, what type of


In [7]:
second_prompt = PromptTemplate(
    input_variables=["ml_concept"],
    template="Turn the concept description of {ml_concept} and explain it to me like I'm five in 500 words",
)
chain_two = LLMChain(llm=llm, prompt=second_prompt)

In [8]:
overall_chain = SimpleSequentialChain(chains=[chain, chain_two], verbose=True)

# Run the chain specifying only the input variable for the first chain.
explanation = overall_chain.run("autoencoder")
print(explanation)



[1m> Entering new SimpleSequentialChain chain...[0m




[36;1m[1;3mWe can understand what data is expected to turn into, or actually, what type of[0m
[33;1m[1;3m. It's going to be a big challenge of my career to understand and understand[0m

[1m> Finished chain.[0m
. It's going to be a big challenge of my career to understand and understand




In [9]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 100,
    chunk_overlap  = 0,
)

texts = text_splitter.create_documents([explanation])
print(texts[0].page_content)

. It's going to be a big challenge of my career to understand and understand


In [10]:
embeddings = HuggingFaceEmbeddings()

query_result = embeddings.embed_query(texts[0].page_content)
print(query_result)

[0.035429179668426514, -0.011283202096819878, -0.022511638700962067, -0.018519684672355652, -0.015426118858158588, 0.04833979159593582, -0.027479685842990875, 3.43602332577575e-05, -0.034070756286382675, 0.04169641062617302, 0.046859100461006165, -0.05319862440228462, 0.00027740269433707, 0.028727559372782707, 0.03073274902999401, -0.04137694835662842, -0.007983139716088772, -0.01069630403071642, -5.012617111788131e-05, -0.025005828589200974, -0.01904699206352234, 0.03595907986164093, -0.024159707129001617, -0.0034354168456047773, -0.033186305314302444, -0.07761255651712418, 0.04277373477816582, 0.0424821712076664, -0.043091438710689545, -0.038935527205467224, -0.009206104092299938, 0.0026595850940793753, 0.03943780064582825, 0.1217031180858612, 2.2645731405646075e-06, -0.03672739118337631, -0.05807163938879967, -0.026672987267374992, -0.09400948882102966, 0.027830012142658234, 0.021247291937470436, -0.000914651551283896, -0.038085851818323135, -0.012755926698446274, 0.0026833356823772

In [18]:
db = FAISS.from_documents(texts, embeddings)

In [23]:
db.save_local("faiss_index")

In [19]:
query = "What is magical about an autoencoder?"
result = db.similarity_search(query)

print(result)

[Document(page_content=". It's going to be a big challenge of my career to understand and understand", metadata={})]
