In [None]:
!pip install --upgrade langchain

# Use Gemini to explain: Langchain is a framework that is used to develop applications powered
#  by language models. You are using it in your code to load, index, and query your data.

Collecting langchain
  Downloading langchain-0.2.7-py3-none-any.whl (983 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m983.6/983.6 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Collecting langchain-core<0.3.0,>=0.2.12 (from langchain)
  Downloading langchain_core-0.2.12-py3-none-any.whl (355 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m355.8/355.8 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-text-splitters<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_text_splitters-0.2.2-py3-none-any.whl (25 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.84-py3-none-any.whl (127 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
Collecting jsonpatch<2.0,>=1.33 (from langchain-core<0.3.0,>=0.2.12->langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)
Collecting orjson<4.0.0,>=3.9.14 (from langsm

In [None]:
!pip install -U langchain-community
# The -U flag stands for upgrade: upgrade all packages to the newest available version.
# langchain: The core library with fundamental components for building LLM applications.
# langchain-community: Offers a wider range of integrations with external services,
#   additional tools, and experimental features that might not be present
#   in the core langchain library.

Collecting langchain-community
  Downloading langchain_community-0.2.7-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl (28 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.21.3-py3-none-any.whl (49 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.2/49.2 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl (8.8 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading mypy_extensions-1.0.0-py3-none-any.whl (4.7 kB)
Installing collected packages: mypy-extensi

In [None]:
from pathlib import Path

# CSVLoader is provided by langchain-community; importing it through the top-level
# `langchain` package is a deprecated path.
from langchain_community.document_loaders import CSVLoader

# The catalogue must be uploaded to the Colab session storage (Files panel) before
# this cell is run.
file = 'OutdoorClothingCatalog_1000.csv'

# The loader does not read the file when it is constructed, so a missing upload would
# otherwise only surface later, when the index is built. Fail here with an actionable
# message instead.
if not Path(file).is_file():
    raise FileNotFoundError(
        f"{file!r} was not found - upload it to the Colab session storage and re-run this cell."
    )

loader = CSVLoader(file_path=file)

In [None]:
!pip install -U sentence-transformers
# The sentence-transformers library provides access to a wide range
#  of pre-trained models specifically designed for generating high-quality sentence
#  and text *embeddings*.


Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/227.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m225.3/227.1 kB[0m [31m7.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)


In [None]:
from langchain.indexes import VectorstoreIndexCreator

# The vector store and embedding integrations are provided by langchain-community;
# importing them through the top-level `langchain` package is a deprecated path.
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import DocArrayInMemorySearch

# HuggingFaceEmbeddings is the bridge between LangChain and sentence-transformers
#  models hosted on the Hugging Face Hub.
# Hugging Face is a platform that provides access to a large collection of pre-trained
#  Transformer models for natural language processing (NLP) tasks such as text
#  classification, translation, and question answering.

# MPNet stands for "Masked and Permuted Pre-training for Language Understanding."
#  It is a Transformer-based architecture that builds on BERT (Bidirectional Encoder
#  Representations from Transformers) with a different pre-training objective.
model_name = "sentence-transformers/all-mpnet-base-v2"

# The model files are downloaded from the Hugging Face Hub the first time this runs.
embeddings = HuggingFaceEmbeddings(model_name=model_name)

  warn_deprecated(
  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Sanity-check the embedding model on a tiny input.
embed = embeddings.embed_query("Hi")

# Dimensionality of the embedding vector (768 in the recorded run).
print(len(embed))

# Show only a short preview: printing every component floods the cell output
# without adding information.
print(embed[:5])

768
[0.033948883414268494, -0.005614397581666708, -0.001218400546349585, 0.015974322333931923, -0.006237417925149202, 0.025682589039206505, -0.0007563922554254532, 0.033906493335962296, 0.04990058392286301, -0.018824921920895576, 0.00296740117482841, -0.005409399513155222, 0.02039126493036747, 0.07431501895189285, 0.013054339215159416, -0.09537391364574432, 0.022820504382252693, -0.01844939962029457, -0.06375963985919952, 0.015191473998129368, -0.015690285712480545, -0.0029217947740107775, 0.00469121104106307, 0.06263818591833115, 0.006039759609848261, -0.012256433255970478, 0.006177693605422974, 0.012311216443777084, 0.00366548215970397, 0.005191429052501917, 0.016769660636782646, 0.007998771965503693, 0.008181723766028881, 0.007717805448919535, 2.122589648934081e-06, -0.012504254467785358, -0.0336025096476078, 0.02028297819197178, 0.021044252440333366, -0.034646064043045044, -0.043434057384729385, 0.0022460618056356907, -0.011851313523948193, 0.04648536071181297, -0.0232009869068861,

In [None]:
!pip install docarray
# Gemini: DocArray is a Python library that allows you to efficiently process and store
#   multi-modal data, such as text, images, audio, and video.
#   It is used by Langchain to store and manage the data you are working with.

Collecting docarray
  Downloading docarray-0.40.0-py3-none-any.whl (270 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.2/270.2 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Collecting types-requests>=2.28.11.6 (from docarray)
  Downloading types_requests-2.32.0.20240622-py3-none-any.whl (15 kB)
Installing collected packages: types-requests, docarray
Successfully installed docarray-0.40.0 types-requests-2.32.0.20240622


In [None]:
# Build the vector-store index: a structure for storing and searching embeddings
#  (numerical representations of text). The creator is configured with the embedding
#  model and the in-memory DocArray store, then fed the CSV loader.
index_creator = VectorstoreIndexCreator(
    embedding=embeddings,
    vectorstore_cls=DocArrayInMemorySearch,
)
index = index_creator.from_loaders([loader])



In [None]:
!pip install transformers
# transformer: the library provides APIs to easily download and use state-of-the-art pre-trained models
#   from Hugging Face.



In [None]:
# HuggingFacePipeline is provided by langchain-community; importing it through
# `langchain.llms` is a deprecated path.
from langchain_community.llms import HuggingFacePipeline
from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline

In [None]:
# "GPT-2" refers to a generative pre-trained Transformer model developed by OpenAI.
#   It's a free large language model with 1.5 billion parameters,
#   trained on a massive dataset of text and code.
MODEL_NAME = 'gpt2'
model = GPT2LMHeadModel.from_pretrained(MODEL_NAME)
tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=256,
    temperature=0.1,
    top_p=0.95,
    repetition_penalty=1.15
)

llm = HuggingFacePipeline(pipeline=pipe)
# Gemini: In your code, you're using HuggingFacePipeline to wrap a text generation pipeline based on
#  the GPT-2 model. This enables you to generate text responses within your LangChain
#  application by leveraging the capabilities of the GPT-2 model from Hugging Face.

# Now we can combines the power of the Vectorstore Index
#   and a language model to generate insightful responses to user queries.
query = "Please suggest a shirt with sunblocking"
response = index.query(query, llm=llm)
print(response)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

: 255
name: Sun Shield Shirt by
description: "Block the sun, not the fun – our high-performance sun shirt is guaranteed to protect from harmful UV rays. 

Size & Fit: Slightly Fitted: Softly shapes the body. Falls at hip.

Fabric & Care: 78% nylon, 22% Lycra Xtra Life fiber. UPF 50+ rated – the highest rated sun protection possible. Handwash, line dry.

Additional Features: Wicks moisture for quick-drying comfort. Fits comfortably over your favorite swimsuit. Abrasion resistant for season after season of wear. Imported.

Sun Protection That Won't Wear Off
Our high-performance fabric provides SPF 50+ sun protection, blocking 98% of the sun's harmful rays. This fabric is recommended by The Skin Cancer Foundation as an effective UV protectant.

: 679
name: Women's Tropical Tee, Sleeveless
description: Our five-star sleeveless b

In [None]:
# Ask for a Markdown table that summarizes every sun-protection shirt.
query = (
    "Please list all your shirts with sun protection "
    "in a table in markdown and summarize each one."
)

response = index.query(query, llm=llm)
print(response)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

: 679
name: Women's Tropical Tee, Sleeveless
description: Our five-star sleeveless button-up shirt has a fit to flatter and SunSmart™ protection to block the sun’s harmful UV rays. Size & Fit: Slightly Fitted: Softly shapes the body. Falls at hip. Fabric & Care: Shell: 71% nylon, 29% polyester. Cape lining: 100% polyester. Built-in SunSmart™ UPF 50+ rated – the highest rated sun protection possible. Machine wash and dry. Additional Features: Updated design with smoother buttons. Wrinkle resistant. Low-profile pockets and side shaping offer a more flattering fit. Front and back cape venting. Two front pockets, tool tabs and eyewear loop. Imported. Sun Protection That Won't Wear Off: Our high-performance fabric provides SPF 50+ sun protection, blocking 98% of the sun's harmful rays.

: 255
name: Sun Shield Shirt by
description

In [None]:
from IPython.display import display, Markdown

In [None]:
# Render the latest response as Markdown instead of plain text.
rendered_response = Markdown(response)
display(rendered_response)


Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

: 679
name: Women's Tropical Tee, Sleeveless
description: Our five-star sleeveless button-up shirt has a fit to flatter and SunSmart™ protection to block the sun’s harmful UV rays. Size & Fit: Slightly Fitted: Softly shapes the body. Falls at hip. Fabric & Care: Shell: 71% nylon, 29% polyester. Cape lining: 100% polyester. Built-in SunSmart™ UPF 50+ rated – the highest rated sun protection possible. Machine wash and dry. Additional Features: Updated design with smoother buttons. Wrinkle resistant. Low-profile pockets and side shaping offer a more flattering fit. Front and back cape venting. Two front pockets, tool tabs and eyewear loop. Imported. Sun Protection That Won't Wear Off: Our high-performance fabric provides SPF 50+ sun protection, blocking 98% of the sun's harmful rays.

: 255
name: Sun Shield Shirt by
description: "Block the sun, not the fun – our high-performance sun shirt is guaranteed to protect from harmful UV rays. 

Size & Fit: Slightly Fitted: Softly shapes the body. Falls at hip.

Fabric & Care: 78% nylon, 22% Lycra Xtra Life fiber. UPF 50+ rated – the highest rated sun protection possible. Handwash, line dry.

Additional Features: Wicks moisture for quick-drying comfort. Fits comfortably over your favorite swimsuit. Abrasion resistant for season after season of wear. Imported.

Sun Protection That Won't Wear Off
Our high-performance fabric provides SPF 50+ sun protection, blocking 98% of the sun's harmful rays. This fabric is recommended by The Skin Cancer Foundation as an effective UV protectant.

: 374
name: Men's Plaid Tropic Shirt, Short-Sleeve
description: Our Ultracomfortable sun protection is rated to UPF 50+, helping you stay cool and dry. Originally designed for fishing, this lightest hot-weather shirt offers UPF 50+ coverage and is great for extended travel. SunSmart technology blocks 98% of the sun's harmful UV rays, while the high-performance fabric is wrinkle-free and quickly evaporates perspiration. Made with 52% polyester and 48% nylon, this shirt is machine washable and dryable. Additional features include front and back cape venting, two front bellows pockets and an imported design. With UPF 50+ coverage, you can limit sun exposure and feel secure with the highest rated sun protection available.

: 619
name: Tropical Breeze Shirt
description: Beat the heat in this lightweight, breathable long-sleeve men’s UPF shirt, offering superior SunSmart™ protection from the sun’s harmful rays. The wrinkle-resistant and moisture-wicking fabric keeps you cool and comfortable.

Size & Fit

Traditional Fit: Relaxed through the chest, sleeve and waist.

Why We Love It

When you spend a lot of time outdoors, limiting sun exposure is important. Originally designed for fishing, our lightest hot-weather shirt offers UPF 50+ coverage and is also a great choice for extended travel. Innovative SunSmart technology blocks 98% of the sun's harmful UV rays. The high-performance fabric is wrinkle free, dries quickly and keeps you cool by wicking perspiration away from your skin.

Fabric & Care

Shell: 71% nylon, 29% polyester.
Cape lining: 100% polyester.
Polyester-mesh inserts.
UPF 50+ rated – the highest rated sun protection possible.
Machine wash and dry.

Additional Features

Question: Please list all your shirts with sun protection in a table in markdown and summarize each one.
Helpful Answer: Yes! You will find answers on every page here.

In [None]:
# Same request with more detailed formatting requirements. GPT-2 still does not do a
# good job, as the response shows.

query = (
    "Please list all your shirts with sun protection "
    "in a table in markdown with two columns: Name and Description. "
    "Summarize each shirt's features in the description with up to 20 words."
)

response = index.query(query, llm=llm)
print(response)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

: 679
name: Women's Tropical Tee, Sleeveless
description: Our five-star sleeveless button-up shirt has a fit to flatter and SunSmart™ protection to block the sun’s harmful UV rays. Size & Fit: Slightly Fitted: Softly shapes the body. Falls at hip. Fabric & Care: Shell: 71% nylon, 29% polyester. Cape lining: 100% polyester. Built-in SunSmart™ UPF 50+ rated – the highest rated sun protection possible. Machine wash and dry. Additional Features: Updated design with smoother buttons. Wrinkle resistant. Low-profile pockets and side shaping offer a more flattering fit. Front and back cape venting. Two front pockets, tool tabs and eyewear loop. Imported. Sun Protection That Won't Wear Off: Our high-performance fabric provides SPF 50+ sun protection, blocking 98% of the sun's harmful rays.

: 255
name: Sun Shield Shirt by
description