## `vector_search.ipynb` - работа с векторной базой данных и векторным поиском

**В данном блокноте происходит преобразование описаний фильмов в эмбеддинги с последующим сохранением в векторной базе данных**

In [1]:
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_text_splitters import CharacterTextSplitter
from langchain_chroma import Chroma

In [2]:
from dotenv import load_dotenv

load_dotenv()

True

In [3]:
import pandas as pd

movies = pd.read_csv("../data/movies_cleaned.csv")
movies

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,...,overview,popularity,poster_path,tagline,genres,production_companies,production_countries,spoken_languages,keywords,tagged_overview
0,27205,Inception,8.364,34495,Released,2010-07-15,825532764,148,False,/8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg,...,"Cobb, a skilled thief who commits corporate es...",83.952,/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg,Your mind is the scene of the crime.,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pict...","United Kingdom, United States of America","English, French, Japanese, Swahili","rescue, mission, dream, airplane, paris, franc...","tt1375666 Cobb, a skilled thief who commits co..."
1,157336,Interstellar,8.417,32571,Released,2014-11-05,701729206,169,False,/pbrkL804c8yAv3zBZR4QPEafpAR.jpg,...,The adventures of a group of explorers who mak...,140.241,/gEU2QniE6E77NI6lCU6MxlNBvIx.jpg,Mankind was born on Earth. It was never meant ...,"Adventure, Drama, Science Fiction","Legendary Pictures, Syncopy, Lynda Obst Produc...","United Kingdom, United States of America",English,"rescue, future, spacecraft, race against time,...",tt0816692 The adventures of a group of explore...
2,155,The Dark Knight,8.512,30619,Released,2008-07-16,1004558444,152,False,/nMKdUUepR0i5zn0y1T4CsSB5chy.jpg,...,Batman raises the stakes in his war on crime. ...,130.643,/qJ2tW6WMUDux911r6m7haRef0WH.jpg,Welcome to a world without rules.,"Drama, Action, Crime, Thriller","DC Comics, Legendary Pictures, Syncopy, Isobel...","United Kingdom, United States of America","English, Mandarin","joker, sadism, chaos, secret identity, crime f...",tt0468569 Batman raises the stakes in his war ...
3,19995,Avatar,7.573,29815,Released,2009-12-15,2923706026,162,False,/vL5LR6WdxWPjLPFRLe133jXWsh5.jpg,...,"In the 22nd century, a paraplegic Marine is di...",79.932,/kyeqWdyUXW608qlYkRqosgbbJyK.jpg,Enter the world of Pandora.,"Action, Adventure, Fantasy, Science Fiction","Dune Entertainment, Lightstorm Entertainment, ...","United States of America, United Kingdom","English, Spanish","future, society, culture clash, space travel, ...","tt0499549 In the 22nd century, a paraplegic Ma..."
4,24428,The Avengers,7.710,29166,Released,2012-04-25,1518815515,143,False,/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg,...,When an unexpected enemy emerges and threatens...,98.082,/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg,Some assembly required.,"Science Fiction, Action, Adventure",Marvel Studios,United States of America,"English, Hindi, Russian","new york city, superhero, shield, based on com...",tt0848228 When an unexpected enemy emerges and...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1223,529203,The Croods: A New Age,7.521,3508,Released,2020-11-25,215905815,95,False,/ytTQoYkdpsgtfDWrNFCei8Mfbxu.jpg,...,"Searching for a safer habitat, the prehistoric...",63.456,/tbVZ3Sq88dZaCANlUcewQuHQOaE.jpg,The future ain't what it used to be.,"Animation, Family, Adventure, Fantasy, Comedy","Universal Pictures, DreamWorks Animation",United States of America,English,"sequel, prehistory, candid, playful, joyous, a...","tt2850386 Searching for a safer habitat, the p..."
1224,9392,The Descent,6.957,3507,Released,2005-07-08,57130027,99,False,/70TIOrfkQli0Smsfjua2McaDPci.jpg,...,"After a tragic accident, six friends reunite f...",23.007,/mxFPI4KYBk5ri9cPteIS8jiDFgj.jpg,Scream your last breath.,"Adventure, Horror","Pathé, Celador Films, Northmen Productions",United Kingdom,English,"panic, darkness, mutant, expedition, cave, cla...","tt0435625 After a tragic accident, six friends..."
1225,12153,White Chicks,6.919,3505,Released,2004-06-23,113086475,109,False,/di47xqYMCYpjqwnqNlO17X5qXMX.jpg,...,"Two FBI agent brothers, Marcus and Kevin Copel...",54.851,/aHTUpo45qy9QYIOnVITGGqLoVcA.jpg,They're going deep undercover.,"Comedy, Crime","Columbia Pictures, Revolution Studios, Wayans ...",United States of America,English,"undercover, fbi, cross dressing, car accident,...","tt0381707 Two FBI agent brothers, Marcus and K..."
1226,2832,Identity,7.180,3502,Released,2003-04-25,90259536,90,False,/7MwDOMrbjrKP3XQ5vw4cgB2DPaF.jpg,...,Complete strangers stranded at a remote desert...,27.747,/bnidwEvWNAVJ3Uco9wWtuzWAfrx.jpg,The secret lies within.,"Mystery, Thriller","Konrad Pictures, Columbia Pictures",United States of America,English,"prostitute, prisoner, psychopath, nevada, dete...",tt0309698 Complete strangers stranded at a rem...


**Сохраняем столбец `tagged_overview` в текстовый файл**

In [4]:
movies["tagged_overview"].to_csv(
    "../data/tagged_overview.txt",
    sep="\n",
    index=False,
    header=False
)

**Подготовка тектовых данных для векторного поиска**

In [5]:
raw_documents = TextLoader("../data/tagged_overview.txt", encoding="utf-8").load()
text_splitter = CharacterTextSplitter(chunk_size=1, chunk_overlap=0, separator="\n")
documents = text_splitter.split_documents(raw_documents)

Created a chunk of size 294, which is longer than the specified 1
Created a chunk of size 208, which is longer than the specified 1
Created a chunk of size 406, which is longer than the specified 1
Created a chunk of size 185, which is longer than the specified 1
Created a chunk of size 307, which is longer than the specified 1
Created a chunk of size 344, which is longer than the specified 1
Created a chunk of size 500, which is longer than the specified 1
Created a chunk of size 309, which is longer than the specified 1
Created a chunk of size 174, which is longer than the specified 1
Created a chunk of size 249, which is longer than the specified 1
Created a chunk of size 248, which is longer than the specified 1
Created a chunk of size 416, which is longer than the specified 1
Created a chunk of size 142, which is longer than the specified 1
Created a chunk of size 136, which is longer than the specified 1
Created a chunk of size 398, which is longer than the specified 1
Created a 

In [6]:
documents[0]

Document(metadata={'source': '../data/tagged_overview.txt'}, page_content='"tt1375666 Cobb, a skilled thief who commits corporate espionage by infiltrating the subconscious of his targets is offered a chance to regain his old life as payment for a task considered to be impossible: ""inception"", the implantation of another person\'s idea into a target\'s subconscious."')

**Импортируем модель для создания эмбеддингов**

In [7]:
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

  embeddings = HuggingFaceEmbeddings(


**Создаем векторную базу данных**

In [8]:
db_movies = Chroma.from_documents(
    documents,
    embedding=embeddings
)

**Выполняем семантический поиск в векторной базе данных по запросу**

In [9]:
query = "A movie about traveling in space"
docs = db_movies.similarity_search(query, k=5)
docs

[Document(id='54b1aacd-26dc-4d5b-85bd-6e7f9409282f', metadata={'source': '../data/tagged_overview.txt'}, page_content='"tt0133240 When space galleon cabin boy Jim Hawkins discovers a map to an intergalactic ""loot of a thousand worlds,"" a cyborg cook named John Silver teaches him to battle supernovas and space storms on their journey to find treasure."'),
 Document(id='50e2f89a-8686-4bfd-acdf-895df557b900', metadata={'source': '../data/tagged_overview.txt'}, page_content='tt1355644 A spacecraft traveling to a distant colony planet and transporting thousands of people has a malfunction in its sleep chambers. As a result, two passengers are awakened 90 years early.'),
 Document(id='6858c847-c7ba-47a2-99dd-cdfdfb7b9dd9', metadata={'source': '../data/tagged_overview.txt'}, page_content='tt1454468 Dr. Ryan Stone, a brilliant medical engineer on her first Shuttle mission, with veteran astronaut Matt Kowalsky in command of his last flight before retiring. But on a seemingly routine spacewalk

**Находим строку в датафрейме, которая соответствует первому результату семантического поиска**

In [10]:
movies[movies["imdb_id"] == (docs[0].page_content.split()[0].strip())]

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,...,overview,popularity,poster_path,tagline,genres,production_companies,production_countries,spoken_languages,keywords,tagged_overview


**Функция `retrieve_semantic_recommendations` выполняет поиск _top_k_ фильмов по запросу и возвращает _top_k_ фильмов**

In [11]:
def retrieve_semantic_recommendations(query: str, top_k: int = 5) -> pd.DataFrame:
    recs = db_movies.similarity_search(query, k=10)

    movies_list = []

    for i in range(0, len(recs)):
        movies_list += [recs[i].page_content.split()[0]]

    return movies[movies["imdb_id"].isin(movies_list)].head(top_k)

In [12]:
retrieve_semantic_recommendations(query)

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,...,overview,popularity,poster_path,tagline,genres,production_companies,production_countries,spoken_languages,keywords,tagged_overview
1,157336,Interstellar,8.417,32571,Released,2014-11-05,701729206,169,False,/pbrkL804c8yAv3zBZR4QPEafpAR.jpg,...,The adventures of a group of explorers who mak...,140.241,/gEU2QniE6E77NI6lCU6MxlNBvIx.jpg,Mankind was born on Earth. It was never meant ...,"Adventure, Drama, Science Fiction","Legendary Pictures, Syncopy, Lynda Obst Produc...","United Kingdom, United States of America",English,"rescue, future, spacecraft, race against time,...",tt0816692 The adventures of a group of explore...
120,49047,Gravity,7.162,14463,Released,2013-10-03,723192705,91,False,/a2n6bKD7qhCPCAEALgsAhWOAQcc.jpg,...,"Dr. Ryan Stone, a brilliant medical engineer o...",31.024,/kZ2nZw8D681aphje8NJi8EfbL1U.jpg,Don't Let Go,"Science Fiction, Thriller, Drama","Esperanto Filmoj, Warner Bros. Pictures, Heyda...","United Kingdom, United States of America",English,"space mission, loss, space, astronaut, space s...","tt1454468 Dr. Ryan Stone, a brilliant medical ..."
166,274870,Passengers,6.934,12429,Released,2016-12-21,303144152,116,False,/gHz4ZQytRs8YGrqFMwB3Vrr8pig.jpg,...,A spacecraft traveling to a distant colony pla...,27.272,/jK9S6HANSf2no64v1x1HxfcpmcA.jpg,There is a reason they woke up.,"Drama, Romance, Science Fiction","Wanda Pictures, Columbia Pictures, Village Roa...",United States of America,English,"android, spacecraft, asteroid, isolation, show...",tt1355644 A spacecraft traveling to a distant ...
188,165,Back to the Future Part II,7.752,11817,Released,1989-11-22,332000000,108,False,/su0cmtK55eKXq0QjW68LQslUhUY.jpg,...,Marty and Doc are at it again in this wacky se...,30.516,/hQq8xZe5uLjFzSBt4LanNP7SQjl.jpg,"Roads? Where we're going, we don't need roads!","Adventure, Comedy, Science Fiction","Universal Pictures, Amblin Entertainment",United States of America,English,"flying car, skateboarding, car race, lightning...",tt0096874 Marty and Doc are at it again in thi...
344,577922,Tenet,7.191,8744,Released,2020-08-22,365304105,150,False,/yY76zq9XSuJ4nWyPDuwkdV7Wt0c.jpg,...,Armed with only one word - Tenet - and fightin...,44.025,/aCIFMriQh8rvhxpN1IWGgvH0Tlg.jpg,Time runs out.,"Action, Thriller, Science Fiction","Warner Bros. Pictures, Syncopy","United Kingdom, United States of America","English, Estonian","assassin, espionage, spy, time travel, mumbai ...",tt6723592 Armed with only one word - Tenet - a...
