# 1. Baseline

In [1]:
import os
import pandas as pd

from tqdm.auto import tqdm
tqdm.pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from openai import OpenAI

client = OpenAI()

## Minisearch-Text_search

In [21]:
df = pd.read_csv('../data/movie_dataset.csv')
print('Shape:', df.shape)
df.head(2)

Shape: (122397, 6)


Unnamed: 0,id,title,year,plot,genres,director
0,0355ed8c,Patton Oswalt: Annihilation,2017,"Patton Oswald, despite a personal tragedy, pro...",uncategorized,Bobcat Goldthwait
1,4c754ecc,New York Doll,2005,A recovering alcoholic and recently converted ...,"documentary, music",Greg Whiteley


In [23]:
df = df[['title', 'year', 'plot', 'genres', 'director']]
documents = df.to_dict(orient='records')

In [10]:
## Run to download the Minsearch

# import requests

# url = "https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py"
# response = requests.get(url)

# # Сохранение файла
# with open("minsearch.py", "wb") as f:
#     f.write(response.content)

In [11]:
import minsearch

index = minsearch.Index(
    text_fields=['title', 'year', 'plot', 'genres', 'director'],
    keyword_fields=[]
)

index.fit(documents)

<minsearch.Index at 0x703bbc0897c0>

In [12]:
def search(query:str) -> list:
    boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=5
    )

    return results


def build_prompt(query:str, search_results:list) -> str:
    prompt_template = """
You are a professional assistant in selecting movies.
Your task is to recommend a movie from our movie dataset that best matches the request or description provided by user. 
Without any preamble, provide information about the movie that best matches the QUESTION based on the provided CONTEXT.

QUESTION: {question}

CONTEXT:
{context}
""".strip()
    
    entry_template = """
title : {title}
plot : {plot}
genres : {genres}
director : {director}
year : {year}
""".strip()

    context = ""
    
    for doc in search_results:
        context = context + entry_template.format(**doc) + "\n\n"
    
    prompt = prompt_template.format(question=query, context=context).strip()

    return prompt

def llm(prompt:str):
    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)

    return answer


In [13]:
documents[150]

{'title': 'Generation Iron 2',
 'year': '2017',
 'plot': 'From the director of Generation Iron, comes the anticipated sequel that will depict 5 of the top bodybuilding and fitness mega-stars on a quest of achieving the ultimate physique and taking it to the next extreme level. In the world of social media and internet, the rules have changed as to what makes an iconic bodybuilding mass-monster. Starring Kai Greene, Calum Von Moger, Rich Piana, among others, this film will explore an all new generation of bodybuilders and how this new world, and new people, carve their own path to physique perfection.',
 'genres': 'documentary',
 'director': 'Vlad Yudin'}

In [14]:
query = "What is the main goal of the bodybuilders depicted in this film?"

In [15]:
i = 1
for doc in search(query):
    print(f"{i}: {doc['title']}\n")
    i+=1

1: The Goal

2: The Scar of Shame

3: Goal

4: The Final Goal

5: This Side of the Law



In [21]:
answer = rag(query)
print(answer)

The main goal of the bodybuilders depicted in the film "The Goal" is to compete in the Olympics. The film chronicles the lives of two athletes, one pursuing a place in the Olympics, highlighting their dedication and challenges as they strive for greatness in sports.


## Elasticsearch-Text_search

In [16]:
# %pip -q install elasticsearch

Running Elasticsearch:

```
docker run -it \
    --rm \
    --name elasticsearch \
    -m 4GB \
    -p 9200:9200 \
    -p 9300:9300 \
    -e "discovery.type=single-node" \
    -e "xpack.security.enabled=false" \
    docker.elastic.co/elasticsearch/elasticsearch:8.4.3
```

In [17]:
from elasticsearch import Elasticsearch

es_client = Elasticsearch('http://localhost:9200') 

In [None]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "title" : {"type": "text"},
            "plot" : {"type": "text"},
            "genres" : {"type": "text"},
            "director" : {"type": "text"},
            "year" : {"type": "text"},
        }
    }
}

index_name = "movie-questions"

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

In [None]:
# df = pd.read_csv('../data/movie_dataset.csv')
# df = df[['title', 'year', 'plot', 'genres', 'director']]
# documents = df.to_dict(orient='records')

In [31]:
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 122397/122397 [55:08<00:00, 36.99it/s]


In [52]:
def elastic_search(query):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["title", "plot", "genres", "director"],
                        "type": "best_fields"
                    }
                },
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)
    
    result_docs = []
    
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    
    return result_docs

In [65]:
documents[150]

{'title': 'Generation Iron 2',
 'year': '2017',
 'plot': 'From the director of Generation Iron, comes the anticipated sequel that will depict 5 of the top bodybuilding and fitness mega-stars on a quest of achieving the ultimate physique and taking it to the next extreme level. In the world of social media and internet, the rules have changed as to what makes an iconic bodybuilding mass-monster. Starring Kai Greene, Calum Von Moger, Rich Piana, among others, this film will explore an all new generation of bodybuilders and how this new world, and new people, carve their own path to physique perfection.',
 'genres': 'documentary',
 'director': 'Vlad Yudin'}

In [66]:
query =  'What is the main goal of the bodybuilders depicted in this film?'

In [85]:
i = 1
for doc in elastic_search(query):
    print(f"{i}: {doc['title']}\n")
    i+=1

1: Generation Iron 2

2: Manja

3: The Scar of Shame

4: Welcome to Hollywood

5: Jupiter Ascending



In [86]:
def rag(query):
    search_results = elastic_search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [87]:
answer = rag(query)

In [88]:
print(answer)

**Title:** Generation Iron 2  
**Plot:** From the director of Generation Iron, comes the anticipated sequel that will depict 5 of the top bodybuilding and fitness mega-stars on a quest of achieving the ultimate physique and taking it to the next extreme level. In the world of social media and internet, the rules have changed as to what makes an iconic bodybuilding mass-monster. Starring Kai Greene, Calum Von Moger, Rich Piana, among others, this film will explore an all new generation of bodybuilders and how this new world and new people carve their own path to physique perfection.  
**Genres:** Documentary  
**Director:** Vlad Yudin  
**Year:** 2017  

**Main Goal of Bodybuilders:** The main goal of the bodybuilders depicted in this film is to achieve the ultimate physique and take their fitness to the next extreme level, adapting to the changes brought about by social media and the evolving standards within the bodybuilding community.


## Elasticsearch-Vector_search

In [3]:
from sentence_transformers import SentenceTransformer

In [4]:
model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)



In [5]:
df = pd.read_csv('../data/movie_dataset.csv')
print('Shape:', df.shape)
df.head(2)

Shape: (10000, 7)


Unnamed: 0,id,title,year,plot,genres,director,vector
0,5d47d939,Infini,2015,"In the early 23rd century, an emergency is dec...",sci-fi,Shane Abbess,[-1.86556913e-02 2.52590925e-02 -2.45726835e-...
1,d5939e3d,Night at the Museum: Battle of the Smithsonian,2009,"Larry Daley, former night guard at the America...","adventure, family",Shawn Levy,[-3.17084193e-02 7.85600841e-02 -2.09455583e-...


In [31]:
# df['vector'] = df.progress_apply(lambda row: model.encode(f"Title: {row['title']}\nPlot: {row['plot']}\nGenres: {row['genres']}\nDirector: {row['director']}"), axis = 1)

# df.to_csv('../data/movie_dataset.csv', index=False)

100%|██████████| 122397/122397 [1:54:07<00:00, 17.88it/s] 


In [43]:
df['len_plot'] = df['plot'].progress_apply(lambda i: len(i))


100%|██████████| 122397/122397 [00:00<00:00, 1039067.96it/s]


In [9]:
df = df.iloc[-5000:]

In [49]:
df = df.sort_values(by = 'len_plot')
df = df.iloc[-10001:]

In [52]:
df.reset_index(drop = True, inplace=True)

In [10]:
df = df[['id', 'title', 'year', 'plot', 'genres', 'director', 'vector']]
df.to_csv('../data/movie_dataset.csv', index=False)

In [40]:
from elasticsearch import Elasticsearch

es_client = Elasticsearch('http://localhost:9200') 

index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "title" : {"type": "text"},
            "plot" : {"type": "text"},
            "genres" : {"type": "text"},
            "director" : {"type": "text"},
            "year" : {"type": "text"},
            "id": {"type": "keyword"},
            "vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
        }
    }
}

index_name = "movie-questions"

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ConnectionError: Connection error caused by: ConnectionError(Connection error caused by: ProtocolError(('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))))

In [38]:
es_client.info()

ConnectionError: Connection error caused by: ConnectionError(Connection error caused by: NewConnectionError(<urllib3.connection.HTTPConnection object at 0x703b83b97770>: Failed to establish a new connection: [Errno 111] Connection refused))

In [35]:
documents = df.to_dict(orient='records')

In [36]:
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  6%|▌         | 7497/122397 [03:40<56:12, 34.07it/s]  


ConnectionError: Connection error caused by: ConnectionError(Connection error caused by: ProtocolError(('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))))

In [None]:
def elastic_search_knn(vector):
    knn = {
        "field": 'vector',
        "query_vector": vector,
        "k": 5,
        "num_candidates": 10000,
    }

    search_query = {
        "knn": knn,
        "_source": ["title", "plot", "genres", "director", "id"]
    }

    es_results = es_client.search(
        index=index_name,
        body=search_query
    )
    
    result_docs = []
    
    for hit in es_results['hits']['hits']:
        result_docs.append(hit['_source'])

    return result_docs

In [None]:
documents[150]

In [None]:

query =  'What is the main goal of the bodybuilders depicted in this film?'
query_vector = model.encode(query)

In [None]:
i = 1
for doc in elastic_search_knn(query_vector):
    print(f"{i}: {doc['title']}\n")
    i+=1