In [1]:
import pandas as pd
import numpy as np

import os
from dotenv import load_dotenv
import re

from tqdm.auto import tqdm
tqdm.pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from openai import OpenAI

load_dotenv()
openai_api_key = os.getenv('OPENAI_API_KEY')

client = OpenAI()

# 1. RAG-flow

The RAG-flow will include:
1) Retrieval :: 3 differen approaches:
    - minisearch
    - elastic-search
    - elastic-search vector
2) LLM :: "gpt-4o-mini"

In [5]:
def build_prompt(query:str, search_results:list) -> str:
    prompt_template = """
You are a professional assistant in selecting movies.
Your task is to recommend a movie from our movie dataset that best matches the request or description provided by user. 
Without any preamble, provide information about the movie that best matches the QUESTION based on the provided CONTEXT.

QUESTION: {question}

CONTEXT:
{context}
""".strip()
    
    entry_template = """
title : {title}
plot : {plot}
genres : {genres}
director : {director}
year : {year}
""".strip()

    context = ""
    
    for doc in search_results:
        context = context + entry_template.format(**doc) + "\n\n"
    
    prompt = prompt_template.format(question=query, context=context).strip()

    return prompt

def llm(prompt:str):
    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

## Minisearch-Text_search

In [6]:
df = pd.read_csv('../data/movie_dataset.csv')
print('Shape:', df.shape)
df.head(2)

Shape: (5000, 7)


Unnamed: 0,id,title,year,plot,genres,director,vector
0,4dac245a,I Love Maria,1988,"On one evening, Hong Kong's largest gang, the ...",sci-fi / comedy,David Chung,[-7.36143067e-02 3.23128849e-02 2.23438442e-...
1,6667e983,Girl in Progress,2012,Grace is a single mom raising her fourteen-yea...,comedy-drama,Patricia Riggen,[ 3.08281817e-02 -6.33204207e-02 2.21934766e-...


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        5000 non-null   object
 1   title     5000 non-null   object
 2   year      5000 non-null   int64 
 3   plot      5000 non-null   object
 4   genres    5000 non-null   object
 5   director  5000 non-null   object
 6   vector    5000 non-null   object
dtypes: int64(1), object(6)
memory usage: 273.6+ KB


In [8]:
df = df[['id', 'title', 'year', 'plot', 'genres', 'director']]
df['year'] = df['year'].apply(lambda i: str(i))

documents = df.to_dict(orient='records')

In [10]:
## Run to download the Minsearch

# import requests

# url = "https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py"
# response = requests.get(url)

# # Сохранение файла
# with open("minsearch.py", "wb") as f:
#     f.write(response.content)

In [9]:
import minsearch

index = minsearch.Index(
    text_fields=['title', 'year', 'plot', 'genres', 'director'],
    keyword_fields=[]
)

index.fit(documents)

<minsearch.Index at 0x715523962a20>

In [10]:
def minisearch(query:str) -> list:
    boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=5
    )

    return results

In [11]:
queries = ['What does Ryan Bingham do for a living, and how does it relate to his frequent travels?',
 "How does Ryan's perspective on relationships change throughout the movie?",
 'What challenges does Natalie face regarding her new layoffs program, and how does Ryan respond to it?',
 "What significant events occur during Ryan's sister's wedding that impact his character development?",
 'What realization does Ryan come to about his life and personal philosophies towards the end of the film?']


print('Movie title:', documents[150]['title'], '\n')
i = 1
for query in queries:
    print(f'{i}: {query}')
    for doc in minisearch(query):
        print(f"--{doc['title']}", end = '\t')
    i+=1
    print('\n************')

Movie title: Up in the Air 

1: What does Ryan Bingham do for a living, and how does it relate to his frequent travels?
--Big Top Scooby-Doo!	--It's Kind of a Funny Story	--Turn Left, Turn Right	-- Abby Singer	--The Mad Monk	
************
2: How does Ryan's perspective on relationships change throughout the movie?
--Bachelor Party	--Saving Private Ryan	--The Change-Up	--Jigsaw	-- Abby Singer	
************
3: What challenges does Natalie face regarding her new layoffs program, and how does Ryan respond to it?
--Big Top Scooby-Doo!	--It's Kind of a Funny Story	--Turn Left, Turn Right	-- Abby Singer	--The Mad Monk	
************
4: What significant events occur during Ryan's sister's wedding that impact his character development?
-- Abby Singer	--Gutterballs	--Lost River	--Abominable	--It's Kind of a Funny Story	
************
5: What realization does Ryan come to about his life and personal philosophies towards the end of the film?
--Jigsaw	--Grave Encounters	--Barney's Version	--Up in the

In [12]:
def rag(query):
    search_results = minisearch(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)

    return answer

query = 'What does Ryan Bingham do for a living, and how does it relate to his frequent travels?'

answer = rag(query)
print(answer)

In "Up in the Air," Ryan Bingham, played by George Clooney, is a corporate downsizer who travels extensively for work. His job involves flying across the country to terminate employees on behalf of various companies, which leads to his frequent travels as he seeks to maintain a life disconnected from personal relationships. Bingham embraces this lifestyle, finding solace in his nomadic existence and the comfort of airport lounges, but he ultimately faces the emotional repercussions of his detached way of living. The film explores themes of isolation, commitment, and the impact of modern corporate culture on personal life.


## Elasticsearch-Text_search

In [16]:
# %pip -q install elasticsearch

Running Elasticsearch:

```
docker run -it \
    --rm \
    --name elasticsearch \
    -m 4GB \
    -p 9200:9200 \
    -p 9300:9300 \
    -e "discovery.type=single-node" \
    -e "xpack.security.enabled=false" \
    docker.elastic.co/elasticsearch/elasticsearch:8.4.3
```

In [13]:
from elasticsearch import Elasticsearch

es_client = Elasticsearch('http://localhost:9200') 

In [16]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "title" : {"type": "text"},
            "plot" : {"type": "text"},
            "genres" : {"type": "text"},
            "director" : {"type": "text"},
            "year" : {"type": "text"},
        }
    }
}

index_name = "movie-questions"

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'movie-questions'})

In [17]:
df = pd.read_csv('../data/movie_dataset.csv')
df = df[['title', 'year', 'plot', 'genres', 'director']]
df['year'] = df['year'].apply(lambda i: str(i))

documents = df.to_dict(orient='records')

for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|██████████| 5000/5000 [01:35<00:00, 52.11it/s]


In [23]:
def elastic_search(query):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["title", "plot", "genres", "director"],
                        "type": "best_fields"
                    }
                },
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)
    
    result_docs = []
    
    for hit in response['hits']['hits']:
        hit_dict = hit['_source']
        hit_dict['_score'] = hit['_score']
        result_docs.append(hit_dict)

    return result_docs

In [19]:
queries = ['What does Ryan Bingham do for a living, and how does it relate to his frequent travels?',
           "How does Ryan's perspective on relationships change throughout the movie?",
           'What challenges does Natalie face regarding her new layoffs program, and how does Ryan respond to it?',
           "What significant events occur during Ryan's sister's wedding that impact his character development?",
           'What realization does Ryan come to about his life and personal philosophies towards the end of the film?']


print('Movie title:', documents[150]['title'], '\n')
i = 1
for query in queries:
    print(f'{i}: {query}')
    for doc in elastic_search(query):
        print(f"--{doc['title']}", end = '\t')
    i+=1
    print('\n************')

Movie title: Up in the Air 

1: What does Ryan Bingham do for a living, and how does it relate to his frequent travels?
--Up in the Air	--Forger, TheThe Forger	--State of Play	--God's Not Dead 2	--Judaai	
************
2: How does Ryan's perspective on relationships change throughout the movie?
--Up in the Air	--Anbulla Rajinikanth	--Carnal Knowledge	--Donald in Mathmagic Land	--Kala Bazar	
************
3: What challenges does Natalie face regarding her new layoffs program, and how does Ryan respond to it?
--Up in the Air	--Billy's Hollywood Screen Kiss	--What We Do Is Secret	--Love Actually	--Drumline	
************
4: What significant events occur during Ryan's sister's wedding that impact his character development?
--Up in the Air	--Bad Sleep Well !The Bad Sleep Well	--The Ugly American	--Lone Survivor	--Kanchivaram	
************
5: What realization does Ryan come to about his life and personal philosophies towards the end of the film?
--Up in the Air	--Why Has Bodhi-Dharma Left for t

In [24]:
query = 'What does Ryan Bingham do for a living, and how does it relate to his frequent travels?'
for doc in elastic_search(query):
    print(doc)

{'title': 'Up in the Air', 'year': '2009', 'plot': 'Ryan Bingham works for a Human Resources consultancy firm which specializes in termination assistance, and makes his living traveling to workplaces across the United States, conducting company layoffs and firings on behalf of employers. He also gives motivational speeches, using the analogy "What\'s in Your Backpack?" to extol the virtues of a life free of burdensome relationships with people as well as things. A frequent flyer, Ryan has no fixed abode, relishes his travels, and desires to become the seventh and youngest person to earn ten million frequent flyer miles with American Airlines. During his travels, he meets another frequent flyer named Alex, and they begin a casual relationship.\nRyan is called back to his company\'s offices in Omaha, Nebraska. Natalie Keener, a young and ambitious new hire, promotes a program designed to cut costs by conducting layoffs via videoconferencing. Ryan raises concerns that the program could be

In [25]:
def rag(query):
    search_results = elastic_search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

query = 'What does Ryan Bingham do for a living, and how does it relate to his frequent travels?'

answer = rag(query)
print(answer)

**Title**: Up in the Air  
**Plot**: Ryan Bingham works for a Human Resources consultancy firm specializing in termination assistance, making his living traveling to workplaces across the United States to conduct layoffs and firings on behalf of employers. A frequent flyer, Ryan has no fixed abode and enjoys his travels, aiming to become the youngest person to earn ten million frequent flyer miles. During his journeys, he navigates the complexities of personal relationships and his evolving perspectives on life and love.  
**Genres**: Comedy-Drama  
**Director**: Jason Reitman  
**Year**: 2009  


## Elasticsearch-Vector_search

In [26]:
from sentence_transformers import SentenceTransformer

In [27]:
model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)



In [None]:
# df['vector'] = df.progress_apply(lambda row: model.encode(f"Title: {row['title']}\nPlot: {row['plot']}\nGenres: {row['genres']}\nDirector: {row['director']}"), axis = 1)

# df.to_csv('../data/movie_dataset.csv', index=False)

In [28]:
df = pd.read_csv('../data/movie_dataset.csv')
print('Shape:', df.shape)
df.head(2)

Shape: (5000, 7)


Unnamed: 0,id,title,year,plot,genres,director,vector
0,4dac245a,I Love Maria,1988,"On one evening, Hong Kong's largest gang, the ...",sci-fi / comedy,David Chung,[-7.36143067e-02 3.23128849e-02 2.23438442e-...
1,6667e983,Girl in Progress,2012,Grace is a single mom raising her fourteen-yea...,comedy-drama,Patricia Riggen,[ 3.08281817e-02 -6.33204207e-02 2.21934766e-...


In [29]:
import ast

def str_to_vector(s:str):
    '''This functions converts str(vectror) to np.array(vector) type.'''
    
    s = s.replace('\n', ' ').strip()
    s = re.sub(r'(?<=\d)\s+(?=-?\d)', ',', s)
    
    try:
        return np.array(ast.literal_eval(s))
    except (SyntaxError, ValueError) as e:
        print(f"Error converting string to vector: {e}")
        return None


df['vector'] = df['vector'].progress_apply(str_to_vector)
df['year'] = df['year'].apply(lambda i: str(i))

  1%|▏         | 74/5000 [00:00<00:06, 739.26it/s]

100%|██████████| 5000/5000 [00:07<00:00, 674.44it/s]


In [30]:
from elasticsearch import Elasticsearch

es_client_knn = Elasticsearch('http://localhost:9200') 

index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "title" : {"type": "text"},
            "plot" : {"type": "text"},
            "genres" : {"type": "text"},
            "director" : {"type": "text"},
            "year" : {"type": "text"},
            "id": {"type": "keyword"},
            "vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
        }
    }
}

index_name = "movie-questions"

es_client_knn.indices.delete(index=index_name, ignore_unavailable=True)
es_client_knn.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'movie-questions'})

In [31]:
documents = df.to_dict(orient='records')

for doc in tqdm(documents):
    es_client_knn.index(index=index_name, document=doc)

  0%|          | 0/5000 [00:00<?, ?it/s]

100%|██████████| 5000/5000 [01:40<00:00, 49.94it/s]


In [32]:
def elastic_search_knn(query_vector):
    knn = {
        "field": 'vector',
        "query_vector": query_vector,
        "k": 5,
        "num_candidates": 10000,
    }

    search_query = {
        "knn": knn,
        "_source": ["title", "plot", "genres", "director", "id", "year"]
    }

    es_results = es_client_knn.search(
        index=index_name,
        body=search_query
    )
    
    result_docs = []
    
    for hit in es_results['hits']['hits']:
        hit_dict = hit['_source']
        hit_dict['_score'] = hit['_score']
        result_docs.append(hit_dict)

    return result_docs

In [33]:
queries = ['What does Ryan Bingham do for a living, and how does it relate to his frequent travels?',
           "How does Ryan's perspective on relationships change throughout the movie?",
           'What challenges does Natalie face regarding her new layoffs program, and how does Ryan respond to it?',
           "What significant events occur during Ryan's sister's wedding that impact his character development?",
           'What realization does Ryan come to about his life and personal philosophies towards the end of the film?']


print('Movie title:', documents[150]['title'], '\n')
i = 1
for query in queries:
    print(f'{i}: {query}')
    for doc in elastic_search_knn(model.encode(query)):
        print(f"--{doc['title']}", end = '\t')
    i+=1
    print('\n************')

Movie title: Up in the Air 

1: What does Ryan Bingham do for a living, and how does it relate to his frequent travels?
--Up in the Air	--Nebraska	--Autumn in New York	--Saving Private Ryan	--The Best Years of Our Lives	
************
2: How does Ryan's perspective on relationships change throughout the movie?
-- Keeping the Faith	--Ruby Sparks	--Up in the Air	--Life as We Know It	--No Looking Back	
************
3: What challenges does Natalie face regarding her new layoffs program, and how does Ryan respond to it?
--Up in the Air	--The Debt	--The Guilt Trip	--Employee of the Month	--Stick It	
************
4: What significant events occur during Ryan's sister's wedding that impact his character development?
--Four Sisters and a Wedding	--Hannah and Her Sisters	--Bridesmaids	--Father's Little Dividend	--The Second Woman	
************
5: What realization does Ryan come to about his life and personal philosophies towards the end of the film?
--Saving Private Ryan	--Up in the Air	--The Livi

In [34]:
def rag(query):
    search_results = elastic_search_knn(model.encode(query))
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

query = 'What does Ryan Bingham do for a living, and how does it relate to his frequent travels?'

answer = rag(query)
print(answer)

**Title:** Up in the Air  
**Plot:** Ryan Bingham works for a Human Resources consultancy firm which specializes in termination assistance, and makes his living traveling to workplaces across the United States, conducting company layoffs and firings on behalf of employers. He is a frequent flyer, relishing his travels and aiming to become the youngest person to earn ten million frequent flyer miles with American Airlines. Throughout the film, Ryan’s job requires him to travel constantly, which he embraces as part of his lifestyle until he begins to question his philosophies on relationships and life's burdens during his journeys.  
**Genres:** Comedy-drama  
**Director:** Jason Reitman  
**Year:** 2009  


# 2. Retrieval evaluating

I will evaluate 3 differen retrievals:
- minisearch
- elastic-search
- elastic-search vector

In [35]:
df_questions = pd.read_csv('../data/questions_ground_truth.csv')
ground_truth = df_questions.to_dict(orient='records')
ground_truth[0]

{'id': 'd4864332',
 'question': 'What happens to Ella after she is cursed with the gift of obedience by the fairy Lucinda?'}

In [36]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)


def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)


def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['id']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [37]:
search_funtions = {'minisearch':minisearch,
                   'elastic_search':elastic_search,
                   'elastic_search_knn':elastic_search_knn}

for n, f in search_funtions.items():
    print(n, f)


minisearch <function minisearch at 0x7155232ae2a0>
elastic_search <function elastic_search at 0x71551bd77ba0>
elastic_search_knn <function elastic_search_knn at 0x715554280220>


In [39]:
df_retrieval_eval = pd.DataFrame(columns=['search_function', 'hit_rate', 'mrr'])

for name, func in search_funtions.items():
    if name != 'elastic_search_knn':
        eval_res = evaluate(ground_truth, lambda q: func(q['question']))
    else:
        eval_res = evaluate(ground_truth, lambda q: func(model.encode(q['question'])))
    
    df_retrieval_eval.loc[len(df_retrieval_eval)] = {'search_function': name,
                                                     'hit_rate': eval_res['hit_rate'],
                                                     'mrr':eval_res['mrr']}

100%|██████████| 250/250 [00:08<00:00, 30.30it/s]
100%|██████████| 250/250 [00:01<00:00, 126.39it/s]
100%|██████████| 250/250 [00:08<00:00, 31.04it/s]


In [40]:
df_retrieval_eval

Unnamed: 0,search_function,hit_rate,mrr
0,minisearch,0.392,0.274533
1,elastic_search,0.884,0.8368
2,elastic_search_knn,0.636,0.5404


## Next