# Rag Lab

The aim of this notebook is to experiment with the complete RAG pipeline.
- Data download
- Data cleaning
- Data indexing: Elasticsearch
- RAG: OpenAI LLM + Elasticsearch

## Data download

In [1]:
import os

from kaggle.api.kaggle_api_extended import KaggleApi
import pandas as pd

api = KaggleApi()
api.authenticate()
current_dir = os.getcwd()

In [2]:
api.dataset_download_files('datasnaek/youtube-new', path=current_dir + '../../../data', quiet=True, unzip=True)

Dataset URL: https://www.kaggle.com/datasets/datasnaek/youtube-new


## Data cleaning

In [3]:
import pandas as pd
import numpy as np

df = pd.read_csv('../../data/USvideos.csv')

In [4]:
df.shape

(40949, 16)

In [5]:
df.head(3)

Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description
0,2kyS6SvSYSE,17.14.11,WE WANT TO TALK ABOUT OUR MARRIAGE,CaseyNeistat,22,2017-11-13T17:13:01.000Z,SHANtell martin,748374,57527,2966,15954,https://i.ytimg.com/vi/2kyS6SvSYSE/default.jpg,False,False,False,SHANTELL'S CHANNEL - https://www.youtube.com/s...
1,1ZAPwfrtAFY,17.14.11,The Trump Presidency: Last Week Tonight with J...,LastWeekTonight,24,2017-11-13T07:30:00.000Z,"last week tonight trump presidency|""last week ...",2418783,97185,6146,12703,https://i.ytimg.com/vi/1ZAPwfrtAFY/default.jpg,False,False,False,"One year after the presidential election, John..."
2,5qpjK5DgCt4,17.14.11,"Racist Superman | Rudy Mancuso, King Bach & Le...",Rudy Mancuso,23,2017-11-12T19:05:24.000Z,"racist superman|""rudy""|""mancuso""|""king""|""bach""...",3191434,146033,5339,8181,https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg,False,False,False,WATCH MY PREVIOUS VIDEO ▶ \n\nSUBSCRIBE ► http...


In [6]:
df = df[df['views'] > 1000000]
df.shape

(16341, 16)

In [7]:
df.sort_values(['trending_date', 'views', 'likes'], ascending=False, inplace=True)
df.dropna(subset=['title', 'description', 'tags'], inplace=True)
df = df.drop_duplicates(subset=['title']).reset_index(drop=True)
df = df[['title', 'description', 'tags']]

In [8]:
df.shape

(2187, 3)

In [9]:
df.insert(0, 'id', df.index)

In [10]:
df.head(3)

Unnamed: 0,id,title,description,tags
0,0,Childish Gambino - This Is America (Official V...,“This is America” by Childish Gambino http://s...,"Childish Gambino|""Rap""|""This Is America""|""mcDJ..."
1,1,BTS (방탄소년단) 'FAKE LOVE' Official MV,BTS (방탄소년단) 'FAKE LOVE' Official MVDirector : ...,"BIGHIT|""빅히트""|""방탄소년단""|""BTS""|""BANGTAN""|""방탄""|""FAK..."
2,2,Do You Hear Yanny or Laurel? (SOLVED with SCIE...,Yanny vs. Laurel audio illusion solved! PHEW F...,"AsapSCIENCE|""audio illusion""|""yanny""|""laurel""|..."


In [11]:
df.isna().sum()

id             0
title          0
description    0
tags           0
dtype: int64

In [12]:
# Because of processing time and OpenAI Billing, I will take just 200 random samples as the main data
# I use sampling to get more diversity (it seems that the most viewed videos are music videos)
df = df.sample(n=200, random_state=0)

In [13]:
df.to_csv('../../data/data.csv', index=False)

## Data indexing: Elasticsearch

In [14]:
documents = df[['id', 'title', 'tags', 'description']].to_dict('records')

In [15]:
# Run ElasticSearch previously in docker
# docker run -it \
#     --rm \
#     --name elasticsearch \
#     -m 4GB \
#     -p 9200:9200 \
#     -p 9300:9300 \
#     -e "discovery.type=single-node" \
#     -e "xpack.security.enabled=false" \
#     docker.elastic.co/elasticsearch/elasticsearch:8.4.3

from elasticsearch import Elasticsearch
es_client = Elasticsearch('http://localhost:9200') 
es_client.info()

ObjectApiResponse({'name': '52a926cb09df', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'lGJGYGlNREaNO19N8Hgfzw', 'version': {'number': '8.5.0', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': 'c94b4700cda13820dad5aa74fae6db185ca5c304', 'build_date': '2022-10-24T16:54:16.433628434Z', 'build_snapshot': False, 'lucene_version': '9.4.1', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [16]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "title": {"type": "text"},
            "description": {"type": "text"},
            "tags": {"type": "text"},
            "id": {"type" : "keyword"},
        }
    }
}

index_name = "youtube-titles"

try:
    es_client.indices.create(index=index_name, body=index_settings)
except:
    es_client.options(ignore_status=[400,404]).indices.delete(index=index_name)
    es_client.indices.create(index=index_name, body=index_settings)

In [17]:
from tqdm import tqdm 
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|██████████| 200/200 [00:09<00:00, 20.05it/s]


In [18]:
query = "easy recipe"

In [19]:
search_query = {
    "size": 10,
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["description^4", "text"],
                    "type": "best_fields"
                }
            },
        }
    }
}

search_results = es_client.search(index=index_name,
                                  body=search_query)

[r['_source'] for r in search_results['hits']['hits']]


[{'id': 353,
  'title': 'Real Doctor Reacts to THE GOOD DOCTOR | Medical Drama Review | Doctor Mike',
  'tags': 'doctor mike|"dr mike"|"medical drama review"|"medical tv show"|"real doctor watches greys anatomy"|"tv medical drama"|"medical drama"|"resident"|"medical drama tv"|"TV doctors"|"doctor tv shows"|"doctor reviews medical tv show"|"best doctor tv shows"|"good doctor"|"the good doctor"|"doctor watches the good doctor"|"good doctor accurate"|"the good doctor reaction"|"doctor mike reacts to"|"the good doctor (tv program)"|"the good doctor season 1"|"the good doctor 1x01"|"shaun murphy"|"autism tv"',
  'description': "Wow! Thank you so much for supporting the Real Doctor Reacts to series. I said that if you get the grey's anatomy video to 15k likes I'll do another episode and you CRUSHED it by getting it 200k+ likes. I dove into the comments section to see what medical tv drama you wanted me to review next and it seemed like the good doctor was the obvious choice. Get ready TV Doc

### RAG: OpenAI LLM + Elasticsearch 

In [20]:
from dotenv import load_dotenv
from openai import OpenAI

# Create previously the environment variable OPENAI_API_KEY with your personal OpenAI API key
# in the command line or in a .env file
load_dotenv()

client = OpenAI()

In [21]:
prompt_template = """
You're a professional youtuber. Answer with a youtube video title to the QUERY which is based on the CONTEXT from the video database.
Use only the facts from the CONTEXT when answering the QUERY

QUERY:
{query}

CONTEXT:
{context}
""".strip()

entry_template = """
video_title: {title},
video_description: {description},
video_tags: {tags}
""".strip()


def build_prompt(query, search_results, prompt_template, entry_template):
    context = ""
    
    for doc in search_results:
        context = context + entry_template.format(id=doc["id"],
                                                  title=doc["title"],
                                                  description=doc["description"],
                                                  tags=doc["tags"]) + "\n\n"
        
    prompt = prompt_template.format(query=query, context=context).strip()
    return prompt


def search(query):

    search_query = {
        "size": 10,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["description", "text", "tags"],
                        "type": "best_fields"
                    }
                },
            }
        }
    }

    search_results = es_client.search(index=index_name, body=search_query)
    return [r['_source'] for r in search_results['hits']['hits']]
            
        
def llm(prompt, model='gpt-4o-mini'):
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content


def rag(query, model='gpt-4o-mini'):
    search_results = search(query=query)
    prompt = build_prompt(query=query,
                        search_results=search_results,
                        entry_template=entry_template,
                        prompt_template=prompt_template)
    answer = llm(prompt=prompt, model=model)
    return answer

In [22]:
query = 'easy and tasty recipe for spanish omelette'
rag(query)

'"Master the Spanish Omelette: Easy & Delicious Recipe!"'

In [23]:
query = 'video which explains oyasumi punpun manga insights'
rag(query)

'"Manga Insights: Exploring the Depths of Oyasumi Punpun"'