<a href="https://colab.research.google.com/github/SrikanthArgp/colab_practices/blob/main/Movies_Vec_DB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re
import pandas as pd
import seaborn as sns
from langchain.text_splitter import SentenceTransformersTokenTextSplitter
import chromadb
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
from pprint import pprint

In [None]:
#!pip install langchain

In [None]:
#!pip install chromadb

In [None]:
def max_word_count(txt_list:list):
    """Report the largest word count found among the given texts.

    A word is any run of regex word characters.

    Args:
        txt_list: Iterable of strings to inspect.

    Returns:
        The string "Max Word Count: <n> words", where <n> is 0 when
        ``txt_list`` yields no texts.
    """
    word_pattern = re.compile(r'\w+')
    longest = max(
        (len(word_pattern.findall(text)) for text in txt_list),
        default=0,
    )
    return f"Max Word Count: {longest} words"

In [None]:
# Token budget per chunk handed to the splitter below.
model_max_chunk_length = 256

# Token-based splitter tied to the "all-MiniLM-L6-v2" model; chunks do not overlap.
token_splitter = SentenceTransformersTokenTextSplitter(
    model_name="all-MiniLM-L6-v2",
    chunk_overlap=0,
    tokens_per_chunk=model_max_chunk_length,
)

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
# NOTE(review): this install sits below the cell that builds the
# SentenceTransformers splitter; on a fresh runtime it presumably has to run first.
%pip install sentence-transformers

In [None]:
# Location of the movies CSV on Google Drive.
# NOTE(review): assumes Drive is already mounted at /content/drive; no mount
# call is visible in this notebook.
text_path = "/content/drive/MyDrive/AppliedNLPMaterial-master/data/movies.csv"

# Read the raw table, asking pandas to parse release_date as dates.
df_movies_raw = pd.read_csv(
    filepath_or_buffer=text_path,
    parse_dates=["release_date"],
)

# Peek at the first two rows.
df_movies_raw.head(n=2)

In [None]:
# Columns kept for the rest of the notebook.
selected_cols = [
    "id",
    "title",
    "overview",
    "vote_average",
    "release_date",
]
# Restrict to those columns and discard rows with a missing value in any of them.
df_movies_filt = df_movies_raw.loc[:, selected_cols].dropna()

In [None]:
# One row per movie id: the first occurrence of each id is the one retained.
df_movies_filt = df_movies_filt.drop_duplicates(subset="id", keep="first")

In [None]:
# Keep only rows whose release_date compares greater than '2023-01-01'.
df_movies_filt = df_movies_filt.loc[
    lambda frame: frame['release_date'] > '2023-01-01'
]
# Rows and columns remaining after the filter.
df_movies_filt.shape

In [None]:
# Largest regex-word count among the overviews of the filtered movies.
max_word_count(df_movies_filt['overview'])

In [None]:
# Word count (runs of regex word characters) of every overview, in row order.
descriptions_len = [
    len(re.findall(r'\w+', overview))
    for overview in df_movies_filt["overview"]
]

In [None]:
# Histogram of the overview word counts collected in descriptions_len, using 100 bins.
sns.histplot(descriptions_len, bins=100)

In [None]:
# Sentence-transformers embedding function constructed with no arguments.
# NOTE(review): `embedding_fn` is not passed to any call in the visible cells
# (the collection is obtained without an embedding_function argument) — confirm
# whether it was meant to be supplied there.
embedding_fn = SentenceTransformerEmbeddingFunction()

In [None]:
# Chroma client backed by the on-disk store at the Drive path below.
# NOTE(review): assumes Google Drive is already mounted at /content/drive; no
# mount call is visible in this notebook.
chroma_db = chromadb.PersistentClient(path="/content/drive/MyDrive/AppliedNLPMaterial-master/040_VectorDatabase/db")

In [None]:
# Show the collections the client currently reports.
chroma_db.list_collections()

In [None]:
# Handle to the "movies" collection (get_or_create: reused if present, created otherwise).
chroma_collection = chroma_db.get_or_create_collection("movies")

In [None]:
# Parallel lists, one entry per filtered movie, in DataFrame row order.
titles = df_movies_filt['title'].tolist()
documents = df_movies_filt['overview'].tolist()
# Movie ids converted to strings.
ids = list(map(str, df_movies_filt['id'].tolist()))
# Each document's metadata carries its movie title under the 'source' key.
metadatas = [dict(source=movie_title) for movie_title in titles]

In [None]:
# Store the overviews in the collection together with their ids and title metadata.
# NOTE(review): the collection lives in a persistent store, so re-running this
# cell submits the same ids again — confirm how duplicates are treated.
chroma_collection.add(documents=documents, ids=ids, metadatas=metadatas)

In [None]:
# Number of records in the collection. count() asks the store for the total
# directly instead of fetching every record with get() just to measure the id list.
chroma_collection.count()

In [None]:
def get_title_by_description(query_text:str):
    """Print the title and overview of the movies matching a text query.

    Queries the module-level ``chroma_collection`` for up to three matches and
    pretty-prints each match's ``source`` metadata entry and document text,
    followed by a separator line.

    Args:
        query_text: Free-text description to search for.

    Returns:
        None. Results are written to standard output.
    """
    n_results = 3
    res = chroma_collection.query(query_texts=[query_text], n_results=n_results)
    # Walk the hits that actually came back rather than indexing
    # range(n_results): when fewer than n_results entries are returned,
    # positional indexing raises IndexError.
    for metadata, document in zip(res['metadatas'][0], res['documents'][0]):
        pprint(f"Title: {metadata['source']} \n")
        pprint(f"Description: {document} \n")
        pprint("-------------------------------------------------")

In [None]:
# Example lookup: print the matches for a keyword-style query.
get_title_by_description(query_text="monster, underwater")

In [None]:
import os
from dotenv import load_dotenv
load_dotenv()
# Report only whether the key is available. Printing the value itself would
# store the secret in the notebook's saved output.
print(
    "Hello dot env",
    "OPENAI_API_KEY loaded" if os.getenv("OPENAI_API_KEY") else "OPENAI_API_KEY missing",
)

In [None]:
#!pip list | grep dotenv

In [None]:
import openai
from openai import OpenAI
# Hand the key from the environment to the openai module (None if it is unset).
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
# NOTE(review): an earlier cell already imports openai; on a fresh runtime
# this install presumably has to run before it.
%pip install openai

In [None]:
# Re-opens a client on the same on-disk store path used earlier in the notebook.
chroma_db = chromadb.PersistentClient(path="/content/drive/MyDrive/AppliedNLPMaterial-master/040_VectorDatabase/db")

In [None]:
# Show the collections the re-opened client reports.
chroma_db.list_collections()

In [None]:
# Handle to the "movies" collection from the re-opened client.
chroma_collection = chroma_db.get_or_create_collection("movies")

In [None]:
# Ad-hoc query requesting five results; the raw result is kept in `res` for inspection in later cells.
res = chroma_collection.query(query_texts=["a monster in closet"], n_results=5)

In [None]:
# Number of records in the collection. count() asks the store for the total
# directly instead of fetching every record with get() just to measure the id list.
chroma_collection.count()

In [None]:
def get_query_results(query_text:str, n_results:int=5):
    """Print the matches for a query and return them as one string.

    Queries the module-level ``chroma_collection``, pretty-prints the title
    (``source`` metadata) and description of every hit, and builds a combined
    text of all hits.

    NOTE: a later cell defines ``get_query_results`` again; once that cell has
    run, it replaces this version.

    Args:
        query_text: Free-text description to search for.
        n_results: Maximum number of matches to request.

    Returns:
        The concatenation of "Title: <title> \\n Description: <text> \\n "
        for every hit, or an empty string when nothing was returned.
    """
    res = chroma_collection.query(query_texts=[query_text], n_results=n_results)
    entries = []
    # Iterate over the hits that actually came back: indexing range(n_results)
    # raises IndexError when fewer entries are returned.
    for metadata, document in zip(res['metadatas'][0], res['documents'][0]):
        pprint(f"Title: {metadata['source']} \n")
        pprint(f"Description: {document} \n")
        pprint("-------------------------------------------------")
        # Collect every hit. Previously the string was reassigned on each pass,
        # so only the last hit was returned and the name was unbound when the
        # loop never ran.
        entries.append(f"Title: {metadata['source']} \n Description: {document} \n ")
    return ''.join(entries)
# Run the retrieval helper once and keep the string it returns for later cells.
query_text = "a monster in the closet"
#get_query_results(query_text)
retrieved_results = get_query_results(query_text)

In [None]:
# Dump the raw result kept in `res` by the earlier ad-hoc query cell.
print(res)

In [None]:
def get_query_results(query_text:str, n_results:int=5):
    """Retrieve matches for a query as a single ';'-separated string.

    Queries the module-level ``chroma_collection`` and pairs each hit's
    ``source`` metadata entry with its document text.

    Args:
        query_text: Free-text description to search for.
        n_results: Maximum number of matches to request.

    Returns:
        A string of "<source>: <document>" entries joined by ';'.
    """
    hits = chroma_collection.query(n_results=n_results, query_texts=[query_text])
    # Only one query text is sent, so the first (only) result list is used.
    overviews = hits["documents"][0]
    movie_names = [meta['source'] for meta in hits["metadatas"][0]]
    return ';'.join(map('{}: {}'.format, movie_names, overviews))

# Retrieve context for the sample query; the returned string feeds the prompt built in a later cell.
query_text = "a monster in the closet"
retrieved_results = get_query_results(query_text)


In [None]:
# Metadata entries of the first result list in `res`.
res["metadatas"][0]

In [None]:
# Display the string returned by get_query_results.
retrieved_results

In [None]:
# Instructions for the assistant, including the expected layout of the retrieved information.
system_role_definition = "You are a an expert in movies. Users will ask you questions about movies. You will get a user question, and relevant information. Relevant information is structured like movie title:movie plot; ... Please summarize the information provided in 10 words."
# Interpolate the actual query text. The string used to be a plain literal, so
# the placeholder "{user_query}" itself was sent to the model instead of the
# question.
user_query = f"What are the names of the movies and their plot where {query_text}?"
# Chat payload: system instructions, then the question with the retrieved context appended.
messages = [
    {"role": "system",
     "content": system_role_definition
     },
    {"role": "user",
     "content": f"{user_query}; \n Information: {retrieved_results}"
     }
]

In [None]:
# Chat model used for the completion request.
model="gpt-3.5-turbo"

# Client constructed with no arguments.
openai_client = OpenAI()

# Send the prepared messages and keep the full response object.
response = openai_client.chat.completions.create(
    messages=messages,
    model=model,
)

In [None]:
# Message text of the first choice in the completion response.
content = response.choices[0].message.content

In [None]:
# Display the extracted reply text.
content

In [None]:
def rag(user_query:str, model:str="gpt-3.5-turbo", n_results:int=5):
    """Answer a movie question with retrieval-augmented generation.

    Retrieves matching movies through ``get_query_results``, places them in a
    chat prompt together with the question, and returns the model's reply.

    Args:
        user_query: Description of what happens in the movies being sought; it
            is inserted into the question "What are the names of the movies
            and their plot where ...?".
        model: Chat model name passed to the completions call. Defaults to the
            previously hard-coded "gpt-3.5-turbo".
        n_results: Number of matches requested from ``get_query_results``.
            Defaults to 5, that function's own default.

    Returns:
        The message content of the first choice in the completion response.
    """
    retrieved_results = get_query_results(user_query, n_results=n_results)
    system_role_definition = "You are a an expert in movies. Users will ask you questions about movies. You will get a user question, and relevant information. Relevant information is structured like movie title:movie plot; ... Please summarize answer in 10 words for each movie in question."
    user_query_complete = f"What are the names of the movies and their plot where {user_query}?"
    messages = [
        {"role": "system", "content": system_role_definition},
        # The retrieved context is appended to the question in the user turn.
        {"role": "user", "content": f"{user_query_complete}; \n Information: {retrieved_results}"},
    ]
    # Client constructed with no arguments (presumably picks up credentials
    # from the environment — confirm).
    openai_client = OpenAI()
    response = openai_client.chat.completions.create(
        model=model,
        messages=messages
    )
    return response.choices[0].message.content

In [None]:
# Same question answered twice: raw retrieval first, then the RAG pipeline.
query = "a cop is chasing a criminal"

print("Response from Vector DB", "-------------------------------------------------", sep="\n")
pprint(get_query_results(query))

print("Response from RAG", "-------------------------------------------------", sep="\n")
pprint(rag(query))