In [None]:
pip install datasets



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import re
from nltk.corpus import stopwords
# Download the NLTK stop-word corpus used by the text cleaning step.
nltk.download('stopwords')
# Rebinds the name `stopwords` from the imported corpus reader to the English
# stop words themselves; later cells use `stopwords` as that collection.
stopwords = stopwords.words('english')
# Use matplotlib's 'ggplot' style for any figures.
plt.style.use('ggplot')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
from datasets import load_dataset
ds = load_dataset("hugginglearners/netflix-shows")

In [None]:
ds

DatasetDict({
    train: Dataset({
        features: ['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added', 'release_year', 'rating', 'duration', 'listed_in', 'description'],
        num_rows: 8807
    })
})

In [None]:
df = ds["train"].to_pandas()

In [None]:
# NOTE(review): the next cell reassigns `df2`, so this empty frame is discarded
# immediately and the cell has no lasting effect — candidate for removal.
df2 = pd.DataFrame(columns=['title','director','cast','release_year','rating','description'])

In [None]:
# Keep only the columns that feed the search text. `.copy()` makes df2 an
# independent frame rather than a slice of `df`, so the column assignments in
# later cells no longer raise SettingWithCopyWarning.
df2 = df[['title','director','cast','release_year','rating','description']].copy()

In [None]:
df2

Unnamed: 0,title,director,cast,release_year,rating,description
0,Dick Johnson Is Dead,Kirsten Johnson,,2020,PG-13,"As her father nears the end of his life, filmm..."
1,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",2021,TV-MA,"After crossing paths at a party, a Cape Town t..."
2,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",2021,TV-MA,To protect his family from a powerful drug lor...
3,Jailbirds New Orleans,,,2021,TV-MA,"Feuds, flirtations and toilet talk go down amo..."
4,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",2021,TV-MA,In a city of coaching centers known to train I...
...,...,...,...,...,...,...
8802,Zodiac,David Fincher,"Mark Ruffalo, Jake Gyllenhaal, Robert Downey J...",2007,R,"A political cartoonist, a crime reporter and a..."
8803,Zombie Dumb,,,2018,TV-Y7,"While living alone in a spooky town, a young g..."
8804,Zombieland,Ruben Fleischer,"Jesse Eisenberg, Woody Harrelson, Emma Stone, ...",2009,R,Looking to survive in a world taken over by zo...
8805,Zoom,Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...",2006,PG,"Dragged from civilian life, a former superhero..."


Preprocessing:

In [None]:
def process_text(text, stop_words=None):
    """Normalise free text for embedding: strip markup/punctuation, lowercase, drop stop words.

    Args:
        text: The raw string to clean. Non-string values (e.g. ``None`` or NaN
            for a missing description) are returned unchanged instead of
            raising, so missing values stay recognisable as missing.
        stop_words: Optional iterable of lowercase words to remove. When
            omitted, the notebook-level ``stopwords`` collection is used.

    Returns:
        The cleaned text as a single space-separated lowercase string, or the
        original value when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return text

    # A set gives O(1) membership tests instead of scanning a list per word.
    excluded = frozenset(stopwords if stop_words is None else stop_words)

    # Replace line-break tags with a space (not an empty string) so the words
    # on either side of the tag are not glued together.
    text = re.sub(r'<br\s*/?>', ' ', text, flags=re.IGNORECASE)
    # Drop punctuation but keep every kind of whitespace; tabs and newlines
    # must survive until split() or neighbouring words would be merged.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    return " ".join(word for word in text.lower().split() if word not in excluded)

In [None]:
# assign() returns a new frame instead of writing a column into df2 in place,
# which avoids SettingWithCopyWarning when df2 is a slice of another frame.
df2 = df2.assign(description=df2['description'].apply(process_text))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['description'] = df2['description'].apply(process_text)


In [None]:
df2['title'].unique()

array(['Dick Johnson Is Dead', 'Blood & Water', 'Ganglands', ...,
       'Zombieland', 'Zoom', 'Zubaan'], dtype=object)

In [None]:
# Rebind to the de-duplicated result rather than mutating in place; in-place
# edits on a frame that may be a slice trigger SettingWithCopyWarning.
df2 = df2.drop_duplicates()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2.drop_duplicates(inplace=True)


In [None]:
# Replace missing values with a placeholder so the string concatenation that
# builds the overview text does not propagate NaN. Rebinding (instead of
# inplace=True) avoids SettingWithCopyWarning on a sliced frame.
df2 = df2.fillna('Not found')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2.fillna('Not found', inplace=True)


In [None]:
# Build the text that gets embedded for each title. The release year is cast to
# str and actually included after its label (it was previously omitted, leaving
# "Release year:  Rating: ..." in every overview). assign() returns a new frame
# instead of writing into a possible slice.
df2 = df2.assign(
    Overview=(
        'Title: ' + df2['title']
        + ' Description: ' + df2['description']
        + ' Director: ' + df2['director']
        + ' Cast: ' + df2['cast']
        + ' Release year: ' + df2['release_year'].astype(str)
        + ' Rating: ' + df2['rating']
    )
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['Overview'] = 'Title: ' + df2['title'] + ' Description: ' + df2['description'] + ' Director: ' + df2['director'] + ' Cast: ' + df2['cast'] + ' Release year: ' + ' Rating: ' + df2['rating']


Summarization:

In [None]:
from transformers import pipeline
import pandas as pd
# Load the facebook/bart-large-cnn summarization pipeline.
# NOTE(review): `summarizer` is not referenced by any later cell visible here —
# confirm it is still needed before paying the model download/load cost.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [None]:
print(df2.columns)

Index(['title', 'director', 'cast', 'release_year', 'rating', 'description',
       'Overview'],
      dtype='object')


In [None]:
!pip install sentence_transformers



Search Engine:

In [None]:
from sentence_transformers import SentenceTransformer
# Embedding model used for both the catalogue texts and the search queries.
# NOTE(review): trust_remote_code=True allows Python code shipped in the model
# repository to run locally — confirm the repository is trusted (ideally pin a revision).
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True)



In [None]:
# `sentences` keeps the overview strings in row order; search results are later
# looked up in this list by position.
sentences = df2['Overview'].tolist()
# One embedding per overview, in the same order as `sentences`.
# NOTE(review): the nomic-embed-text-v1 model card describes task prefixes such as
# "search_document: " / "search_query: "; none are added here — confirm whether that is intended.
embeddings = model.encode(sentences)

In [None]:
# Store one vector per row. assign() returns a new frame instead of writing a
# column into df2 in place, avoiding SettingWithCopyWarning on a sliced frame.
df2 = df2.assign(embeddings=list(embeddings))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['embeddings'] = list(embeddings)


In [None]:
df2.to_pickle('movie_embeddings.pkl')

In [None]:
!pip install faiss-Gpu
import faiss



In [None]:
# Stack the per-row vectors into one (n_rows, dim) float32 matrix. A plain
# Python list has no `.shape` and cannot be passed to index.train()/index.add(),
# so on a fresh top-to-bottom run the following cells would fail with a list here.
embeddings = np.vstack(df2['embeddings'].tolist()).astype('float32')

In [None]:
d = embeddings.shape[1]

In [None]:
d

768

In [None]:
# Inverted-file (IVF) index over d-dimensional vectors: a flat L2 index is passed
# as the coarse quantizer and `nlist` is the number of cells handed to IndexIVFFlat.
nlist = 100
quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFFlat(quantizer, d, nlist)
# The logged output shows is_trained == False here; the index is trained in the next cell.
# NOTE(review): `index.nprobe` is never set, so the library default applies — confirm
# recall is acceptable, since few probed cells can also return fewer than k hits.
index.is_trained

False

In [None]:
index.train(embeddings)
index.is_trained

True

In [None]:
index.add(embeddings)
index.ntotal

8807

In [None]:
k = 10
xq = model.encode(['give me an adam sandler cool movie'])

In [None]:
D, I = index.search(xq, k)

In [None]:
from pprint import pprint
# Use a list (not a set) so results stay in nearest-first order, and skip the
# -1 ids FAISS uses as padding, which would otherwise index the last sentence.
pprint([sentences[i] for i in I[0] if i >= 0])

{'Title: 50 First Dates Description: falling pretty art teacher shortterm '
 'memory marine veterinarian win every single day Director: Peter Segal Cast: '
 'Adam Sandler, Drew Barrymore, Rob Schneider, Sean Astin, Lusia Strus, Dan '
 'Aykroyd, Amy Hill, Allen Covert, Blake Clark, Maya Rudolph Release year:  '
 'Rating: PG-13',
 'Title: Anger Management Description: gentle businessman forced get anger '
 'management counseling therapist moves turns anger issues Director: Peter '
 'Segal Cast: Adam Sandler, Jack Nicholson, Marisa Tomei, Luis Guzmán, Allen '
 'Covert, Lynne Thigpen, Kurt Fuller, Jonathan Loughran, Krista Allen, January '
 'Jones, Woody Harrelson, John Turturro, Kevin Nealon Release year:  Rating: '
 'PG-13',
 'Title: Big Daddy Description: dumped girlfriend refuses accept '
 'responsibility overgrown adolescent sonny koufax adopts 5yearold prove hes '
 'grownup Director: Dennis Dugan Cast: Adam Sandler, Joey Lauren Adams, Jon '
 'Stewart, Cole Sprouse, Dylan Sprouse, Jos

In [None]:
def search(query, k=10):
    """Return the overview texts of the titles closest to ``query``.

    Args:
        query: Free-text search string; it is embedded with the notebook-level
            ``model``.
        k: Maximum number of results to return. Taken as a parameter (default
            10) so the function no longer depends on a global ``k`` defined in
            another cell.

    Returns:
        A list of up to ``k`` strings from ``sentences``, nearest first.
    """
    xq = model.encode([query])
    _, ids = index.search(xq, k)
    # FAISS pads the id array with -1 when fewer than k neighbours are found;
    # without the filter, sentences[-1] would silently return the last title.
    return [sentences[i] for i in ids[0] if i >= 0]

In [None]:
def greet(query):
    """Entry point for the UI and the prompt builder: run a search for ``query``.

    Args:
        query: Free-text search string.

    Returns:
        Whatever ``search(query)`` returns (a list of overview strings).
    """
    # search() embeds the query itself, so the unused local `k` and the extra
    # model.encode() call that used to live here were pure wasted work.
    return search(query)

In [None]:
!pip install openai==0.27.7

Collecting openai==0.27.7
  Downloading openai-0.27.7-py3-none-any.whl.metadata (13 kB)
Downloading openai-0.27.7-py3-none-any.whl (71 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/72.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.0/72.0 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: openai
Successfully installed openai-0.27.7


In [None]:
from google.colab import userdata
import openai
openai.api_key = userdata.get('openai_api_key')

In [None]:
def generate_answer(query, user_name="Mohammed", model_name="gpt-4o-mini",
                    temperature=0.2, max_tokens=1500):
    """Ask the chat model for a recommendation grounded in the retrieved titles.

    Args:
        query: The user's free-text request; also used to retrieve context via
            ``greet``.
        user_name: Name the model is told to address the user by.
        model_name: Chat model identifier passed to the OpenAI API. Named
            ``model_name`` so it does not shadow the notebook-level embedding
            ``model``.
        temperature: Sampling temperature; higher values give more creative
            (and more hallucination-prone) output.
        max_tokens: Upper bound on the length of the generated answer.

    Returns:
        The generated answer text with surrounding whitespace stripped.
    """
    # Retrieve once, outside the f-string, and render one result per line so
    # the prompt carries readable text rather than the repr of a Python list.
    context = "\n".join(f"- {item}" for item in greet(query))

    prompt = f"""
    Based on the following query from a user, please generate a detailed answer based on the context,
    focusing on which is the top movie based on the query. You should respond as if you are a movie recommendation agent conversing with the
    user in a nice, cordial way. Generate the answer as if you are a poet. Make sure to address the user as {user_name}.
    Remove special characters and stray newline markers; make the output clean and concise.

    ###########
    query:
    "{query}"

    ########

    context:
    {context}
    #####

    Return in Markdown format with each movie highlighted.
    """

    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt}
    ]
    response = openai.ChatCompletion.create(
        model=model_name,
        max_tokens=max_tokens,
        n=1,
        stop=None,
        temperature=temperature,
        messages=messages,
    )

    # Extract the generated response from the API response
    generated_text = response.choices[0].message['content'].strip()

    return generated_text

In [None]:
txt= generate_answer('give me an adam sandler cool movie')

In [None]:
import markdown
from IPython.display import display, HTML

def render_markdown(md_text):
    """Show a Markdown string as rendered HTML in the cell output.

    Args:
        md_text: Markdown source text.
    """
    # Markdown -> HTML -> rich display, in a single expression.
    display(HTML(markdown.markdown(md_text)))

In [None]:
render_markdown(txt)

In [None]:
!pip install gradio
import gradio as gr

Collecting gradio
  Downloading gradio-4.41.0-py3-none-any.whl.metadata (15 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi (from gradio)
  Downloading fastapi-0.112.0-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.3.0 (from gradio)
  Downloading gradio_client-1.3.0-py3-none-any.whl.metadata (7.1 kB)
Collecting httpx>=0.24.1 (from gradio)
  Downloading httpx-0.27.0-py3-none-any.whl.metadata (7.2 kB)
Collecting orjson~=3.0 (from gradio)
  Downloading orjson-3.10.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.9 (from gradi

Gradio:

In [None]:

# Wrap greet() in a Gradio interface: a text box in, the returned value shown as JSON.
demo = gr.Interface(fn=greet, inputs="text", outputs="json")


# share=True asks Gradio for a public URL (the logged output shows a *.gradio.live
# link), so anyone holding the link can query the app while it is running.
demo.launch(share=True,debug=False)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://4c7f9ffc769fa1b57c.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


