In [10]:
import polars as pl
from qdrant_client import QdrantClient, models
from sentence_transformers import SentenceTransformer

encoder = SentenceTransformer("all-MiniLM-L6-v2")

df = pl.read_parquet("gs://rag-open-data/imdb/standard/imdb_reviews.parquet")

In [11]:
display(df)

imdb_id,review_title,review_rating,review,title,rating,genre,year
str,str,i64,str,str,f64,str,i64
"""tt0369610""","""Spielberg Magic, This Is Not. …",7,"""You may have heard some critic…","""Jurassic World""",6.9,"""Action, Adventure, Sci-Fi""",2015
"""tt0369610""","""Not a patch on the original Ju…",7,"""The original Jurassic Park sti…","""Jurassic World""",6.9,"""Action, Adventure, Sci-Fi""",2015
"""tt0369610""","""Its a Jurassic World after all…",7,"""The 4th film in the Jurassic P…","""Jurassic World""",6.9,"""Action, Adventure, Sci-Fi""",2015
"""tt0369610""","""Worthy Sequel To One Of The Gr…",,"""Let's start this by stating ho…","""Jurassic World""",6.9,"""Action, Adventure, Sci-Fi""",2015
"""tt0369610""","""Manages to somewhat return the…",6,"""Modernized and polished entry …","""Jurassic World""",6.9,"""Action, Adventure, Sci-Fi""",2015
…,…,…,…,…,…,…,…
"""tt27403986""","""About a man""",9,"""Superb casting with Nicholas H…","""Juror #2""",7.0,"""Crime, Drama, Mystery""",2024
"""tt27403986""","""Not quite a unanimous verdict""",6,"""This movie had the potential t…","""Juror #2""",7.0,"""Crime, Drama, Mystery""",2024
"""tt27403986""","""A house built on sand""",5,"""I doubt that this case would e…","""Juror #2""",7.0,"""Crime, Drama, Mystery""",2024
"""tt27403986""","""He's just a regular guy.""",6,"""So decidedly un-Eastwood I was…","""Juror #2""",7.0,"""Crime, Drama, Mystery""",2024


In [4]:
# Use local memory for the Qdrant client
client = QdrantClient(url="http://34.87.227.185:6333")

In [13]:
client.create_collection(
    collection_name="imdb_reviews",
    vectors_config=models.VectorParams(
        size=encoder.get_sentence_embedding_dimension(),  # Vector size is defined by used model, should be the ouput of the embedding encoder
        distance=models.Distance.COSINE,
    ),
)

True

In [9]:
df.to_dicts()

[{'index': 0,
  'imdb_id': 'tt0369610',
  'review_title': 'Spielberg Magic, This Is Not. Still, a Visit to Jurassic World Is Worth the Price of Admission.',
  'review_rating': 7,
  'review': 'You may have heard some critics champion Jurassic World as "The best Jurassic Park sequel", some fans declare that it "brought them back to their childhood", and others who may have made the absurd claim, "It\'s better than the original". Don\'t believe the hype. Jurassic World is nowhere close to the best Jurassic Park sequel (Spielberg\'s own, The Lost World: Jurassic Park, will always have that title). It is not going to bring you back to your childhood, and it doesn\'t hold a candle to what Steven Spielberg and crew accomplished with the original Jurassic Park. That being said, in a time of dark, self-serious, and pretentious blockbusters such as last year\'s Dawn of the Planet of the Apes or Christopher Nolan\'s Dark Knight films, Jurassic World is a refreshing antidote. Light, wholesome, and

In [22]:
display(
    df.with_columns(
        pl.lit(encoder.encode(df["review"].to_list())).alias("vector")
    )
)

imdb_id,review_title,review_rating,review,title,rating,genre,year,vector
str,str,i64,str,str,f64,str,i64,"array[f32, 384]"
"""tt0369610""","""Spielberg Magic, This Is Not. …",7,"""You may have heard some critic…","""Jurassic World""",6.9,"""Action, Adventure, Sci-Fi""",2015,"[-0.044333, -0.054567, … 0.023825]"
"""tt0369610""","""Not a patch on the original Ju…",7,"""The original Jurassic Park sti…","""Jurassic World""",6.9,"""Action, Adventure, Sci-Fi""",2015,"[-0.048426, -0.0501, … -0.008791]"
"""tt0369610""","""Its a Jurassic World after all…",7,"""The 4th film in the Jurassic P…","""Jurassic World""",6.9,"""Action, Adventure, Sci-Fi""",2015,"[-0.042069, -0.049041, … -0.009198]"
"""tt0369610""","""Worthy Sequel To One Of The Gr…",,"""Let's start this by stating ho…","""Jurassic World""",6.9,"""Action, Adventure, Sci-Fi""",2015,"[-0.057659, -0.071513, … -0.004398]"
"""tt0369610""","""Manages to somewhat return the…",6,"""Modernized and polished entry …","""Jurassic World""",6.9,"""Action, Adventure, Sci-Fi""",2015,"[-0.04451, -0.017653, … -0.00966]"
…,…,…,…,…,…,…,…,…
"""tt27403986""","""About a man""",9,"""Superb casting with Nicholas H…","""Juror #2""",7.0,"""Crime, Drama, Mystery""",2024,"[-0.083481, -0.010166, … -0.032365]"
"""tt27403986""","""Not quite a unanimous verdict""",6,"""This movie had the potential t…","""Juror #2""",7.0,"""Crime, Drama, Mystery""",2024,"[-0.030578, 0.00149, … -0.051971]"
"""tt27403986""","""A house built on sand""",5,"""I doubt that this case would e…","""Juror #2""",7.0,"""Crime, Drama, Mystery""",2024,"[-0.037786, 0.086795, … -0.002657]"
"""tt27403986""","""He's just a regular guy.""",6,"""So decidedly un-Eastwood I was…","""Juror #2""",7.0,"""Crime, Drama, Mystery""",2024,"[-0.059246, -0.050208, … -0.03485]"


In [23]:
df.columns

['imdb_id',
 'review_title',
 'review_rating',
 'review',
 'title',
 'rating',
 'genre',
 'year']

In [37]:
df[['imdb_id',
 'review_title']].to_dicts()

[{'imdb_id': 'tt0369610',
  'review_title': 'Spielberg Magic, This Is Not. Still, a Visit to Jurassic World Is Worth the Price of Admission.'},
 {'imdb_id': 'tt0369610',
  'review_title': 'Not a patch on the original Jurassic Park, and is an uneven film, but by far the best of the sequels'},
 {'imdb_id': 'tt0369610', 'review_title': 'Its a Jurassic World after all.'},
 {'imdb_id': 'tt0369610',
  'review_title': 'Worthy Sequel To One Of The Greatest Films Ever Made'},
 {'imdb_id': 'tt0369610',
  'review_title': "Manages to somewhat return the Jurassic Park series to it's former glory, but its still a few notches below Spielberg's first two entries in the series."},
 {'imdb_id': 'tt0369610',
  'review_title': 'Very good but more a remake of the first two movies than a sequel to the series.'},
 {'imdb_id': 'tt0369610',
  'review_title': "The good news: It's vast improvement over Jurassic Park 3. The bad news: it's the most predictable entry in the series"},
 {'imdb_id': 'tt0369610',
  're

In [None]:
encoder.encode(df["review"].to_list())

KeyboardInterrupt: 

In [15]:
client.upload_points(
    collection_name="imdb_reviews",
    points=[
        models.PointStruct(
            id=idx,
            vector=encoder.encode(doc["review"]).tolist(),
            payload=doc
        )
        for idx, doc in enumerate(df.to_dicts())
    ],
)

In [14]:
client.upsert(
    collection_name="imdb_reviews",
    points=[
        models.PointStruct(
            id=idx,
            vector=encoder.encode(doc["review"]).tolist(),
            payload=doc
        )
        for idx, doc in enumerate(df.to_dicts())
    ]
)

ResponseHandlingException: timed out

In [16]:
hits = client.query_points(
    collection_name="imdb_reviews",
    query=encoder.encode("What users think about the film Jurassic Park ").tolist(),
    limit=5,
).points

for hit in hits:
    print(hit.payload, "score:", hit.score)

{'imdb_id': 'tt0369610', 'review_title': "If you're looking to sit on the edge of your seat for two hours, welcome!", 'review_rating': 10, 'review': "First of all I think people need to understand that film is a lot like music, you can't listen to a new Justin Bieber song and complain because it's not a Mozart masterpiece. And you definitely can't watch Jurassic World expecting a mind-boggling, emotional Inception type film. I would put Jurassic World in the same category as most Michael Bay films, the story line won't make you think, the characters won't go through any psychological changes but you will be thoroughly entertained by the special effects. I think it's fairly obvious throughout the original trilogy and this film that we are the visitors to the attractions, Steven Spielberg gave the parallel Earth a new unimaginable attraction, one which surprised the audience so much, it's rated as one of the best films ever. Period. Fast forward twenty two years past the films of Interst