In [1]:
from datasets import load_dataset

# Read the cleaned movie-details Parquet file hosted on Hugging Face,
# requesting the "train" split.
dataset = load_dataset(
    "parquet",
    split="train",
    data_files={
        "train": "https://huggingface.co/datasets/SathvikVeerapaneni7/CineAI_Dataset/resolve/main/parquet_files/details_df_clean.parquet"
    },
)

# Turn the loaded dataset into a pandas DataFrame and preview its first rows
df_details = dataset.to_pandas()
df_details.head()


Unnamed: 0,movie_id,title,overview,release_date,runtime,original_language,popularity,genres_str
0,98.0,Gladiator,"In the year 180, the death of Emperor Marcus A...",2000-05-04,155.0,en,497.649,['Action' 'Drama' 'Adventure']
1,8871.0,How the Grinch Stole Christmas,The Grinch decides to rob Whoville of Christma...,2000-11-17,104.0,en,177.539,['Family' 'Comedy' 'Fantasy']
2,7443.0,Chicken Run,The creators of Wallace & Gromit bring you an ...,2000-06-23,84.0,en,62.638,['Animation' 'Comedy' 'Family']
3,9532.0,Final Destination,After a teenager has a terrifying vision of hi...,2000-03-17,98.0,en,54.097,['Horror']
4,77.0,Memento,Leonard Shelby is tracking down the man who ra...,2000-10-11,113.0,en,95.374,['Mystery' 'Thriller']


In [12]:
# Show every record whose 'title' is missing
null_rows = df_details.loc[df_details['title'].isna()]
print(null_rows)


       movie_id title overview release_date  runtime original_language  \
65873  679782.0  None     None          NaT      NaN              None   

       popularity genres_str  
65873         NaN         []  


In [103]:
# Preview the 'title' column
df_details.loc[:, 'title']

0                              Gladiator
1         How the Grinch Stole Christmas
2                            Chicken Run
3                      Final Destination
4                                Memento
                       ...              
230581                        Precognito
230582             Giddh (The Scavenger)
230583                       Gulaam Chor
230584                       Due battiti
230585            Koromousso, Big Sister
Name: title, Length: 230585, dtype: object

In [17]:
# Collect every row whose 'movie_id' occurs more than once
# (keep=False marks all occurrences, not just the repeats)
duplicates = df_details.loc[df_details['movie_id'].duplicated(keep=False)]
print(duplicates)


        movie_id       title  \
65873   679782.0        None   
186796  679782.0  Foster Boy   

                                                 overview release_date  \
65873                                                None          NaT   
186796  A lawyer finds himself at the center of a tria...   2019-10-03   

        runtime original_language  popularity                      genres_str  
65873       NaN              None         NaN                              []  
186796    109.0                en       3.507  ['Drama' 'Mystery' 'Thriller']  


In [18]:
# Drop the record for movie_id 679782.0 that has no title. Expressed as a
# "keep" mask: a row stays when its movie_id differs OR its title is present.
df_details = df_details[(df_details['movie_id'] != 679782.0) | df_details['title'].notnull()]

In [19]:
# Confirm which rows remain for movie_id 679782.0
print(df_details.loc[df_details['movie_id'].eq(679782.0)])

        movie_id       title  \
186796  679782.0  Foster Boy   

                                                 overview release_date  \
186796  A lawyer finds himself at the center of a tria...   2019-10-03   

        runtime original_language  popularity                      genres_str  
186796    109.0                en       3.507  ['Drama' 'Mystery' 'Thriller']  


In [21]:
# Re-check for missing titles (expected: none)
null_rows = df_details.loc[df_details['title'].isna()]
print(null_rows)


Empty DataFrame
Columns: [movie_id, title, overview, release_date, runtime, original_language, popularity, genres_str]
Index: []


In [94]:
from qdrant_client import QdrantClient

# Connect to the Qdrant instance on localhost:6333
client = QdrantClient(host="localhost", port=6333)

# Fetch a single point (payload included) to inspect what is stored
points, _ = client.scroll(
    "movie_collection_test_sample_1",
    with_payload=True,
    limit=1,
)

for record in points:
    print(record)


id='00000fb9-5bf1-44d5-a1ed-c844f01de307' payload={'movie_id': 112774.0, 'runtime': 0.0, 'release_date': '2003-01-21', 'combined_text': 'Movie name is Black Listed. Its description is Alan Chambers (Townsend) is a lawyer and sick of his job. He corrals his friends into forming a vigilante group. Everyone goes along with it at first, until it turns sour and innocent people start dying, and Alan is the only one left standing.. Movie genres are Action, Thriller, Drama', 'movie_id_1': 112774, 'runtime_1': 0} vector=None shard_key=None order_value=None


In [100]:
# Report the Python type of each point's id and of the payload fields of
# interest, one "<key>: <type name>" line per field.
payload_keys = (
    "movie_id",
    "runtime",
    "release_date",
    "combined_text",
    "movie_id_1",
    "runtime_1",
)

for point in points:
    print("id:", type(point.id).__name__)
    for key in payload_keys:
        # The label is the payload key itself, so the float field ("runtime")
        # and its int copy ("runtime_1") are reported under distinct names.
        print(f"{key}:", type(point.payload[key]).__name__)
    print("---------------------")


id: str
movie_id: float
runtime: float
release_date: str
combined_text: str
movie_id_1: int
runtime: int
---------------------


In [101]:
# Print each fetched point in full
for point in points:
    print(point)

id='00000fb9-5bf1-44d5-a1ed-c844f01de307' payload={'movie_id': 112774.0, 'runtime': 0.0, 'release_date': '2003-01-21', 'combined_text': 'Movie name is Black Listed. Its description is Alan Chambers (Townsend) is a lawyer and sick of his job. He corrals his friends into forming a vigilante group. Everyone goes along with it at first, until it turns sour and innocent people start dying, and Alan is the only one left standing.. Movie genres are Action, Thriller, Drama', 'movie_id_1': 112774, 'runtime_1': 0} vector=None shard_key=None order_value=None


Searching for movie ID 98: Qdrant won't match on a float value, so we need to convert the ID to int.

In [61]:
# from qdrant_client import QdrantClient
# from qdrant_client.http import models

# client = QdrantClient(host="localhost", port=6333)

# scroll_filter = models.Filter(
#     must=[
#         models.FieldCondition(
#             key="movie_id",
#             match=models.MatchValue(value=112774.0),  # Filtering by 112774.0
#         )
#     ]
# )

# points, _ = client.scroll(
#     collection_name="movie_collection_test_sample_1",
#     limit=100,
#     with_payload=True,
#     scroll_filter=scroll_filter
# )

# for point in points:
#     print(point)


10 Movies Updated, float and int

In [None]:
from qdrant_client import QdrantClient
from qdrant_client.http import models

client = QdrantClient(host="localhost", port=6333)

# Source and target are the same collection, so points are rewritten in place.
source_collection = "movie_collection_test_sample_1"
target_collection = "movie_collection_test_sample_1"

# Fetch a single page of up to 1000 points; further pages are not processed here.
points, next_page = client.scroll(
    collection_name=source_collection,
    limit=1000,
    with_payload=True,
    with_vectors=True,  # Retrieve vectors to avoid validation error
)

updated_points = []
for point in points:
    new_payload = dict(point.payload or {})

    # Add an integer copy next to each float field. A field that is absent or
    # None is skipped, because int(None) raises TypeError.
    for float_key, int_key in (("movie_id", "movie_id_1"), ("runtime", "runtime_1")):
        value = new_payload.get(float_key)
        if value is not None:
            new_payload[int_key] = int(value)

    updated_points.append(
        models.PointStruct(
            id=point.id,
            vector=point.vector,  # Include the original vector
            payload=new_payload,
        )
    )

# Only write when the page actually contained points.
if updated_points:
    client.upsert(
        collection_name=target_collection,
        points=updated_points,
    )

for up_point in updated_points:
    print(up_point)


In [92]:
# for i in updated_points:
#     # print(i.payload["movie_id"])
#     print(i.payload["runtime_1"])

In [89]:
from qdrant_client import QdrantClient
from qdrant_client.http import models

client = QdrantClient(host="localhost", port=6333)

# Source and target are the same collection, so points are rewritten in place.
source_collection = "movie_collection_test_sample_1"
target_collection = "movie_collection_test_sample_1"

limit = 1000
# The scroll offset is a point ID (the cursor returned by the previous page),
# not a row number; None means "start from the first point".
offset = None

# Walk the collection page by page, adding integer copies of the float
# `movie_id` / `runtime` payload fields as `movie_id_1` / `runtime_1`.
while True:
    points, next_page_offset = client.scroll(
        collection_name=source_collection,
        limit=limit,
        with_payload=True,
        with_vectors=True,  # vectors are needed because upsert rewrites whole points
        offset=offset,
    )

    if not points:
        break

    updated_points = []
    for point in points:
        new_payload = dict(point.payload or {})

        # Convert each field only when it is present and not None
        # (int(None) would raise TypeError).
        for float_key, int_key in (("movie_id", "movie_id_1"), ("runtime", "runtime_1")):
            value = new_payload.get(float_key)
            if value is not None:
                new_payload[int_key] = int(value)

        updated_points.append(
            models.PointStruct(
                id=point.id,
                vector=point.vector,
                payload=new_payload,
            )
        )

    # `points` is non-empty here, so there is always something to write.
    client.upsert(
        collection_name=target_collection,
        points=updated_points,
    )

    # A missing next-page cursor marks the last page.
    if next_page_offset is None:
        break

    offset = next_page_offset


Every point's `movie_id` has now been copied into `movie_id_1` as an int.

In [90]:
from qdrant_client import QdrantClient

client = QdrantClient(host="localhost", port=6333)
collection_name = "movie_collection_test_sample_1"

limit = 1000
# The scroll offset is a point ID (the cursor returned by the previous page),
# not a row number; None means "start from the first point".
offset = None
count_with_movie_id_1 = 0
count_with_runtime_1 = 0

# Page through the whole collection (payload only) and count the points whose
# payload contains each of the integer keys.
while True:
    points, next_offset = client.scroll(
        collection_name=collection_name,
        limit=limit,
        with_payload=True,
        with_vectors=False,
        offset=offset,
    )

    if not points:
        break

    count_with_movie_id_1 += sum("movie_id_1" in point.payload for point in points)
    count_with_runtime_1 += sum("runtime_1" in point.payload for point in points)

    # A missing next-page cursor marks the last page.
    if next_offset is None:
        break

    offset = next_offset

print("Points with movie_id_1:", count_with_movie_id_1)
print("Points with runtime_1:", count_with_runtime_1)


Points with movie_id_1: 230586
Points with runtime_1: 230585


## Adding movie_title to the collection

In [112]:
import pandas as pd
from collections import defaultdict
from qdrant_client import QdrantClient
from qdrant_client.http import models

client = QdrantClient(host="localhost", port=6333)
collection_name = "movie_collection_test_sample_1"


def _none_if_missing(value):
    """Return None for a missing scalar (None/NaN/NaT), otherwise the value unchanged."""
    return None if pd.isna(value) else value


# 1) Index every point of the collection by its integer `movie_id_1`.
#    Vectors are fetched as well because the upsert below rewrites whole points.
points_by_movie_id = defaultdict(list)
limit = 10_000
# The scroll offset is a point ID (the cursor returned by the previous page),
# not a row number; None means "start from the first point".
offset = None

while True:
    points, next_offset = client.scroll(
        collection_name=collection_name,
        limit=limit,
        with_payload=True,
        with_vectors=True,
        offset=offset,
    )
    if not points:
        break

    for p in points:
        movie_id_1 = p.payload.get("movie_id_1")
        if movie_id_1 is not None:
            points_by_movie_id[movie_id_1].append(p)

    # A missing next-page cursor marks the last page.
    if next_offset is None:
        break
    offset = next_offset

# 2) For each row in df_details, copy title, overview and original language
#    into the payload of every point that shares the row's movie id.
updated_points = []
batch_size = 1000

for _, row in df_details.iterrows():
    # Skip if row has no valid movie_id
    if pd.isna(row["movie_id"]):
        continue

    # .get() looks the id up without inserting an empty list into the defaultdict.
    matching_points = points_by_movie_id.get(int(row["movie_id"]))
    if not matching_points:
        continue

    # Missing values are normalised to None so that no NaN (which is not a
    # valid JSON value) is written into the payload.
    new_fields = {
        "movie_title": _none_if_missing(row.get("title", "")),
        "movie_overview": _none_if_missing(row.get("overview", "")),
        "movie_original_language": _none_if_missing(row.get("original_language", "")),
    }

    for p in matching_points:
        p.payload.update(new_fields)
        updated_points.append(
            models.PointStruct(
                id=p.id,
                vector=p.vector,
                payload=p.payload,
            )
        )

        # Upsert in batches; start a fresh list rather than clearing the one
        # that was just handed to the client.
        if len(updated_points) >= batch_size:
            client.upsert(collection_name, updated_points)
            updated_points = []

# Final upsert if any remain
if updated_points:
    client.upsert(collection_name, updated_points)
    updated_points = []

print("Done adding DataFrame columns into Qdrant payload.")


Done adding DataFrame columns into Qdrant payload.


Code block to check how many left

In [105]:
from qdrant_client import QdrantClient

# Connect to the Qdrant instance on localhost:6333
client = QdrantClient(host="localhost", port=6333)
collection_name = "movie_collection_test_sample_1"

# Read the collection description and take its point count
info = client.get_collection(collection_name)
total_points = info.points_count

print("Total points in collection:", total_points)


Total points in collection: 230586


In [107]:
from qdrant_client import QdrantClient

client = QdrantClient(host="localhost", port=6333)
collection_name = "movie_collection_test_sample_1"

limit = 1000
# The scroll offset is a point ID (the cursor returned by the previous page),
# not a row number; None means "start from the first point".
offset = None
has_movie_id_1 = 0
has_runtime_1 = 0

# Page through the whole collection (payload only) and count the points whose
# payload contains each of the integer keys.
while True:
    points, next_offset = client.scroll(
        collection_name=collection_name,
        limit=limit,
        with_payload=True,
        with_vectors=False,
        offset=offset,
    )

    if not points:
        break

    has_movie_id_1 += sum("movie_id_1" in point.payload for point in points)
    has_runtime_1 += sum("runtime_1" in point.payload for point in points)

    # A missing next-page cursor marks the last page.
    if next_offset is None:
        break

    offset = next_offset

print("Points with movie_id_1:", has_movie_id_1)
print("Points with runtime_1:", has_runtime_1)


Points with movie_id_1: 230586
Points with runtime_1: 230585


In [108]:
from qdrant_client import QdrantClient

client = QdrantClient(host="localhost", port=6333)
collection_name = "movie_collection_test_sample_1"

limit = 1000
# The scroll offset is a point ID (the cursor returned by the previous page),
# not a row number; None means "start from the first point".
offset = None

count_movie_id_1_value = 0
count_runtime_1_value = 0

# Page through the whole collection (payload only) and count the points whose
# integer keys are present AND hold a non-None value.
while True:
    points, next_offset = client.scroll(
        collection_name=collection_name,
        limit=limit,
        with_payload=True,
        with_vectors=False,
        offset=offset,
    )

    if not points:
        break

    for point in points:
        payload = point.payload

        # .get() yields None both for an absent key and for a stored None,
        # so one test covers "exists and is not None".
        if payload.get("movie_id_1") is not None:
            count_movie_id_1_value += 1

        if payload.get("runtime_1") is not None:
            count_runtime_1_value += 1

    # A missing next-page cursor marks the last page.
    if next_offset is None:
        break

    offset = next_offset

print("Points with a non-null movie_id_1:", count_movie_id_1_value)
print("Points with a non-null runtime_1:", count_runtime_1_value)


Points with a non-null movie_id_1: 230586
Points with a non-null runtime_1: 230585


In [109]:
from qdrant_client import QdrantClient

client = QdrantClient(host="localhost", port=6333)
collection_name = "movie_collection_test_sample_1"

limit = 1000
# The scroll offset is a point ID (the cursor returned by the previous page),
# not a row number; None means "start from the first point".
offset = None

mismatch_count_movie = 0
mismatch_count_runtime = 0
total_compared_movie = 0
total_compared_runtime = 0

# Page through the whole collection (payload only) and check that every integer
# copy equals int() of the float field it was derived from.
while True:
    points, next_offset = client.scroll(
        collection_name=collection_name,
        limit=limit,
        with_payload=True,
        with_vectors=False,
        offset=offset,
    )

    if not points:
        break

    for point in points:
        payload = point.payload

        # Compare original movie_id (float) with movie_id_1 (int); a pair is
        # only compared when both values are present and not None.
        movie_id_float = payload.get("movie_id")
        movie_id_int = payload.get("movie_id_1")
        if movie_id_float is not None and movie_id_int is not None:
            total_compared_movie += 1
            if int(movie_id_float) != movie_id_int:
                mismatch_count_movie += 1

        # Same comparison for runtime (float) versus runtime_1 (int).
        runtime_float = payload.get("runtime")
        runtime_int = payload.get("runtime_1")
        if runtime_float is not None and runtime_int is not None:
            total_compared_runtime += 1
            if int(runtime_float) != runtime_int:
                mismatch_count_runtime += 1

    # A missing next-page cursor marks the last page.
    if next_offset is None:
        break
    offset = next_offset

print(f"Total points compared (movie_id): {total_compared_movie}")
print(f"Mismatches (movie_id): {mismatch_count_movie}")
print(f"Total points compared (runtime): {total_compared_runtime}")
print(f"Mismatches (runtime): {mismatch_count_runtime}")


Total points compared (movie_id): 230586
Mismatches (movie_id): 0
Total points compared (runtime): 230585
Mismatches (runtime): 0
