In [1]:
from qdrant_client import QdrantClient

client = QdrantClient(host="localhost", port=6333)
collection_name = "movie_collection_test_sample_1"

In [5]:
# Which fields do we want to check?

fields_to_check = [
    "movie_id_1",
    "runtime_1",
    "movie_title",
    "movie_overview_1",
    "release_date",
    "movie_original_language",
]


# movie_id: float
# runtime: NoneType, float

# release_date: NoneType, str
# combined_text: str, (embedding) 

# movie_id_1: int
# runtime_1: int

# movie_title: str
# movie_overview: str
# movie_original_language: str

In [6]:
# Keep a count of how many points have "missing" or "null/zero" values for each field
missing_count = {field: 0 for field in fields_to_check}

limit = 1000
offset = 0

while True:
    points, next_offset = client.scroll(
        collection_name=collection_name,
        limit=limit,
        with_payload=True,
        with_vectors=False,
        offset=offset
    )

    if not points:
        break

    for point in points:
        payload = point.payload

        for field in fields_to_check:
            value = payload.get(field, None)
            # Consider it "missing" if None or 0 (you can expand conditions as needed)
            if value is None or value == 0:
                missing_count[field] += 1

    if next_offset is None:
        break
    offset = next_offset

print("Finished scanning all points. Missing/Zero counts:")
for field in fields_to_check:
    print(f"{field}: {missing_count[field]}")


Finished scanning all points. Missing/Zero counts:
movie_id_1: 0
runtime_1: 30061
movie_title: 0
movie_overview_1: 230586
release_date: 6
movie_original_language: 0


In [21]:
from qdrant_client import QdrantClient

client = QdrantClient(host="localhost", port=6333)
collection_name = "movie_collection_test_sample_1"

limit = 1000
offset = 0

count_zero_or_missing = 0

while True:
    points, next_offset = client.scroll(
        collection_name=collection_name,
        limit=limit,
        with_payload=True,
        with_vectors=False,
        offset=offset,
    )

    if not points:
        break

    for point in points:
        payload = point.payload
        runtime_1 = payload.get("runtime_1", None)
        
        # Check if runtime_1 is missing (None) or zero
        if runtime_1 is None or runtime_1 == 0:
            count_zero_or_missing += 1
            # Print the relevant info for verification
            print(
                f"ID: {point.id}, runtime float: {payload.get('runtime')}, "
                f"runtime_1: {runtime_1}"
            )

    if next_offset is None:
        break
    offset = next_offset


In [22]:
print(f"Total points with missing or zero runtime_1: {count_zero_or_missing}")


Total points with missing or zero runtime_1: 0


In [10]:
import pandas as pd
from qdrant_client import QdrantClient
from qdrant_client.http import models

client = QdrantClient(host="localhost", port=6333)
collection_name = "movie_collection_test_sample_1"

# We'll take only 5 points from the collection to test
points, _ = client.scroll(
    collection_name=collection_name,
    limit=5,
    with_payload=True,
    with_vectors=True
)

removed_info = []
updated_points = 0

# Process these 5 points
for point in points:
    payload = dict(point.payload)
    runtime_1 = payload.get("runtime_1", None)

    # If runtime_1 is None or 0, remove it
    if runtime_1 is None or runtime_1 == 0:
        # Save the entire point info in removed_info
        point_data = {"id": point.id}
        point_data.update(payload)
        removed_info.append(point_data)

        # Remove runtime_1 key
        if "runtime_1" in payload:
            del payload["runtime_1"]

        # Upsert to remove the key
        client.upsert(
            collection_name=collection_name,
            points=[
                models.PointStruct(
                    id=point.id,
                    vector=point.vector,  # keep the original vector
                    payload=payload
                )
            ]
        )
        updated_points += 1

# Convert removed_info to a DataFrame
df_removed = pd.DataFrame(removed_info)

print(f"Updated {updated_points} points out of 5 tested.")
print("DataFrame of removed info:")
print(df_removed.head())


Updated 2 points out of 5 tested.
DataFrame of removed info:
                                     id   movie_id  runtime release_date  \
0  00000fb9-5bf1-44d5-a1ed-c844f01de307   112774.0      0.0   2003-01-21   
1  000012ea-6f96-40e0-8452-5c2e4446beef  1059254.0      0.0   2022-12-22   

                                       combined_text  movie_id_1  runtime_1  \
0  Movie name is Black Listed. Its description is...      112774          0   
1  Movie name is Purchè finisca bene - La fortuna...     1059254          0   

                                 movie_title  \
0                               Black Listed   
1  Purchè finisca bene - La fortuna di Laura   

                                      movie_overview movie_original_language  
0  Alan Chambers (Townsend) is a lawyer and sick ...                      en  
1                                                                         it  


In [11]:
df_removed.head()

Unnamed: 0,id,movie_id,runtime,release_date,combined_text,movie_id_1,runtime_1,movie_title,movie_overview,movie_original_language
0,00000fb9-5bf1-44d5-a1ed-c844f01de307,112774.0,0.0,2003-01-21,Movie name is Black Listed. Its description is...,112774,0,Black Listed,Alan Chambers (Townsend) is a lawyer and sick ...,en
1,000012ea-6f96-40e0-8452-5c2e4446beef,1059254.0,0.0,2022-12-22,Movie name is Purchè finisca bene - La fortuna...,1059254,0,Purchè finisca bene - La fortuna di Laura,,it


In [18]:
import pandas as pd
from qdrant_client import QdrantClient
from qdrant_client.http import models

def chunked(iterable, chunk_size=10000):
    """
    Utility generator to yield items from `iterable` in chunks of size `chunk_size`.
    """
    current = []
    for item in iterable:
        current.append(item)
        if len(current) == chunk_size:
            yield current
            current = []
    if current:
        yield current

client = QdrantClient(host="localhost", port=6333)
collection_name = "movie_collection_test_sample_1"

BATCH_SIZE_SCROLL = 1000  # batch size for scrolling
offset = 0

removed_points_data = []
ids_to_remove = []

while True:
    points, next_offset = client.scroll(
        collection_name=collection_name,
        limit=BATCH_SIZE_SCROLL,
        with_payload=True,
        with_vectors=False,
        offset=offset
    )

    if not points:
        break

    for point in points:
        payload = point.payload
        runtime_1 = payload.get("runtime_1", None)

        # If runtime_1 is None or 0, mark this entire point for removal
        if runtime_1 is None or runtime_1 == 0:
            ids_to_remove.append(point.id)

            # Keep full payload for reference
            point_data = {"id": point.id}
            point_data.update(payload)  # merges all payload keys
            removed_points_data.append(point_data)

    if next_offset is None:
        break
    offset = next_offset

print(f"Found {len(ids_to_remove)} points to remove in total.")

# Delete in chunks to avoid timeouts
deleted_count = 0
for chunk in chunked(ids_to_remove, chunk_size=10000):
    # Use PointIdsList instead of PointsSelector
    points_selector = models.PointIdsList(points=chunk)

    # Delete these points
    client.delete(collection_name=collection_name, points_selector=points_selector)
    deleted_count += len(chunk)
    print(f"Deleted {len(chunk)} points; total deleted so far: {deleted_count}")

print(f"Finished deletion; total {deleted_count} points removed.")

# Convert removed points' data to DataFrame
df_removed = pd.DataFrame(removed_points_data)
print(f"df_removed shape: {df_removed.shape}")
print(df_removed.head(5))
# Save to CSV if desired:
# df_removed.to_csv("removed_runtime_1_points.csv", index=False)


Found 30061 points to remove in total.
Deleted 10000 points; total deleted so far: 10000
Deleted 10000 points; total deleted so far: 20000
Deleted 10000 points; total deleted so far: 30000
Deleted 61 points; total deleted so far: 30061
Finished deletion; total 30061 points removed.
df_removed shape: (30061, 9)
                                     id   movie_id  runtime release_date  \
0  00000fb9-5bf1-44d5-a1ed-c844f01de307   112774.0      0.0   2003-01-21   
1  000012ea-6f96-40e0-8452-5c2e4446beef  1059254.0      0.0   2022-12-22   
2  0002c5c8-1b21-4fdd-9b30-bd5d827ce091   374168.0      0.0   2015-12-25   
3  0006a41a-5b4d-4072-836f-befdfc5094d3   963339.0      0.0   2008-05-15   
4  0006c229-228e-4d75-8f8b-4bd0947a6b85  1050656.0      0.0   2009-09-25   

                                       combined_text  movie_id_1  \
0  Movie name is Black Listed. Its description is...      112774   
1  Movie name is Purchè finisca bene - La fortuna...     1059254   
2  Movie name is Svatojánsk

In [19]:
print("Sample of removed info:")
df_removed.head()

Sample of removed info:


Unnamed: 0,id,movie_id,runtime,release_date,combined_text,movie_id_1,movie_title,movie_overview,movie_original_language
0,00000fb9-5bf1-44d5-a1ed-c844f01de307,112774.0,0.0,2003-01-21,Movie name is Black Listed. Its description is...,112774,Black Listed,Alan Chambers (Townsend) is a lawyer and sick ...,en
1,000012ea-6f96-40e0-8452-5c2e4446beef,1059254.0,0.0,2022-12-22,Movie name is Purchè finisca bene - La fortuna...,1059254,Purchè finisca bene - La fortuna di Laura,,it
2,0002c5c8-1b21-4fdd-9b30-bd5d827ce091,374168.0,0.0,2015-12-25,Movie name is Svatojánský věneček. Its descrip...,374168,Svatojánský věneček,,cs
3,0006a41a-5b4d-4072-836f-befdfc5094d3,963339.0,0.0,2008-05-15,Movie name is Doğum. Its description is . Movi...,963339,Doğum,,tr
4,0006c229-228e-4d75-8f8b-4bd0947a6b85,1050656.0,0.0,2009-09-25,Movie name is He Ping Jiang Jun Tao Shi Yue. I...,1050656,He Ping Jiang Jun Tao Shi Yue,,zh


In [20]:
df_removed.shape

(30061, 9)

Final Check for Null and Zero vales in all keys

In [23]:
import math
from qdrant_client import QdrantClient

client = QdrantClient(host="localhost", port=6333)
collection_name = "movie_collection_test_sample_1"

limit = 1000
offset = 0

invalid_points_count = 0

while True:
    points, next_offset = client.scroll(
        collection_name=collection_name,
        limit=limit,
        with_payload=True,
        with_vectors=False,
        offset=offset
    )

    if not points:
        break

    for point in points:
        payload = point.payload
        # Check each key-value in the payload for None, NaN, or 0
        # If you want to skip checking some fields, exclude them here
        for key, value in payload.items():
            # Check for None
            if value is None:
                print(f"Point {point.id}, key '{key}' is None.")
                invalid_points_count += 1
                break  # No need to check other keys once invalid found

            # Check for numeric 0 or NaN
            if isinstance(value, (int, float)):
                if value == 0:
                    print(f"Point {point.id}, key '{key}' is 0.")
                    invalid_points_count += 1
                    break
                if isinstance(value, float) and math.isnan(value):
                    print(f"Point {point.id}, key '{key}' is NaN.")
                    invalid_points_count += 1
                    break

            # If you also consider empty strings "" invalid, add:
            # if isinstance(value, str) and not value.strip():
            #     print(f"Point {point.id}, key '{key}' is empty string.")
            #     invalid_points_count += 1
            #     break

    if next_offset is None:
        break
    offset = next_offset

if invalid_points_count == 0:
    print("All points have valid, non-null, non-zero, non-NaN values for their fields.")
else:
    print(f"Found {invalid_points_count} points that have None, zero, or NaN values.")


Point dca74ac1-87f2-4e2b-83ea-d586c60dc1ea, key 'release_date' is None.
Point f3cd33af-d90e-46af-a481-563d6756141e, key 'release_date' is None.
Found 2 points that have None, zero, or NaN values.


In [24]:
from qdrant_client import QdrantClient
from qdrant_client.http import models

client = QdrantClient(host="localhost", port=6333)
collection_name = "movie_collection_test_sample_1"

# The IDs you want to remove
ids_to_remove = [
    "dca74ac1-87f2-4e2b-83ea-d586c60dc1ea",
    "f3cd33af-d90e-46af-a481-563d6756141e",
]

points_selector = models.PointIdsList(points=ids_to_remove)
client.delete(collection_name=collection_name, points_selector=points_selector)

print("Removed the two points with None release_date.")

Removed the two points with None release_date.
