In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# ! kaggle datasets download tmdb/tmdb-movie-metadata
# ! unzip tmdb-movie-metadata.zip
# !pip install openai httpx==0.27.2 --force-reinstall --quiet
# !pip install clickhouse-connect

Collecting clickhouse-connect
  Downloading clickhouse_connect-0.8.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.1 kB)
Collecting zstandard (from clickhouse-connect)
  Downloading zstandard-0.23.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting lz4 (from clickhouse-connect)
  Downloading lz4-4.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.7 kB)
Downloading clickhouse_connect-0.8.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (978 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m978.5/978.5 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lz4-4.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m49.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading zstandard-0.23.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.4 MB)
[2K   [90m━━━

In [None]:
# import os
# os.kill(os.getpid(), 9)

In [None]:
import pandas as pd

# Load the two TMDB 5000 CSV exports from the mounted Google Drive folder.
movies = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/Colab Notebooks/movie/tmdb_5000_movies.csv'
)
credits = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/Colab Notebooks/movie/tmdb_5000_credits.csv'
)

In [None]:
# Give both frames the same key name: 'movie_id' in credits becomes 'id'.
# Reassigning instead of using inplace=True keeps this cell safe to re-run.
credits = credits.rename(columns={'movie_id': 'id'})

# Build the working frame in one chain so `df` is always derived from the raw
# inputs rather than mutated step by step.
df = (
    credits.merge(movies, on='id')
    # Rows without an overview cannot contribute text to the corpus.
    .dropna(subset=['overview'])
    # Keep only the columns used downstream ('title_x' is the left-hand,
    # i.e. credits-side, suffixed column produced by the merge).
    .loc[:, ['id', 'title_x', 'genres', 'overview', 'cast', 'crew']]
    # dropna leaves gaps in the index; a fresh 0..n-1 index keeps label-based
    # and position-based operations (e.g. assigning a new Series as a column)
    # in agreement, and returns an independent frame instead of a slice.
    .reset_index(drop=True)
)

In [None]:
import pandas as pd
import ast
import json


def _parse_records(raw):
    """Decode one serialized list-of-dicts cell without executing it."""
    try:
        return json.loads(raw)
    except (TypeError, ValueError):
        # Fallback for Python-literal text (e.g. single-quoted keys).
        # literal_eval only accepts literals, so unlike eval() it cannot run
        # code embedded in the data file.
        return ast.literal_eval(raw)


def generate_corpus(row):
    """Build the text to embed for one movie, plus its flattened name fields.

    Parameters
    ----------
    row : mapping
        Must provide 'overview' (str) and 'genres', 'cast', 'crew', each a
        serialized list of dicts. Every dict needs a 'name' key; crew dicts
        also need a 'job' key.

    Returns
    -------
    pandas.Series
        Indexed by ['corpus', 'crew', 'cast', 'genres']:
        - 'genres': all genre names, comma-joined
        - 'cast':   names of the first three cast entries, comma-joined
        - 'crew':   unique names of entries whose job is 'Director' or
                    'Producer', comma-joined in first-appearance order
        - 'corpus': overview, genres, cast and crew joined by single spaces

    Raises
    ------
    ValueError or SyntaxError
        If a serialized field is neither valid JSON nor a Python literal.
    """
    genre = ','.join(item['name'] for item in _parse_records(row['genres']))
    cast = ','.join(item['name'] for item in _parse_records(row['cast'])[:3])

    crew_names = (
        item['name']
        for item in _parse_records(row['crew'])
        if item['job'] in ('Director', 'Producer')
    )
    # dict.fromkeys de-duplicates while keeping first-appearance order; the
    # previous set() made the order (and therefore the corpus text and its
    # embedding) vary from one interpreter session to the next.
    crew = ','.join(dict.fromkeys(crew_names))

    corpus = row['overview'] + " " + genre + " " + cast + " " + crew
    return pd.Series([corpus, crew, cast, genre], index=['corpus', 'crew', 'cast', 'genres'])

# Run generate_corpus on every row; the four returned fields add 'corpus' and
# overwrite 'crew', 'cast' and 'genres' with their flattened string forms.
df[['corpus', 'crew', 'cast', 'genres']] = df.apply(func=generate_corpus, axis=1)

In [None]:
import os
import numpy as np
import openai
from google.colab import userdata

# Copy the Colab secret named 'openai-key' into the OPENAI_API_KEY environment
# variable so the key never appears in the notebook itself.
# NOTE(review): presumably the openai module reads this variable when
# get_embeddings() makes its first request — confirm.
os.environ["OPENAI_API_KEY"] = userdata.get('openai-key')

def get_embeddings(text):
    """Request embeddings for `text` from the "text-embedding-3-small" model.

    `text` is forwarded unchanged as the `input` argument of
    `openai.embeddings.create`; this notebook calls it with both a single
    string and a list of strings.

    Returns the `data` attribute of the API response. Callers in this notebook
    read `.embedding` from each of its items.
    """
    request = {"model": "text-embedding-3-small", "input": text}
    return openai.embeddings.create(**request).data

# Number of corpus strings sent to the embeddings endpoint per request.
EMBEDDING_CHUNK_SIZE = 1000

all_embeddings = []

# Derive the chunk boundaries from len(df) instead of a hard-coded 5 x 1000,
# so every row is covered regardless of how many movies survive cleaning.
for start_idx in range(0, len(df), EMBEDDING_CHUNK_SIZE):
    end_idx = start_idx + EMBEDDING_CHUNK_SIZE
    print(f"Processing entries from {start_idx} to {end_idx}")

    # Build the request payload positionally and without writing into a slice
    # of df (the old `chunk["corpus"] = ...` triggered SettingWithCopyWarning).
    texts = df["corpus"].iloc[start_idx:end_idx].astype(str).tolist()

    try:
        embeddings = get_embeddings(texts)
        vectors = [embedding.embedding for embedding in embeddings]
        all_embeddings.extend(vectors)
    except Exception as e:
        # Best effort: stop at the first failing chunk and keep what we have.
        # Rows that never received an embedding are dropped by the filter below.
        print(f"Error processing entries {start_idx}-{end_idx}: {e}")
        break

# Add embeddings back to the DataFrame.
# Vectors were produced in row order, so attach them by POSITION: label the
# Series with the index of the first len(all_embeddings) rows. A bare
# pd.Series(...) carries a 0..n-1 index and is aligned by label on assignment,
# which pairs vectors with the wrong movies whenever df's index has gaps.
embeddings_array = np.array(all_embeddings)
embedded_rows = df.index[:len(embeddings_array)]
df['embeddings'] = pd.Series(list(embeddings_array), index=embedded_rows)
print(df["embeddings"].head())
print(df["embeddings"].apply(type).value_counts())
# Keep only rows holding a real vector of floats; rows left without an
# embedding contain NaN and are removed here.
df = df[df["embeddings"].apply(lambda x: isinstance(x, (list, np.ndarray)) and all(isinstance(i, (float, np.float32)) for i in x))]


In [None]:
import clickhouse_connect

# Connection settings are read from Colab secrets so no credential is stored in
# the notebook.
# NOTE(review): the secret name 'usename' looks like a typo for 'username'; it
# is kept as-is because it must match the name under which the secret is stored.
connection_settings = {
    'host': userdata.get('hostname'),
    'port': 443,
    'username': userdata.get('usename'),
    'password': userdata.get('scalepass'),
}
client = clickhouse_connect.get_client(**connection_settings)

In [None]:
# Create the target table. IF NOT EXISTS makes this cell safe to re-run: an
# already-existing table is no longer reported as an error, so anything printed
# by the except branch is a genuine failure.
# The CHECK constraint rejects rows whose vector does not have 1536 elements.
try:
  client.command("""
    CREATE TABLE IF NOT EXISTS default.movies (
        id Int64,
        title_x String,
        genres String,
        overview String,
        cast String,
        crew String,
        corpus String,
        embeddings Array(Float32),
        CONSTRAINT check_data_length CHECK length(embeddings) = 1536
    ) ENGINE = MergeTree()
    ORDER BY id
    """)
except Exception as e:
  # Best effort, as before: report the problem and let the notebook continue.
  print(e)

In [None]:
batch_size = 100
# Ceiling division: count the final partial batch too. Floor division silently
# skipped the last len(df) % batch_size rows, so they never reached the table.
num_batches = (len(df) + batch_size - 1) // batch_size


for i in range(num_batches):
    start_idx = i * batch_size
    end_idx = start_idx + batch_size
    # Positional slice; on the last batch end_idx may exceed len(df), which
    # iloc clips to the end of the frame.
    batch_data = df.iloc[start_idx:end_idx]

    # Rows are sent as plain tuples, with column names taken from the frame so
    # they line up with the table definition.
    client.insert("default.movies", batch_data.to_records(index=False).tolist(), column_names=batch_data.columns.tolist())
    print(f"Batch {i+1}/{num_batches} inserted.")

# Build the vector index once all rows are loaded.
client.command("""
ALTER TABLE default.movies
    ADD VECTOR INDEX vector_index embeddings
    TYPE MSTG
""")

Batch 1/47 inserted.
Batch 2/47 inserted.
Batch 3/47 inserted.
Batch 4/47 inserted.
Batch 5/47 inserted.
Batch 6/47 inserted.
Batch 7/47 inserted.
Batch 8/47 inserted.
Batch 9/47 inserted.
Batch 10/47 inserted.
Batch 11/47 inserted.
Batch 12/47 inserted.
Batch 13/47 inserted.
Batch 14/47 inserted.
Batch 15/47 inserted.
Batch 16/47 inserted.
Batch 17/47 inserted.
Batch 18/47 inserted.
Batch 19/47 inserted.
Batch 20/47 inserted.
Batch 21/47 inserted.
Batch 22/47 inserted.
Batch 23/47 inserted.
Batch 24/47 inserted.
Batch 25/47 inserted.
Batch 26/47 inserted.
Batch 27/47 inserted.
Batch 28/47 inserted.
Batch 29/47 inserted.
Batch 30/47 inserted.
Batch 31/47 inserted.
Batch 32/47 inserted.
Batch 33/47 inserted.
Batch 34/47 inserted.
Batch 35/47 inserted.
Batch 36/47 inserted.
Batch 37/47 inserted.
Batch 38/47 inserted.
Batch 39/47 inserted.
Batch 40/47 inserted.
Batch 41/47 inserted.
Batch 42/47 inserted.
Batch 43/47 inserted.
Batch 44/47 inserted.
Batch 45/47 inserted.
Batch 46/47 inserte

['0', 'chi-msc-4aa38d18-msc-4aa38d18-0-0', 'OK', '0', '0']

In [None]:
import numpy as np
from IPython.display import clear_output
# Seed the search vector from three user-supplied genres.
genres = []

for _ in range(3):
    genres.append(input("Enter a genre: "))

genre_string = ', '.join(genres)
genre_embeddings = get_embeddings(genre_string)
embeddings = np.array(genre_embeddings[0].embedding)  # numpy array so it can be blended below

decay_factor = 0.9  # Weight kept by the current vector at each update; adjust as needed

while True:
    clear_output(wait=True)
    # Ten nearest movies to the current search vector.
    results = client.query(f"""
        SELECT title_x, genres,
        distance(embeddings, {embeddings.tolist()}) as dist FROM default.movies ORDER BY dist LIMIT 10
    """)

    # Display the results, numbered so the prompt below can refer to them.
    # (Stored under a name of its own: the old `movies = []` overwrote the
    # `movies` DataFrame loaded at the top of the notebook.)
    print("Recommended Movies:")
    recommended_titles = [row["title_x"] for row in results.named_results()]
    for position, title in enumerate(recommended_titles, start=1):
        print(f"{position}. {title}")

    # Ask the user to select a movie; re-prompt until the answer is 0 or a
    # listed position, instead of crashing on text or out-of-range numbers
    # (negative numbers used to pick silently from the end of the list).
    selection = None
    while selection is None:
        answer = input("Select a movie (or enter 0 to exit): ")
        try:
            candidate = int(answer)
        except ValueError:
            print(f"Please enter a whole number between 0 and {len(recommended_titles)}.")
            continue
        if 0 <= candidate <= len(recommended_titles):
            selection = candidate
        else:
            print(f"Please enter a whole number between 0 and {len(recommended_titles)}.")

    if selection == 0:
        break
    selected_movie = recommended_titles[selection - 1]

    # Get the embeddings of the selected movie title
    selected_movie_embeddings = get_embeddings(selected_movie)[0].embedding
    selected_movie_embeddings_array = np.array(selected_movie_embeddings)

    # Exponential decay: keep decay_factor of the current vector and blend in
    # (1 - decay_factor) of the newly selected movie's vector.
    embeddings = decay_factor * embeddings + (1 - decay_factor) * selected_movie_embeddings_array

    # Normalize the combined embeddings back to unit length
    embeddings = embeddings / np.linalg.norm(embeddings)

Recommended Movies:
Special
Krampus
Creepshow
On the Waterfront
Deadline - U.S.A.
Top Hat
The Crazies
Warm Bodies
Margaret
Running Forever
Select a movie (or enter 0 to exit): 0
