# Explore here

In [1]:
import pandas as pd
from dotenv import load_dotenv
import json
import sqlite3
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from openai import OpenAI
import os
from transformers import AutoTokenizer, AutoModel
import torch

load_dotenv()
openai_api_key = os.environ.get('OPENAI_API_KEY')
client = OpenAI()


In [2]:
movies_df = pd.read_csv("https://raw.githubusercontent.com/4GeeksAcademy/k-nearest-neighbors-project-tutorial/main/tmdb_5000_movies.csv")
credits_df = pd.read_csv("https://raw.githubusercontent.com/4GeeksAcademy/k-nearest-neighbors-project-tutorial/main/tmdb_5000_credits.csv")

In [11]:
movies_df.head(5)

(4803, 20)

In [4]:
# Create a connection to a new SQLite database
conn = sqlite3.connect('movies.db')

# Store the DataFrames in SQL tables
movies_df.to_sql('movies', conn, if_exists='replace', index=False)
credits_df.to_sql('credits', conn, if_exists='replace', index=False)

# Join the tables using SQL
query = '''
SELECT 
    movies.id,
    movies.title,
    movies.overview,
    movies.genres,
    movies.keywords,
    credits.cast,
    credits.crew
FROM movies
JOIN credits
ON movies.title = credits.title
'''

# Create a unified DataFrame
unified_df = pd.read_sql(query, conn)

# Display the first few rows to ensure the join is correct
print(unified_df.head())


       id                                     title  \
0   19995                                    Avatar   
1     285  Pirates of the Caribbean: At World's End   
2  206647                                   Spectre   
3   49026                     The Dark Knight Rises   
4   49529                               John Carter   

                                            overview  \
0  In the 22nd century, a paraplegic Marine is di...   
1  Captain Barbossa, long believed to be dead, ha...   
2  A cryptic message from Bond’s past sends him o...   
3  Following the death of District Attorney Harve...   
4  John Carter is a war-weary, former military ca...   

                                              genres  \
0  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
1  [{"id": 12, "name": "Adventure"}, {"id": 14, "...   
2  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
3  [{"id": 28, "name": "Action"}, {"id": 80, "nam...   
4  [{"id": 28, "name": "Action"}, {"id": 12, "nam...

In [5]:
def parse_json(data, key):
    """ Helper function to parse JSON and extract a specific key """
    try:
        return ' '.join([item[key] for item in json.loads(data)])
    except:
        return ''

def parse_cast(data):
    """ Extract the first three cast members """
    try:
        cast_list = json.loads(data)
        return ' '.join([member['name'] for member in cast_list[:3]])
    except:
        return ''

def parse_crew(data):
    """ Extract the director's name """
    try:
        crew_list = json.loads(data)
        for member in crew_list:
            if member['job'] == 'Director':
                return member['name']
        return ''
    except:
        return ''

# Parse JSON columns
unified_df['genres'] = unified_df['genres'].apply(parse_json, key='name')
unified_df['keywords'] = unified_df['keywords'].apply(parse_json, key='name')
unified_df['cast'] = unified_df['cast'].apply(parse_cast)
unified_df['crew'] = unified_df['crew'].apply(parse_crew)

# Fill None values in overview with empty string
unified_df['overview'] = unified_df['overview'].fillna('')

# Convert overview to a list of words, if it's not already a list
def convert_overview(x):
    if isinstance(x, list):
        return x
    return x.split()

unified_df['overview'] = unified_df['overview'].apply(convert_overview)

# Remove spaces in specified columns
for column in ['genres', 'keywords', 'cast', 'crew']:
    unified_df[column] = unified_df[column].str.replace(' ', '')

# Combine all columns into a single 'tags' column
unified_df['tags'] = unified_df.apply(lambda x: ' '.join(x['overview']) + ' ' + x['genres'] + ' ' + x['keywords'] + ' ' + x['cast'] + ' ' + x['crew'], axis=1)

# Display the first few rows of the transformed DataFrame
print(unified_df[['id', 'title', 'tags']].head())


       id                                     title  \
0   19995                                    Avatar   
1     285  Pirates of the Caribbean: At World's End   
2  206647                                   Spectre   
3   49026                     The Dark Knight Rises   
4   49529                               John Carter   

                                                tags  
0  In the 22nd century, a paraplegic Marine is di...  
1  Captain Barbossa, long believed to be dead, ha...  
2  A cryptic message from Bond’s past sends him o...  
3  Following the death of District Attorney Harve...  
4  John Carter is a war-weary, former military ca...  


In [6]:
# Vectorize the 'tags' column
cv = CountVectorizer(max_features=5000, stop_words='english')
vectors = cv.fit_transform(unified_df['tags']).toarray()

# Compute cosine similarity
similarity = cosine_similarity(vectors)

# Recommendation function
def recommend_cv(movie):
    movie_index = unified_df[unified_df['title'] == movie].index[0]
    distances = similarity[movie_index]
    movie_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:6]
    
    for i in movie_list:
        print(unified_df.iloc[i[0]].title)

# Example usage
recommend_cv("Avatar")


Apollo 18
Beowulf
Tears of the Sun
The American
The Book of Life


In [19]:
# Create labels for KNN
labels = np.arange(vectors.shape[0])

# Initialize the KNN classifier
knn = KNeighborsClassifier(n_neighbors=5, metric='cosine')
knn.fit(vectors, labels)


def recommend_knn(movie):
    movie_index = unified_df[unified_df['title'] == movie].index[0]
    movie_vector = vectors[movie_index].reshape(1, -1)
    distances, indices = knn.kneighbors(movie_vector, n_neighbors=6)
    
    recommended_indices = indices.flatten()[1:]  # Skip the first one since it's the same movie
    
    for i in recommended_indices:
        print(unified_df.iloc[i].title)


recommend_knn("Mission: Impossible")

Mission: Impossible II
Mission: Impossible III
Zero Dark Thirty
Compadres
Bad Company


In [8]:
unified_df["combined"] = (
    "title: " + unified_df.title.str.strip() + "; tags: " + unified_df.tags.str.strip()
)

In [9]:
def get_embedding(text, model="text-embedding-3-small"):
   text = text.replace("\n", " ")
   return client.embeddings.create(input = [text], model=model).data[0].embedding

In [10]:
'''Can't use OPENAI: RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, 
please check your plan and billing details. For more information on this error, 
read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
'''
# unified_df['ada_embedding'] = unified_df.combined.apply(lambda x: get_embedding(x, model='text-embedding-3-small'))
# unified_df.to_csv('../data/processed/embedded_movies.csv', index=False)

RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

In [13]:
# Select only the 'tags' column
tags_df = unified_df[['id', 'title', 'tags']]

# Export to JSON
tags_df.to_json('tags.json', orient='records', lines=True)

print("Tags column exported to tags.json")


Tags column exported to tags.json


In [15]:
# Load JSON data
input_file = '../data/interim/tags.json'
output_file = '../data/processed/embeddings.json'

with open(input_file, 'r') as f:
    data = [json.loads(line) for line in f]

# Initialize the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

# Function to generate embeddings
def generate_embedding(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

# Generate embeddings for each entry
embeddings_data = []
for entry in data:
    text = entry['title'] + " " + entry['tags']
    embedding = generate_embedding(text, tokenizer, model)
    entry['embedding'] = embedding.tolist()  # Convert numpy array to list for JSON serialization
    embeddings_data.append(entry)

# Save embeddings to new file
with open(output_file, 'w') as f:
    for entry in embeddings_data:
        json.dump(entry, f)
        f.write('\n')

print(f"Embeddings of size 1536 have been generated and saved to {output_file}.")

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Embeddings of size 1536 have been generated and saved to ../data/processed/embeddings.json.


In [16]:
# Load the embeddings from the JSON file
embeddings_file = '../data/processed/embeddings.json'

with open(embeddings_file, 'r') as f:
    data = [json.loads(line) for line in f]

# Create a dictionary to map movie titles to embeddings
embeddings_dict = {entry['title']: np.array(entry['embedding']) for entry in data}

# Function to find similar movies based on cosine similarity
def find_similar_movies(movie_title, embeddings_dict, top_n=5):
    if movie_title not in embeddings_dict:
        return f"Movie title '{movie_title}' not found in the dataset."

    movie_embedding = embeddings_dict[movie_title].reshape(1, -1)
    similarities = []

    for title, embedding in embeddings_dict.items():
        if title != movie_title:
            sim = cosine_similarity(movie_embedding, embedding.reshape(1, -1)).flatten()[0]
            similarities.append((title, sim))

    # Sort by similarity score
    similarities = sorted(similarities, key=lambda x: x[1], reverse=True)

    return similarities[:top_n]

In [18]:
# Example usage
movie_title = "Mission: Impossible"
similar_movies = find_similar_movies(movie_title, embeddings_dict)

print(f"Top 5 movies similar to '{movie_title}':")
for title, score in similar_movies:
    print(f"{title} (Similarity: {score:.4f})")

Top 5 movies similar to 'Mission: Impossible':
Mission: Impossible II (Similarity: 0.8074)
Mission: Impossible III (Similarity: 0.8051)
Mission: Impossible - Rogue Nation (Similarity: 0.7233)
Mission: Impossible - Ghost Protocol (Similarity: 0.7065)
The Sentinel (Similarity: 0.5434)
