In [2]:
import openai
import os
import re
import pandas as pd
from tqdm import tqdm
import ast
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import spacy


In [3]:
# Load English tokenizer, tagger, parser and NER
nlp = spacy.load("en_core_web_sm")

In [652]:
#Read in our genre list
genres = pd.read_csv('genres_taxonomy_quick.csv')  # Replace 'genres_taxonomy_quick.csv' with the actual file name
genres = genres['genre']

#Read in our movie data
movies = pd.read_csv("netflix_titles.csv")
movies = movies.sample(n=1000) #This takes a while to run so I didn't do it for the entire dataset at once

In [6]:
os.environ["OPENAI_API_KEY"] = "XXXXXXXXX"  # replace with yours

In [767]:
def predict_genres(movie_description):
    prompt = f"Predict the top three genres for a movie with the following description: {movie_description}"
    response = openai.completions.create(
      model="gpt-3.5-turbo-instruct",  # You can use the GPT-3 model for this task
      prompt=prompt,
      max_tokens=50,
      n=1,
      stop=None,
      temperature=0.2
    )
    predicted_genres = response.choices[0].text.strip()
    return predicted_genres

In [768]:
def filter_predicted_genres(predicted_genres, predefined_genres):
    # Use word embeddings to calculate semantic similarity between predicted and predefined genres
    predicted_genres_tokens = nlp(predicted_genres)
    predicted_genres_tokens = predicted_genres_tokens.text
    # Use regular expression to extract genres
    genres_with_numbers = re.findall(r'\d+\.\s*([^\n]+)', predicted_genres_tokens)
    # Remove leading/trailing whitespaces from each genre
    predicted_genres = [genre.strip().lower() for genre in genres_with_numbers]

    filtered_genres = []
    similarity_scores = []

    for predicted_genre in predicted_genres:
        max_similarity = 0
        best_match = None
        for predefined_genre in predefined_genres:
            similarity_score = nlp(predicted_genre).similarity(nlp(predefined_genre))
            if similarity_score > max_similarity:  # Adjust the threshold as needed
                max_similarity = similarity_score
                best_match = predefined_genre
        filtered_genres.append(best_match)
        similarity_scores.append(max_similarity)

    # Sort the filtered genres based on the similarity scores
    filtered_genres = [x for _, x in sorted(zip(similarity_scores, filtered_genres), reverse=True)]
    
    return filtered_genres

In [770]:
movies['predicted_genres'] = ""

In [771]:
def add_predicted_genres_to_df(df, predefined_genres):   
    # Iterate through the dataframe
    for index, row in tqdm(df.iterrows(), total=df.shape[0]):
        # Apply the predict_genres function to the movie description
        predicted_genres = predict_genres(row['description'])
        # Prioritize the predicted genres
        filtered_genres = filter_predicted_genres(predicted_genres, predefined_genres)
        # Add the prioritized genres to the dataframe
        df.at[index, 'predicted_genres'] = filtered_genres
    

add_predicted_genres_to_df(movies, genres)


  similarity_score = nlp(predicted_genre).similarity(nlp(predefined_genre))
100%|██████████| 4/4 [00:33<00:00,  8.44s/it]


In [772]:
# Split the lists into separate columns with specific names
movies[['genre1', 'genre2', 'genre3']] = movies['predicted_genres'].apply(lambda x: pd.Series((x + [None, None, None])[:3]))

In [45]:
#I have the tags saved so I don't have to run it again
movies = pd.read_csv("moviesWithTags.csv")

In [46]:
#movies.head()

In [47]:
#Keep only the columns we need for similarity
movies = movies[['title','genre1','genre2','genre3']]

#Drop duplicates
movies = movies.drop_duplicates()

#Set the 'title' column as our index
movies = movies.set_index('title')

In [49]:
# Combine genre columns into a single column
movies['all_genres'] = movies[['genre1', 'genre2', 'genre3']].astype(str).agg(','.join, axis=1)

# Split the genres and create dummy variables for each genre
genres = movies['all_genres'].str.get_dummies(sep=',')

# Concatenate the dummy variables with the original DataFrame
movies = pd.concat([movies, genres], axis=1)

# Drop unnecessary columns
movies.drop(['all_genres', 'genre1', 'genre2', 'genre3'], axis=1, inplace=True)


In [51]:
# If there are duplicate columns due to the one-hot encoding, you can sum them up
movie_genre_matrix = movies.groupby(level=0, axis=1).sum()

# Calculate cosine similarity 
similarity_matrix = cosine_similarity(movie_genre_matrix, movie_genre_matrix)

  movie_genre_matrix = movies.groupby(level=0, axis=1).sum()


In [52]:
def find_similar_movies(movie_name, movie_genre_matrix, num_similar_movies=3):
    # Calculate cosine similarity
    similarity_matrix = cosine_similarity(movie_genre_matrix, movie_genre_matrix)
    
    # Find the index of the given movie
    movie_index = movie_genre_matrix.index.get_loc(movie_name)
    
    # Sort and get indices of most similar movies (excluding the movie itself)
    most_similar_indices = np.argsort(similarity_matrix[movie_index])[:-num_similar_movies-1:-1]
    
    # Return the most similar movies
    return movie_genre_matrix.index[most_similar_indices].tolist()

In [53]:
movies = movies.reset_index()

In [54]:
# Example usage
similar_movies = find_similar_movies("Eat Pray Love", movie_genre_matrix, num_similar_movies=4)
print(similar_movies)

['Eat Pray Love', 'The Big Day', 'Love Dot Com: The Social Experiment', '50 First Dates']


In [55]:
movie_title = 'Eat Pray Love'
movies.loc[movies['title'] == movie_title].groupby('title').sum().T.sort_values(by=movie_title,ascending=False).head(3)



title,Eat Pray Love
romantic comedy,1
drama,1
travel/adventure,1


In [56]:
movie_title = '50 First Dates'
movies.loc[movies['title'] == movie_title].groupby('title').sum().T.sort_values(by=movie_title,ascending=False).head(3)



title,50 First Dates
romantic comedy,1
drama,1
action,0


In [67]:
# Example usage
similar_movies = find_similar_movies("Beavis and Butt-head Do America", movie_genre_matrix, num_similar_movies=4)
print(similar_movies)

['Beavis and Butt-head Do America', "Pee-wee's Big Holiday", 'A Shaun the Sheep Movie: Farmageddon', 'The Secret Life of Pets 2']


In [70]:
similar_movies = find_similar_movies("Army of the Dead", movie_genre_matrix, num_similar_movies=4)
print(similar_movies)

['Army of the Dead', 'Fallen', 'RESIDENT EVIL: Infinite Darkness', 'Shooter']


In [73]:
movie_title = "Pee-wee's Big Holiday"
movies.loc[movies['title'] == movie_title].groupby('title').sum().T.sort_values(by=movie_title,ascending=False).head(3)



title,Pee-wee's Big Holiday
comedy,1
adventure,1
fantasy,1


In [None]:
movies.to_csv("moviesWithTags.csv")

# Visualization code

In [None]:
import pandas as pd
from itertools import combinations


In [None]:
movies = pd.read_csv("moviesWithTags.csv")

In [None]:
# Melt the dataframe to unpivot genre columns
melted_df = pd.melt(movies, id_vars=['title'], value_vars=['genre1', 'genre2', 'genre3'], var_name='Genre', value_name='GenreValue')

genre_links = pd.crosstab(index=melted_df['title'], columns=melted_df['GenreValue'])

# Create combinations of genres for each title
combinations_list = []

for title, group in melted_df.groupby('title')['GenreValue']:
    genre_combinations = list(combinations(group, 2))
    combinations_list.extend([(title, combo[0], combo[1]) for combo in genre_combinations])

# Create a new dataframe from the combinations list
combinations_df = pd.DataFrame(combinations_list, columns=['title', 'Genre1', 'Genre2'])

combinations_df = combinations_df[['Genre1','Genre2']]

combinations_df = combinations_df.rename(columns={"Genre1": "source", "Genre2": "target"}, errors="raise")

combinations_df = combinations_df.set_index('source')

combinations_df.to_csv("genreCombos.csv")
combinations_df.to_pickle("genreCombos.pkl")  
