In [21]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Build a movie-to-movie cosine-similarity matrix from four numeric features:
# IMDb rating, runtime, an ordinal encoding of the rating label, and a genre code.
df = pd.read_csv('christmas_movies.csv')

# Map rating labels to an ordinal scale. Labels not listed here map to NaN and
# are removed by the dropna below.
df['rating_numeric'] = df['rating'].map({'G': 1, 'PG': 2, 'PG-13': 3, 'R': 4})

# Keep only the first listed genre for simplicity, then encode it as an integer.
df['primary_genre'] = df['genre'].str.split(',').str[0]
# NOTE(review): category codes are arbitrary integers (alphabetical order of the
# genre labels), yet they are used below as if they were a numeric scale;
# one-hot encoding may be a better fit -- confirm before relying on the scores.
df['genre_code'] = df['primary_genre'].astype('category').cat.codes

# Remove rows with missing values in the feature columns.
# `.cat.codes` encodes a missing genre as -1 rather than NaN, so dropping on
# 'genre_code' can never remove those rows; drop on 'primary_genre' instead so
# movies without a genre are not silently kept with a fake genre value.
# 'director' is not a feature but was part of the original filter and is kept.
df = df.dropna(subset=['primary_genre', 'imdb_rating', 'rating_numeric', 'runtime', 'director'])

# Prepare the feature matrix; standardise so that no column dominates the
# similarity purely because of its units (e.g. runtime vs. rating).
features = df[['imdb_rating', 'runtime', 'rating_numeric', 'genre_code']]
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Calculate pairwise cosine similarity and label both axes with movie titles
# so that a column can be looked up by title.
cosine_sim = cosine_similarity(features_scaled)
similarity_df = pd.DataFrame(cosine_sim, index=df['title'], columns=df['title'])

In [22]:
# Get top 10 similar movies for "Home Alone"
query_movie = 'Home Alone'
# Exclude the query movie by label rather than by slicing off the first row:
# nlargest breaks ties by order of appearance, so another movie with a
# similarity of 1.0 could be ranked first and the query itself would then
# remain in the list.
top_10_similar = similarity_df[query_movie].drop(labels=query_movie).nlargest(10)
print("Top 10 similar movies to", query_movie)
print(top_10_similar)

Top 10 similar movies to Home Alone
title
A Christmas Story Christmas       0.970176
8-Bit Christmas                   0.955266
The Santa Clause                  0.941442
Klaus                             0.879460
The Man Who Invented Christmas    0.879028
Christmas in the Clouds           0.827919
The Bishop's Wife                 0.812779
It's a Wonderful Life             0.808788
A Christmas Carol                 0.747994
Rise of the Guardians             0.747567
Name: Home Alone, dtype: float64


In [23]:
# Get top 10 similar movies for "Elf"
query_movie = 'Elf'
# Exclude the query movie by label rather than by slicing off the first row:
# nlargest breaks ties by order of appearance, so another movie with a
# similarity of 1.0 could be ranked first and the query itself would then
# remain in the list.
top_10_similar = similarity_df[query_movie].drop(labels=query_movie).nlargest(10)
print("Top 10 similar movies to", query_movie)
print(top_10_similar)

Top 10 similar movies to Elf
title
A Christmas Carol                     0.997666
Arthur Christmas                      0.983405
The Nativity Story                    0.976120
Rise of the Guardians                 0.974280
The Christmas Chronicles              0.964409
Thomas Kinkade's Christmas Cottage    0.948533
Klaus                                 0.866735
The Man Who Invented Christmas        0.827856
Santa Claus                           0.802840
The Grinch                            0.767461
Name: Elf, dtype: float64


In [24]:
# Get top 10 similar movies for "Love Actually"
query_movie = 'Love Actually'
# Exclude the query movie by label rather than by slicing off the first row:
# nlargest breaks ties by order of appearance, so another movie with a
# similarity of 1.0 could be ranked first and the query itself would then
# remain in the list.
top_10_similar = similarity_df[query_movie].drop(labels=query_movie).nlargest(10)
print("Top 10 similar movies to", query_movie)
print(top_10_similar)

Top 10 similar movies to Love Actually
title
Instant Family             0.979487
Silver Linings Playbook    0.978802
The Best Man Holiday       0.959281
This Christmas             0.958623
Spirited                   0.953114
Almost Christmas           0.924353
Last Christmas             0.918615
Happiest Season            0.901877
Die Hard                   0.894952
Scrooged                   0.865927
Name: Love Actually, dtype: float64
