In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.linear_model import Ridge
from ast import literal_eval

from surprise import Reader, Dataset, SVD, accuracy
from surprise.model_selection import cross_validate, KFold
from surprise.dataset import DatasetAutoFolds

import json
import requests

# **Data Load**

In [2]:
# Step 1: Load CSV with initial dtype as string to clean and remove unnecessary columns
df_movies = pd.read_csv("../data/movies_metadata.csv", dtype=str, low_memory=False)

# Remove columns that are not necessary and might contain non-numeric data
# columns_to_remove = ['poster_path', 'belongs_to_collection', 'homepage', 'tagline', 'video']
# df_movies.drop(columns=columns_to_remove, inplace=True, errors='ignore')

# Step 2: Convert the relevant columns to numeric values (with error handling)
df_movies['budget'] = pd.to_numeric(df_movies['budget'], errors='coerce')
df_movies['popularity'] = pd.to_numeric(df_movies['popularity'], errors='coerce')
df_movies['revenue'] = pd.to_numeric(df_movies['revenue'], errors='coerce')
df_movies['runtime'] = pd.to_numeric(df_movies['runtime'], errors='coerce')
df_movies['vote_average'] = pd.to_numeric(df_movies['vote_average'], errors='coerce')
df_movies['vote_count'] = pd.to_numeric(df_movies['vote_count'], errors='coerce')

# Step 3: Filter out rows where 'id' is not numeric
df_movies = df_movies[pd.to_numeric(df_movies['id'], errors='coerce').notna()]

# Convert 'id' to integer
df_movies['id'] = df_movies['id'].astype(int)
df_movies = df_movies.drop_duplicates(subset=['id'])

# Step 4: Apply the correct dtype specification using astype
df_movies = df_movies.astype({
    'adult': 'str',
    'budget': 'float',
    'genres': 'str',
    'imdb_id': 'str',
    'original_language': 'str',
    'original_title': 'str',
    'overview': 'str',
    'popularity': 'float',  # This should be numeric now
    'production_companies': 'str',
    'production_countries': 'str',
    'release_date': 'str',  # You can convert to datetime later if needed
    'revenue': 'float',
    'runtime': 'float',
    'spoken_languages': 'str',
    'status': 'str',
    'title': 'str',
    'vote_average': 'float',
    'vote_count': 'float'
})

# Step 5: convert imdb_id into integer
df_movies['imdb_id'] = df_movies['imdb_id'].apply(
    lambda x: int(x.replace('tt', '').lstrip('0')) if pd.notna(x) and x.startswith('tt') else None
)
df_movies['imdb_id'] = df_movies['imdb_id'].astype('Int64')

# Step 6: Check the shape and the head of the DataFrame
print(df_movies.shape)
df_movies.head()

(45433, 24)


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000.0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0.0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0.0,"[{'id': 35, 'name': 'Comedy'}]",,11862,113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [3]:
df_movies.dtypes

adult                     object
belongs_to_collection     object
budget                   float64
genres                    object
homepage                  object
id                         int64
imdb_id                    Int64
original_language         object
original_title            object
overview                  object
popularity               float64
poster_path               object
production_companies      object
production_countries      object
release_date              object
revenue                  float64
runtime                  float64
spoken_languages          object
status                    object
tagline                   object
title                     object
video                     object
vote_average             float64
vote_count               float64
dtype: object

In [4]:
df_movies['imdb_id']

0         114709
1         113497
2         113228
3         114885
4         113041
          ...   
45461    6209470
45462    2028550
45463     303758
45464       8536
45465    6980792
Name: imdb_id, Length: 45433, dtype: Int64

In [5]:
df_credits = pd.read_csv("../data/credits.csv")
df_credits = df_credits.drop_duplicates(subset=['id'])
print(df_credits.shape)
print(df_credits.dtypes)
df_credits.head()

(45432, 3)
cast    object
crew    object
id       int64
dtype: object


Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [6]:
df_keywords = pd.read_csv("../data/keywords.csv")
df_keyword = df_keywords.drop_duplicates(subset=['id'])
print(df_keywords.shape)
print(df_keywords.dtypes)
df_keywords.head()

(46419, 2)
id           int64
keywords    object
dtype: object


Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [7]:
df_links = pd.read_csv("../data/links.csv")
df_links = df_links[['imdbId', 'tmdbId']]
df_links = df_links.rename(columns={'imdbId': 'imdb_id', 'tmdbId': 'tmdb_id'})
df_links['tmdb_id'] = df_links['tmdb_id'].astype('Int64')
print(df_links.shape)
print(df_links.dtypes)
df_links.head()

(45843, 2)
imdb_id    int64
tmdb_id    Int64
dtype: object


Unnamed: 0,imdb_id,tmdb_id
0,114709,862
1,113497,8844
2,113228,15602
3,114885,31357
4,113041,11862


In [8]:
df_ratings = pd.read_csv("../data/ratings.csv")
print(df_ratings.shape)
print(df_ratings.dtypes)
df_ratings.head()

(26024289, 4)
userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object


Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556


# **데이터 전처리**

In [9]:
# df_movies, df_credits, df_keywords 병합
df_merged = pd.merge(df_movies, df_credits, on=['id'], how='inner')
df_merged = pd.merge(df_merged, df_keywords, on=['id'], how='inner')
df_merged = pd.merge(df_merged, df_links, on=['imdb_id'], how='inner')
df_merged.shape

(46338, 28)

In [10]:
df_merged.dtypes

adult                     object
belongs_to_collection     object
budget                   float64
genres                    object
homepage                  object
id                         int64
imdb_id                    Int64
original_language         object
original_title            object
overview                  object
popularity               float64
poster_path               object
production_companies      object
production_countries      object
release_date              object
revenue                  float64
runtime                  float64
spoken_languages          object
status                    object
tagline                   object
title                     object
video                     object
vote_average             float64
vote_count               float64
cast                      object
crew                      object
keywords                  object
tmdb_id                    Int64
dtype: object

In [11]:
df_merged.loc[0, 'cast']

"[{'cast_id': 14, 'character': 'Woody (voice)', 'credit_id': '52fe4284c3a36847f8024f95', 'gender': 2, 'id': 31, 'name': 'Tom Hanks', 'order': 0, 'profile_path': '/pQFoyx7rp09CJTAb932F2g8Nlho.jpg'}, {'cast_id': 15, 'character': 'Buzz Lightyear (voice)', 'credit_id': '52fe4284c3a36847f8024f99', 'gender': 2, 'id': 12898, 'name': 'Tim Allen', 'order': 1, 'profile_path': '/uX2xVf6pMmPepxnvFWyBtjexzgY.jpg'}, {'cast_id': 16, 'character': 'Mr. Potato Head (voice)', 'credit_id': '52fe4284c3a36847f8024f9d', 'gender': 2, 'id': 7167, 'name': 'Don Rickles', 'order': 2, 'profile_path': '/h5BcaDMPRVLHLDzbQavec4xfSdt.jpg'}, {'cast_id': 17, 'character': 'Slinky Dog (voice)', 'credit_id': '52fe4284c3a36847f8024fa1', 'gender': 2, 'id': 12899, 'name': 'Jim Varney', 'order': 3, 'profile_path': '/eIo2jVVXYgjDtaHoF19Ll9vtW7h.jpg'}, {'cast_id': 18, 'character': 'Rex (voice)', 'credit_id': '52fe4284c3a36847f8024fa5', 'gender': 2, 'id': 12900, 'name': 'Wallace Shawn', 'order': 4, 'profile_path': '/oGE6JqPP2xH4t

# **EDA**

In [12]:
df_movies.loc[0, :]

adult                                                                False
belongs_to_collection    {'id': 10194, 'name': 'Toy Story Collection', ...
budget                                                          30000000.0
genres                   [{'id': 16, 'name': 'Animation'}, {'id': 35, '...
homepage                              http://toystory.disney.com/toy-story
id                                                                     862
imdb_id                                                             114709
original_language                                                       en
original_title                                                   Toy Story
overview                 Led by Woody, Andy's toys live happily in his ...
popularity                                                       21.946943
poster_path                               /rhIRbceoE9lR4veEXuwCC2wARtG.jpg
production_companies        [{'name': 'Pixar Animation Studios', 'id': 3}]
production_countries     

In [13]:
print(df_keywords.columns)
df_keywords.loc[0, :]['keywords']

Index(['id', 'keywords'], dtype='object')


"[{'id': 931, 'name': 'jealousy'}, {'id': 4290, 'name': 'toy'}, {'id': 5202, 'name': 'boy'}, {'id': 6054, 'name': 'friendship'}, {'id': 9713, 'name': 'friends'}, {'id': 9823, 'name': 'rivalry'}, {'id': 165503, 'name': 'boy next door'}, {'id': 170722, 'name': 'new toy'}, {'id': 187065, 'name': 'toy comes to life'}]"

In [14]:
print(df_credits.columns)
df_credits[df_credits["id"] == "862"]
# df_credits.loc[0, :]['cast']

Index(['cast', 'crew', 'id'], dtype='object')


Unnamed: 0,cast,crew,id


In [15]:
# TMDB ID, IMDB ID 데이터를 불러와서 활용할 수 있다.

print(df_links.columns)
df_links.loc[0, :]

Index(['imdb_id', 'tmdb_id'], dtype='object')


imdb_id    114709
tmdb_id       862
Name: 0, dtype: Int64

In [16]:
print(df_ratings.columns)
df_ratings.loc[0, :]

Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')


userId       1.000000e+00
movieId      1.100000e+02
rating       1.000000e+00
timestamp    1.425942e+09
Name: 0, dtype: float64

In [17]:
user_ids = set(df_ratings['userId'].unique())
user_ids = {str(user_id) for user_id in user_ids}
print(f"total unique users with ratings: {len(user_ids)}")

movie_ids = set(df_ratings['movieId'].unique())
movie_ids = {str(movie_id) for movie_id in movie_ids}
print(f"total unique movies with ratings: {len(movie_ids)}")

total unique users with ratings: 270896
total unique movies with ratings: 45115


In [18]:
print(f"total movies: {len(df_movies)}")
print(f"total movies with ratings: {len(df_movies[df_movies['id'].isin(movie_ids)])}")

total movies: 45433
total movies with ratings: 0


In [19]:
df_movies[df_movies['id'].isin(movie_ids)]

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count


# **추천 시스템 구현**

## **1) Demographic Filtering**

In [20]:
C = df_merged['vote_average'].mean()
C

5.613657357448094

In [21]:
m = df_merged['vote_count'].quantile(0.9)
m

156.0

In [22]:
q_movies = df_merged.copy().loc[df_merged['vote_count'] >= m]
q_movies.shape

(4656, 28)

In [23]:
def weighted_rating(x, m=m, C=C):
    v = x['vote_count']
    R = x['vote_average']
    # Calculation based on the IMDB formula
    return (v/(v+m) * R) + (m/(m+v) * C)

In [24]:
q_movies['score'] = q_movies.apply(weighted_rating, axis=1)

In [25]:
# Sort movies based on score calculated above
q_movies = q_movies.sort_values('score', ascending=False)

# Print the top 15 movies
q_movies[['title', 'vote_count', 'vote_average', 'score']].head(10)

Unnamed: 0,title,vote_count,vote_average,score
314,The Shawshank Redemption,8358.0,8.5,8.447114
10294,Dilwale Dulhania Le Jayenge,661.0,9.1,8.434309
834,The Godfather,6024.0,8.5,8.427141
12466,The Dark Knight,12269.0,8.3,8.266272
2840,Fight Club,9678.0,8.3,8.257386
292,Pulp Fiction,8670.0,8.3,8.252519
522,Schindler's List,4436.0,8.3,8.208739
23621,Whiplash,4376.0,8.3,8.207531
5476,Spirited Away,3968.0,8.3,8.198383
2209,Life Is Beautiful,3643.0,8.3,8.18969


## **2) Content Based Filtering**

In [26]:
df_merged['overview'].head(5)

0    Led by Woody, Andy's toys live happily in his ...
1    When siblings Judy and Peter discover an encha...
2    A family wedding reignites the ancient feud be...
3    Cheated on, mistreated and stepped on, the wom...
4    Just when George Banks has recovered from his ...
Name: overview, dtype: object

In [27]:
tfidf = TfidfVectorizer(stop_words="english")

df_merged['overview'] = df_merged['overview'].fillna('')

tfidf_matrix = tfidf.fit_transform(df_merged['overview'])

tfidf_matrix.shape

(46338, 75764)

In [28]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

indices = pd.Series(df_merged.index, index=df_merged['title']).drop_duplicates()

In [29]:
def get_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    if title not in indices:
        raise ValueError(f"Title '{title}' not found in indices.")
    
    idx = indices[title]

    # Get the pairwise similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx].flatten()))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies (excluding the first which is itself)
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]
    
    # Ensure indices are valid and within bounds
    max_index = df_merged.shape[0] - 1
    movie_indices = [i for i in movie_indices if i <= max_index]

    # Return the top 10 most similar movies
    return df_merged['title'].iloc[movie_indices]

In [30]:
get_recommendations('The Dark Knight Rises')

12466                                      The Dark Knight
150                                         Batman Forever
1328                                        Batman Returns
15494                           Batman: Under the Red Hood
585                                                 Batman
21150    Batman Unmasked: The Psychology of the Dark Kn...
9218                    Batman Beyond: Return of the Joker
18009                                     Batman: Year One
19757              Batman: The Dark Knight Returns, Part 1
3091                          Batman: Mask of the Phantasm
Name: title, dtype: object

In [31]:
get_recommendations('The Avengers')

41282             The Work and the Glory
1100     Monty Python and the Holy Grail
34337                       Cheap Smokes
25158                Sir Arne's Treasure
12214                That Hamilton Woman
25067                 Doctor in Distress
Name: title, dtype: object

In [32]:
features = ['cast', 'crew', 'keywords', 'genres']

for feature in features:
    df_merged[feature] = df_merged[feature].apply(literal_eval)

In [33]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [34]:
# Returns the list top 3 elements or entire list; whichever is more.
def get_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        #Check if more than 3 elements exist. If yes, return only first three. If no, return entire list.
        if len(names) > 3:
            names = names[:3]
        return names

    #Return empty list in case of missing/malformed data
    return []

In [35]:
# Define new director, cast, genres and keywords features that are in a suitable form.
df_merged['director'] = df_merged['crew'].apply(get_director)

features = ['cast', 'keywords', 'genres']
for feature in features:
    df_merged[feature] = df_merged[feature].apply(get_list)

In [36]:
df_merged[['title', 'cast', 'director', 'keywords', 'genres']].head(3)

Unnamed: 0,title,cast,director,keywords,genres
0,Toy Story,"[Tom Hanks, Tim Allen, Don Rickles]",John Lasseter,"[jealousy, toy, boy]","[Animation, Comedy, Family]"
1,Jumanji,"[Robin Williams, Jonathan Hyde, Kirsten Dunst]",Joe Johnston,"[board game, disappearance, based on children'...","[Adventure, Fantasy, Family]"
2,Grumpier Old Men,"[Walter Matthau, Jack Lemmon, Ann-Margret]",Howard Deutch,"[fishing, best friend, duringcreditsstinger]","[Romance, Comedy]"


In [37]:
# Function to convert all strings to lower case and strip names of spaces
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [38]:
# Apply clean_data function to your features.
features = ['cast', 'keywords', 'director', 'genres']

for feature in features:
    df_merged[feature] = df_merged[feature].apply(clean_data)

In [39]:
def create_soup(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])

df_merged['soup'] = df_merged.apply(create_soup, axis=1)

In [40]:
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df_merged['soup'])

In [41]:
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [42]:
df_merged = df_merged.reset_index()
indices = pd.Series(df_merged.index, index=df_merged['title'])

In [43]:
get_recommendations('The Dark Knight Rises', cosine_sim2)

12466      The Dark Knight
10108        Batman Begins
9217                Shiner
9774       Amongst Friends
7688              Mitchell
516      Romeo Is Bleeding
11341         The Prestige
23842            Quicksand
24784             Deadfall
40778                 Sara
Name: title, dtype: object

In [44]:
get_recommendations('The Godfather', cosine_sim2)

1913            The Godfather: Part III
1178             The Godfather: Part II
15421                   The Rain People
18721                         Last Exit
34209                              Rege
35520            Manuscripts Don't Burn
35521            Manuscripts Don't Burn
7917     The Night of the Following Day
18045                 The Son of No One
28414            In the Name of the Law
Name: title, dtype: object

## **3) Collaborative Filtering**

In [45]:
# Pre-filter Unnecessary Data
df_ratings = df_ratings.groupby('movieId').filter(lambda x: len(x) > 10)  # Keep movies rated by more than 10 users
df_ratings = df_ratings.groupby('userId').filter(lambda x: len(x) > 5)    # Keep users who rated more than 5 movies

# Define the Reader and load the dataset
reader = Reader()
data = Dataset.load_from_df(df_ratings[['userId', 'movieId', 'rating']], reader)

In [46]:
# Initialize the SVD algorithm (Use Stochastic Gradient Descent (SGD) for Faster SVD Fitting)
svd = SVD(n_factors=100, n_epochs=20, lr_all=0.005, reg_all=0.02)

# Use cross_validate instead of evaluate
results = cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, n_jobs=-1, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.7937  0.7948  0.7943  0.7938  0.7937  0.7941  0.0005  
MAE (testset)     0.6008  0.6015  0.6010  0.6007  0.6006  0.6009  0.0003  
Fit time          89.52   96.37   96.89   93.69   86.82   92.66   3.91    
Test time         30.69   28.53   26.01   24.04   23.67   26.59   2.68    


In [47]:
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x419ec6350>

In [48]:
df_ratings[df_ratings['userId'] == 1]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556
5,1,1968,4.0,1425942148
6,1,2762,4.5,1425941300
7,1,2918,5.0,1425941593
8,1,2959,4.0,1425941601
9,1,4226,4.0,1425942228


In [49]:
svd.predict(3, 302)

Prediction(uid=3, iid=302, r_ui=None, est=3.5569135462041306, details={'was_impossible': False})

In [50]:
new_user_id = df_ratings['userId'].max() + 1  # Assign an ID to the new user

# Select random movie IDs from actual movie IDs in df_ratings
movie_ids = df_ratings['movieId'].unique()
random_movie_ids = np.random.choice(movie_ids, size=10, replace=False)

# Generate random user ratings for these movies
new_user_ratings = {int(movie_id): np.random.randint(1, 6) for movie_id in random_movie_ids}

new_user_ratings

{60524: 5,
 58874: 5,
 424: 3,
 45208: 3,
 5607: 3,
 5526: 2,
 58740: 3,
 27865: 3,
 129941: 4,
 115122: 2}

In [51]:
# Step 1: Map raw IDs to inner IDs and collect item factors and biases
item_inner_ids = {}
qi_list = []
bi_list = []
ratings_list = []

for raw_iid, rating in new_user_ratings.items():
    try:
        inner_id = trainset.to_inner_iid(raw_iid)
        item_inner_ids[raw_iid] = inner_id
        qi = svd.qi[inner_id]  # Item factors
        bi = svd.bi[inner_id]  # Item bias
        qi_list.append(qi)
        bi_list.append(bi)
        ratings_list.append(rating)
    except ValueError:
        # The item was not in the training set
        print(f"MovieId {raw_iid} not in training set, skipping.")

# Step 2: Set up the least squares problem to solve for pu and bu
ratings_arr = np.array(ratings_list)
global_mean = trainset.global_mean
bi_arr = np.array(bi_list)
y = ratings_arr - global_mean - bi_arr  # Target variable

qi_arr = np.array(qi_list)  # Design matrix without bias term
X = np.hstack([np.ones((qi_arr.shape[0], 1)), qi_arr])  # Add bias term

# Solve for theta = [bu, pu]
# theta, residuals, rank, s = np.linalg.lstsq(X, y, rcond=None)
# bu = theta[0]
# pu = theta[1:]
# use Ridge regression instead of `np.linalg.lstsq` for efficiency
ridge = Ridge(alpha=1.0)
ridge.fit(X, y)
theta = ridge.coef_
bu = theta[0]
pu = theta[1:]

# Step 3: Predict ratings for all items
# Get all item factors and biases
qi_all = svd.qi
bi_all = svd.bi

# Compute predicted ratings
r_hat = global_mean + bu + bi_all + qi_all.dot(pu)

# Map inner IDs to raw IDs
inner_ids = np.arange(trainset.n_items)
raw_ids = [int(trainset.to_raw_iid(inner_id)) for inner_id in inner_ids]

# Create a DataFrame with predictions
predictions_df = pd.DataFrame({
    'movieId': raw_ids,
    'est_rating': r_hat
})

# Remove movies the user has already rated
rated_movie_ids = set(new_user_ratings.keys())
predictions_df = predictions_df[~predictions_df['movieId'].isin(rated_movie_ids)]

# Step 4: Get top 10 recommendations
top_recommendations = predictions_df.sort_values('est_rating', ascending=False).head(10)

print(f"Top 10 recommendations for new user {new_user_id}:")
print(top_recommendations[['movieId', 'est_rating']])

Top 10 recommendations for new user 270897:
       movieId  est_rating
4934    142115    4.716508
51         527    4.602944
609       1196    4.594887
15424   169906    4.551331
12264   136445    4.543584
19103    86728    4.519422
4839     45210    4.514653
10835    80337    4.505107
144       4993    4.499223
8844    132492    4.493324


In [None]:
top_recommendation_details = pd.merge(
    top_recommendations, 
    df_merged, 
    left_on='movieId', 
    right_on='id'
)

# Select only relevant columns for the JSON response
top_recommendation_details = top_recommendation_details[['movieId', 'imdb_id', 'tmdb_id', 'title', 'release_date', 'genres', 'cast', 'crew', 'keywords', 'est_rating']]

# Convert to dictionary format (or JSON-like structure)
json_response = top_recommendation_details.to_dict(orient='records')

# Print or return the JSON response
print(json.dumps(json_response, indent=4))  # Pretty print the JSON

In [None]:
tmdb_api_key = ' '

tmdb_base_url = "https://api.themoviedb.org/3"
image_base_url = "https://image.tmdb.org/t/p/w500"

tmdb_id = 98491

# Fetch movie details using TMDb ID
def get_movie_details_by_tmdb(tmdb_id):
    url = f"{tmdb_base_url}/movie/{tmdb_id}?api_key={tmdb_api_key}"
    response = requests.get(url)
    return response.json()

# Get thumbnail image URL
def get_thumbnail_url(movie_data):
    if movie_data and 'poster_path' in movie_data:
        return image_base_url + movie_data['poster_path']
    return None

# Example: Using TMDb ID
movie_data = get_movie_details_by_tmdb(tmdb_id)
thumbnail_url = get_thumbnail_url(movie_data)

print(movie_data)

if thumbnail_url:
    print(f"Thumbnail URL: {thumbnail_url}")
else:
    print("No thumbnail available.")

In [None]:
import pickle

# Save the trained SVD model to a file
with open('svd_model.pkl', 'wb') as f:
    pickle.dump(svd, f)