In [1]:
# visualization
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
# Register the NanumGothic TrueType font with Matplotlib and make it the default
# font family (presumably so Korean labels render in plots — confirm).
fe = fm.FontEntry(
    fname=r'../resources/fonts/NanumGothic.ttf', # path where the ttf file is stored
    name='NanumGothic')                        # name under which this font is registered
fm.fontManager.ttflist.insert(0, fe)              # add the font entry to Matplotlib's font list
plt.rcParams.update({'font.size': 10, 'font.family': 'NanumGothic'}) # default font size and family
plt.rc('font', family='NanumGothic')  # sets font.family again to the same value as the line above
import seaborn as sns
import ast

# utils
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle
import warnings;warnings.filterwarnings('ignore')

from sklearn.metrics.pairwise import cosine_similarity 

In [2]:
# Load the required data. Adjust the paths to match your environment.
# movie_dt holds the movie metadata table and rating_dt the ratings table.
movie_path = '../datasets/movies_metadata.csv'
rating_path = '../datasets/ratings.csv'
movie_dt = pd.read_csv(movie_path)
rating_dt = pd.read_csv(rating_path)

In [3]:
movie_dt.shape

(45466, 24)

In [4]:
# Columns with too many missing values are treated as uninformative: with roughly
# 45,000 movies, 35,000 missing values is about three quarters of the column.
# (The original note mentioned 32,000, but the threshold used by the code is 35,000.)
# Despite its name, drop_column_list holds the columns that are KEPT
# (those with at most 35,000 missing values).
drop_column_list = list(movie_dt.columns[movie_dt.isnull().sum()<=35000])
movie_dt = movie_dt[drop_column_list]

# Movies whose status is not 'Released' cannot be watched, so they are excluded.
# The dataset is a historical snapshot, so titles marked as Post Production,
# In Production, etc. may already have been released; until the movie data is
# refreshed they are simply removed from this dataset.
# Rows with a missing status already fail the equality test, so the trailing
# dropna(subset=['status']) does not remove anything further.
movie_dt = movie_dt[movie_dt['status'] == 'Released'].dropna(subset=['status'])

# Drop columns that do not look necessary at this stage.
movie_dt = movie_dt.drop(['overview', 'poster_path', 'tagline', 'status', 'spoken_languages'], axis=1)

In [5]:
# Convert the popularity column to float dtype; astype raises if any value cannot be parsed as a number.
movie_dt['popularity'] = movie_dt['popularity'].astype(float)

In [6]:
def extract_genre_names(genres_string):
    """Return the genre names stored in one ``genres`` cell as a single string.

    Parameters
    ----------
    genres_string : str
        Text form of a Python list of dicts that each carry a ``'name'`` key,
        e.g. ``"[{'id': 16, 'name': 'Animation'}]"``.

    Returns
    -------
    str
        The genre names joined by ``', '``. An empty string is returned when
        the cell is missing (``None``/NaN or any non-string value), blank, or
        lists no genres.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when a non-blank string is not a
        valid Python literal.
    """
    # Missing cells arrive as NaN (a float) or None; literal_eval would raise on them.
    if not isinstance(genres_string, str) or not genres_string.strip():
        return ''
    genres_list = ast.literal_eval(genres_string)  # parse the string into Python objects
    return ', '.join(genre['name'] for genre in genres_list)  # keep only the names

# Add a new column: the comma-separated genre names extracted from each row's 'genres' value.
movie_dt['genre_names'] = movie_dt['genres'].apply(extract_genre_names)

In [7]:
from implicit.als import AlternatingLeastSquares
from scipy.sparse import csr_matrix

# Convert the ratings to a sparse matrix in CSR format.
# Rows are indexed by userId, columns by movieId, and the stored values are the ratings.
# The raw id values are used directly as row/column positions, so the matrix shape is
# (max userId + 1, max movieId + 1) and positions for ids that never occur stay empty.
user_item_matrix = csr_matrix((rating_dt['rating'], (rating_dt['userId'], rating_dt['movieId'])))


In [8]:

# Initialise the ALS model. A fixed random_state seeds the factor initialisation so
# that retraining after a kernel restart gives repeatable recommendations.
model = AlternatingLeastSquares(factors=10, regularization=0.1, iterations=20, random_state=42)
# Train the model on the user x item rating matrix.
model.fit(user_item_matrix)


  0%|          | 0/20 [00:00<?, ?it/s]

In [9]:
# Persist the trained model object to 'saved_model.pkl' (relative path, so it lands in
# the current working directory). Only unpickle this file from a trusted source.
with open('saved_model.pkl', 'wb') as f:
    pickle.dump(model, f)

In [10]:
# Given a user id, print the titles of the recommended movies (five by default).
def recommend_movie_by_als(user_id, n_movies=5):
    """Print the titles of the ALS recommendations for one user.

    Uses the notebook-level ``model``, ``user_item_matrix`` and ``movie_dt``.

    Parameters
    ----------
    user_id : int
        Row of ``user_item_matrix`` to recommend for.
    n_movies : int, default 5
        Number of recommendations requested from the model.

    Returns
    -------
    None
        One array of matching titles is printed per recommended id.

    NOTE(review): recommended ids are matched against ``movie_dt.index``;
    confirm that this index really corresponds to the movieId values used to
    build ``user_item_matrix``.
    """
    user_row = user_item_matrix[user_id]
    result = model.recommend(user_id, user_row, N=n_movies)
    recommended_ids = result[0]  # first element holds the recommended item ids
    for item_id in recommended_ids:
        is_match = movie_dt.index == item_id
        print(movie_dt.loc[is_match, 'title'].values)
    

In [11]:
# Id generator used when adding a new user's information.
def make_new_userId():
    """Return an unused user id: one more than the largest ``userId`` in ``rating_dt``."""
    highest_existing_id = rating_dt['userId'].max()
    return highest_existing_id + 1

In [12]:
recommend_movie_by_als(3)

["L'Enfer"]
['Guardian Angel']
['The Red Violin']
['Jumanji']
['Last Action Hero']


In [13]:
# To mitigate the cold-start problem, pick the most popular movies of each genre
# (two per genre by default) and ask the new user to rate them.
# Returns the list of movies to ask about.
# TODO: this surfaces many movies that are famous but that people probably have not
# actually watched; the selection algorithm may need to change (e.g. Titanic is missing?).

def movielist_for_coldstart(genres=None, per_genre=2, max_movies=10):
    """Return the movies a brand-new user is asked to rate.

    For every genre, the ``per_genre`` rows of the notebook-level ``movie_dt``
    with the highest ``popularity`` are collected; duplicated rows are removed
    and the result is cut to the first ``max_movies`` rows.

    Parameters
    ----------
    genres : sequence of str, optional
        Genre names searched for as plain substrings of ``movie_dt['genres']``.
        Defaults to Action, Fantasy, Animation, Horror, Thriller, Comedy,
        Romance and Drama, in that order.
    per_genre : int, default 2
        Number of most popular movies taken per genre.
    max_movies : int, default 10
        Maximum number of rows returned.

    Returns
    -------
    pandas.DataFrame
        Rows of ``movie_dt`` in genre order. With the defaults up to 16
        candidates are gathered and only the first 10 are kept, so genres late
        in the list may not be represented.
    """
    if genres is None:
        genres = ['Action', 'Fantasy', 'Animation', 'Horror', 'Thriller', 'Comedy', 'Romance', 'Drama']

    picks = []
    for genre in genres:
        # na=False: rows with a missing genres value count as "no match" instead of
        # producing NaN, which would make the boolean indexing below fail.
        # regex=False: the genre name is matched literally.
        in_genre = movie_dt['genres'].str.contains(genre, regex=False, na=False)
        top_movies = movie_dt[in_genre].nlargest(per_genre, 'popularity')
        if not top_movies.empty:
            picks.append(top_movies)

    if not picks:
        # No genre matched anything: return an empty frame with the same columns.
        return movie_dt.iloc[0:0]

    result_list = pd.concat(picks)
    # A movie can be a top pick for several genres; keep it only once.
    return result_list.drop_duplicates().head(max_movies)

In [14]:
first_list = movielist_for_coldstart()

In [68]:
first_list.index.array

<NumpyExtensionArray>
[33356, 43644, 42222, 30700, 24455, 42902, 45202, 24351, 23675, 44271]
Length: 10, dtype: int64

In [15]:
# Example of the intended flow:
# ask the new user to rate just 10 movies and recommend based on those ratings.

new_user_ratings = {
    'userId': make_new_userId(),
    'movieId': first_list.index.array,
    'rating': [4, 5, 3, 4, 2, 5, 3, 4, 4, 5]  # ratings entered by the user
    # TODO: collect these ratings from real user input
}

# Convert to a DataFrame (one row per rated movie).
new_ratings_df = pd.DataFrame(new_user_ratings)

# Number of columns of the user x item matrix, i.e. the length of one user's rating vector.
existing_movie_ids = user_item_matrix.shape[1]

# Rating vector for the new user, initialised to "no rating" everywhere.
new_user_vector = np.zeros(existing_movie_ids)

# Fill in the new user's ratings in one vectorised assignment. Iterating with
# iterrows() would upcast every row to a common dtype, so a fractional rating such
# as 4.5 would turn the movie id into a float and make the index assignment fail.
new_user_vector[new_ratings_df['movieId'].to_numpy()] = new_ratings_df['rating'].to_numpy()

In [26]:
rating_dt['userId'].max()+1

672

In [28]:
recommend_movie_by_als(671)

['Serial Mom']
["I'm Not Rappaport"]
['Love and a .45']
["Herod's Law"]
['Bogus']


## ===========================================================================================

In [35]:
# Dense user x movie rating table: one row per userId, one column per movieId.
# User/movie pairs without a rating are filled with 0.
rating_table = pd.pivot_table(data=rating_dt, index='userId', columns='movieId',values='rating', fill_value= 0)

In [36]:
rating_table.head(5)

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
# Centre every user's row on that user's mean rating.
# The mean is taken over the whole row of rating_table, so any zero-filled
# (unrated) entries are included in it.
matrix = rating_table.to_numpy()
user_rating_mean = matrix.mean(axis=1)
matrix_user_mean = matrix - user_rating_mean[:, np.newaxis]

In [40]:
user_mean_df = pd.DataFrame(data = matrix_user_mean)
user_mean_df 

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9056,9057,9058,9059,9060,9061,9062,9063,9064,9065
0,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,...,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625
1,-0.029230,-0.029230,-0.029230,-0.029230,-0.029230,-0.029230,-0.029230,-0.029230,-0.029230,3.970770,...,-0.029230,-0.029230,-0.029230,-0.029230,-0.029230,-0.029230,-0.029230,-0.029230,-0.029230,-0.029230
2,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,...,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075
3,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,3.902162,...,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838
4,-0.043128,-0.043128,3.956872,-0.043128,-0.043128,-0.043128,-0.043128,-0.043128,-0.043128,-0.043128,...,-0.043128,-0.043128,-0.043128,-0.043128,-0.043128,-0.043128,-0.043128,-0.043128,-0.043128,-0.043128
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
666,-0.027355,-0.027355,-0.027355,-0.027355,-0.027355,3.972645,-0.027355,-0.027355,-0.027355,-0.027355,...,-0.027355,-0.027355,-0.027355,-0.027355,-0.027355,-0.027355,-0.027355,-0.027355,-0.027355,-0.027355
667,-0.008273,-0.008273,-0.008273,-0.008273,-0.008273,-0.008273,-0.008273,-0.008273,-0.008273,-0.008273,...,-0.008273,-0.008273,-0.008273,-0.008273,-0.008273,-0.008273,-0.008273,-0.008273,-0.008273,-0.008273
668,-0.013677,-0.013677,-0.013677,-0.013677,-0.013677,-0.013677,-0.013677,-0.013677,-0.013677,-0.013677,...,-0.013677,-0.013677,-0.013677,-0.013677,-0.013677,-0.013677,-0.013677,-0.013677,-0.013677,-0.013677
669,3.986984,-0.013016,-0.013016,-0.013016,-0.013016,-0.013016,-0.013016,-0.013016,-0.013016,-0.013016,...,-0.013016,-0.013016,-0.013016,-0.013016,-0.013016,-0.013016,-0.013016,-0.013016,-0.013016,-0.013016


In [48]:
from scipy.sparse.linalg import svds
# Truncated SVD of the mean-centred rating matrix, keeping k=12 singular values/vectors.
U, sigma, Vt = svds(matrix_user_mean, k =12)

In [49]:
print(U.shape)
print(sigma.shape)
print(Vt.shape)

(671, 12)
(12,)
(12, 9066)


In [50]:
# Turn the 1-D array of singular values into a diagonal matrix for the matrix products below.
# This rebinds `sigma`, so the cell is not idempotent: running it twice on the already
# diagonal matrix would extract its diagonal again instead.
sigma = np.diag(sigma)

In [55]:
# Rebuild the low-rank approximation U * sigma * Vt and add each user's mean back
# to obtain predicted ratings for every user/movie pair.
svd_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_rating_mean.reshape(-1,1)
# Columns carry the movieId labels of rating_table; rows keep a default 0-based index.
df_svd_preds = pd.DataFrame(svd_user_predicted_ratings, columns = rating_table.columns)
df_svd_preds.head(5)

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
0,-0.079686,0.021779,-0.013837,-0.00587,-0.028877,0.032371,0.000715,-0.004428,-0.005219,0.038195,...,-0.004324,-0.004352,0.010478,-0.004256,-0.003944,-0.005674,0.018157,-0.005575,-0.005297,-0.003766
1,1.428452,1.608841,0.529476,0.168278,0.520809,1.107473,0.529719,0.089376,0.29627,1.970031,...,0.013227,-0.002275,0.02068,-0.005245,-0.007644,-0.021019,0.031243,-0.000957,-0.000753,0.026901
2,0.977246,0.396971,0.000299,0.027444,0.021287,0.141458,-0.057134,0.031633,-0.012538,0.383576,...,0.002761,0.004907,-0.01419,-0.000251,-0.006007,-0.003189,-0.026916,0.014637,0.013287,-0.005741
3,1.870844,1.169993,0.252202,0.094831,-0.181713,-0.511953,-0.02782,-0.14308,0.013247,1.461694,...,0.026412,-0.027245,0.054681,0.01845,0.034544,-0.03574,0.088889,-0.019365,-0.017113,0.066559
4,1.182777,0.924903,0.075998,0.061505,0.60268,-0.159825,0.339925,0.081534,-0.079666,0.535018,...,-0.029124,-0.029357,0.009064,-0.029092,-0.03089,-0.057453,0.026344,-0.024027,-0.024614,-0.032752


In [56]:
def recommend_movie(df_svd_preds, user_id, ori_movie_df, ori_rating_df, num_recommendations=5):
    """Return a user's rating history and the top predicted unseen movies.

    Parameters
    ----------
    df_svd_preds : pandas.DataFrame
        Predicted ratings, one row per user and one column per movieId.
        The row for ``user_id`` is taken positionally at ``user_id - 1``
        (assumes user ids are contiguous and start at 1 — confirm for new data).
    user_id : int
        Id of the user to recommend for.
    ori_movie_df : pandas.DataFrame
        Movie table; must contain a ``movieId`` column.
    ori_rating_df : pandas.DataFrame
        Ratings table with ``userId``, ``movieId`` and ``rating`` columns.
    num_recommendations : int, default 5
        Maximum number of recommended movies returned.

    Returns
    -------
    user_history : pandas.DataFrame
        The user's ratings merged with the movie table, highest rating first.
    recommendations : pandas.DataFrame
        Movies absent from ``user_history`` with a ``Predictions`` column,
        sorted by predicted rating (descending) and limited to
        ``num_recommendations`` rows.
    """
    user_row_number = user_id - 1

    # Name the prediction column and the id axis explicitly so the merge below does
    # not depend on how df_svd_preds happens to label its rows and columns.
    user_predictions = (
        df_svd_preds.iloc[user_row_number]
        .rename('Predictions')
        .rename_axis('movieId')
        .reset_index()
    )

    # Everything this user has already rated, best rated first.
    user_data = ori_rating_df[ori_rating_df['userId'] == user_id]
    user_history = user_data.merge(ori_movie_df, on='movieId').sort_values('rating', ascending=False)

    # Candidates are the movies that do not appear in the user's history.
    unseen_movies = ori_movie_df[~ori_movie_df['movieId'].isin(user_history['movieId'])]
    recommendations = (
        unseen_movies.merge(user_predictions, on='movieId')
        .sort_values('Predictions', ascending=False)
        .head(num_recommendations)  # honour the requested number of recommendations
    )
    return user_history, recommendations

In [68]:
# Expose the DataFrame's row index as a 'movieId' column so movie_dt can be merged on 'movieId'.
# NOTE(review): the row index of movie_dt is used as the movie id here; nothing in this
# notebook shows that it matches the movieId values in rating_dt — confirm the id mapping
# before trusting titles obtained through this column.
movie_dt['movieId'] = movie_dt.index

In [69]:
# Rating history and SVD-based recommendations for user 330, requesting 10 recommendations.
already_rated, predictions = recommend_movie(df_svd_preds, 330, movie_dt, rating_dt, 10)
already_rated.info()

<class 'pandas.core.frame.DataFrame'>
Index: 95 entries, 47 to 66
Data columns (total 28 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   userId                 95 non-null     int64  
 1   movieId                95 non-null     int64  
 2   rating                 95 non-null     float64
 3   timestamp              95 non-null     int64  
 4   adult                  95 non-null     object 
 5   belongs_to_collection  13 non-null     object 
 6   budget                 95 non-null     object 
 7   genres                 95 non-null     object 
 8   homepage               5 non-null      object 
 9   id                     95 non-null     object 
 10  imdb_id                95 non-null     object 
 11  original_language      95 non-null     object 
 12  original_title         95 non-null     object 
 13  overview               93 non-null     object 
 14  popularity             95 non-null     object 
 15  poster_path 

https://lsjsj92.tistory.com/570

 ## =====================================

In [9]:
# Item-item cosine similarity. rating_table has one row per user and one column per
# movie, and cosine_similarity compares rows, so the table is transposed first to
# compare movies with each other rather than users.
item_based_collabor = cosine_similarity(rating_table.T)
# Rows/columns are labelled by position (0 .. n_movies-1, in rating_table.columns order),
# not by movieId; use rating_table.columns[position] to recover the movieId.
item_based_collabor = pd.DataFrame(data = item_based_collabor)

In [11]:
def get_item_based_collabor(movieId):
    """Return the six highest similarity scores for one item.

    Parameters
    ----------
    movieId : hashable
        Column label of the notebook-level ``item_based_collabor`` frame
        (whether that label is a real movieId or a position depends on how the
        frame was built — verify against the cell that creates it).

    Returns
    -------
    pandas.Series
        The six largest entries of that column in descending order; the item
        itself normally comes first with similarity 1.0.
    """
    similarities = item_based_collabor[movieId]
    ranked = similarities.sort_values(ascending=False)
    return ranked.iloc[:6]
get_item_based_collabor(3738)

3738    1.000000
2942    0.307614
2       0.135826
2661    0.109872
6358    0.000000
6352    0.000000
Name: 3738, dtype: float64