<a href="https://colab.research.google.com/github/RoRdil31/MachineLearning_TermProject/blob/main/Collaborative_filtering_pca_svd.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Load the user ratings and the movie metadata tables.
ratings = pd.read_csv("ratings_small.csv")
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk; chunked inference can emit a DtypeWarning and
# leave a column with mixed types when the file contains malformed rows.
movies = pd.read_csv("movies_metadata.csv", low_memory=False)

  movies = pd.read_csv("movies_metadata.csv")


In [None]:
# Data Processing
# Give the metadata key the same name as the key used in `ratings`
movies = movies.rename(columns={"id": "movieId"})

# Remove rows with non-numerical values in the movieId column and change the data type
def is_number(s):
    """Return True when ``s`` can be parsed as a finite number.

    Parameters
    ----------
    s : object
        Value to test, typically a string or a number.

    Returns
    -------
    bool
        True if ``float(s)`` succeeds and the result is finite; False for
        unparsable text, ``None`` and other non-numeric objects, NaN, and
        infinities.
    """
    try:
        # NaN and +/-inf parse successfully as floats but cannot be cast to an
        # integer afterwards, so they are treated as "not a number" here.
        return bool(np.isfinite(float(s)))
    except (TypeError, ValueError, OverflowError):
        # TypeError: None / containers; ValueError: unparsable text;
        # OverflowError: integers too large to be represented as a float.
        return False

# Keep only the rows whose movieId parses as a number. The explicit copy makes
# the filtered frame independent of the frame it was sliced from, so the column
# assignment below does not trigger pandas' SettingWithCopyWarning.
movies = movies[movies['movieId'].apply(is_number)].copy()
movies['movieId'] = movies['movieId'].astype(np.int64)

# Attach the metadata of the rated movie to every rating row
movie_ratings = ratings.merge(movies, on='movieId')

# Build the rating matrix (pivoted per user and title, then flipped so that
# titles are the rows); entries without a rating are filled with zero
data = (
    movie_ratings
    .pivot_table(values='rating', index='userId', columns='title')
    .fillna(0)
    .T
)

# Show the users who actually rated one sample title
movie_ratings = data.loc[data.index == '!Women Art Revolution'].T
non_zero_values = movie_ratings.loc[movie_ratings['!Women Art Revolution'].ne(0)]
print(non_zero_values)

title   !Women Art Revolution
userId                       
73                        3.0
402                       3.5


In [None]:
# Calculate the pairwise cosine similarity between the rows of `data`
cossim = cosine_similarity(data)
movie_sim_df = pd.DataFrame(cossim, index=data.index, columns=data.index)

# Top 10 movies similar to a particular movie.
# The query movie is removed by label before ranking. Skipping "position 0"
# after sorting is not reliable: any other movie that ties with it at the
# maximum similarity may sort first, which would leave the query movie in its
# own list and push a genuine neighbour out.
top_k = 10
similar_movies = (
    movie_sim_df["!Women Art Revolution"]
    .drop(labels="!Women Art Revolution")
    .sort_values(ascending=False)
    .head(top_k)
)
print(similar_movies)

title
Premium Rush            0.996097
Shadow Run              0.759257
The Eleventh Victim     0.759257
The English Patient     0.650791
Stick                   0.650791
Steel                   0.650791
I Can't Sleep           0.650791
The Bride Goes Wild     0.650791
Hornblower: Loyalty     0.650791
The Canterbury Tales    0.650791
Name: !Women Art Revolution, dtype: float64


In [None]:
# Apply PCA: project every row of `data` onto 10 principal components.
# random_state pins the result when a randomized/arpack solver is selected,
# so re-running the notebook reproduces the same neighbours.
pca = PCA(n_components=10, random_state=42)
data_pca = pca.fit_transform(data)

cossim_pca = cosine_similarity(data_pca)
movie_sim_df_pca = pd.DataFrame(cossim_pca, index=data.index, columns=data.index)

# Top 10 movies similar to a particular movie.
# The query movie is dropped by label rather than by skipping the first sorted
# position, because ties at the maximum similarity can place another movie
# ahead of it.
top_k = 10
(
    movie_sim_df_pca["!Women Art Revolution"]
    .drop(labels="!Women Art Revolution")
    .sort_values(ascending=False)
    .head(top_k)
)

title
Premium Rush                           0.998450
Paris is Burning                       0.996717
War of the Buttons                     0.995780
Julie                                  0.993127
Hornblower: Loyalty                    0.992657
The Legend of the 7 Golden Vampires    0.992657
Girl Shy                               0.992657
Blind                                  0.992657
Gladiator                              0.992657
The Pet                                0.992374
Name: !Women Art Revolution, dtype: float64

In [None]:
from sklearn.decomposition import TruncatedSVD

# Apply SVD: project every row of `data` onto 10 latent components.
# TruncatedSVD uses a randomized algorithm by default, so random_state is set
# to make the projection (and the ranking below) reproducible across runs.
svd = TruncatedSVD(n_components=10, random_state=42)
data_svd = svd.fit_transform(data)

cossim_svd = cosine_similarity(data_svd)
movie_sim_df_svd = pd.DataFrame(cossim_svd, index=data.index, columns=data.index)

# Top 10 movies similar to a particular movie.
# The query movie is dropped by label rather than by skipping the first sorted
# position, because ties at the maximum similarity can place another movie
# ahead of it.
top_k = 10
(
    movie_sim_df_svd["!Women Art Revolution"]
    .drop(labels="!Women Art Revolution")
    .sort_values(ascending=False)
    .head(top_k)
)

title
Premium Rush                        0.998569
Paris is Burning                    0.986570
War of the Buttons                  0.976640
Bangkok Traffic Love Story          0.973665
Last Woman on Earth                 0.973499
5 Days of War                       0.969853
'Twas the Night Before Christmas    0.964669
Posti in piedi in paradiso          0.964586
The Pebble and the Penguin          0.963995
Hell's Hinges                       0.961613
Name: !Women Art Revolution, dtype: float64

In [None]:
# Divide data into train and test sets.
# `data` is transposed first, so the split is made over the rows of the
# transposed matrix and every row keeps its full set of columns.
train_data, test_data = train_test_split(data.transpose(), test_size=0.2, random_state=42)

# Calculate cosine similarity between the training rows
cossim_train = cosine_similarity(train_data)

# Apply PCA (fitted on the training rows only).
# random_state keeps the projection reproducible when a randomized solver is used.
pca = PCA(n_components=10, random_state=42)
data_pca_train = pca.fit_transform(train_data)
cossim_pca_train = cosine_similarity(data_pca_train)

# Apply SVD (fitted on the training rows only).
# TruncatedSVD is randomized by default, hence the explicit seed.
svd = TruncatedSVD(n_components=10, random_state=42)
data_svd_train = svd.fit_transform(train_data)
cossim_svd_train = cosine_similarity(data_svd_train)

In [None]:
# User-item rating prediction function
def predict_ratings(similarity_matrix, ratings_matrix):
    """Predict ratings as a similarity-weighted average of known ratings.

    Parameters
    ----------
    similarity_matrix : array-like of shape (n_targets, n_neighbors)
        Similarity between each target row and each neighbour row.
    ratings_matrix : array-like of shape (n_neighbors, n_items)
        Ratings given by the neighbour rows.

    Returns
    -------
    Array of shape (n_targets, n_items) where row ``i`` is
    ``sum_j sim[i, j] * ratings[j] / sum_j |sim[i, j]|``.
    A target whose similarities are all zero gets a row of zeros instead of
    NaN (0 / 0).
    """
    weighted_sum = similarity_matrix.dot(ratings_matrix)
    # keepdims=True yields a column vector so the division broadcasts per row.
    weight_totals = np.abs(np.asarray(similarity_matrix)).sum(axis=1, keepdims=True)
    # When every weight is zero the numerator row is zero as well; dividing by 1
    # keeps that row at zero and avoids a 0/0 division.
    safe_totals = np.where(weight_totals == 0, 1, weight_totals)
    pred = weighted_sum / safe_totals
    return pred

# Evaluate each similarity space on the held-out test rows.
#
# For every test row, ratings are predicted as the similarity-weighted average
# of the *training* rows' ratings, where similarity is measured between the
# test row and each training row in the chosen space (raw ratings, PCA
# components, SVD components). All three methods are therefore scored on the
# same target: the actual ratings in `test_data`.
#
# NOTE(review): this relies on `pca` and `svd` being the models fitted on
# `train_data` in the preceding cell - run the cells in order.
# NOTE(review): similarity is computed from each test row's full rating
# vector, so the scores are an optimistic estimate of true hold-out error.
train_data_array = np.asarray(train_data)
test_data_array = np.asarray(test_data)

# A value of 0 marks "no rating", so only the entries that were actually rated
# contribute to the error.
rated_mask = test_data_array != 0


def _predict_for_test_rows(test_features, train_features):
    """Predict ratings for the test rows from their similarity to the training rows."""
    similarity = cosine_similarity(test_features, train_features)
    return predict_ratings(similarity, train_data_array)


# Calculate the predicted rating of the test data
predicted_ratings_cosine = _predict_for_test_rows(test_data, train_data)
predicted_ratings_pca = _predict_for_test_rows(pca.transform(test_data), data_pca_train)
predicted_ratings_svd = _predict_for_test_rows(svd.transform(test_data), data_svd_train)

# Comparison of predicted and actual ratings
mse_cosine = mean_squared_error(test_data_array[rated_mask], predicted_ratings_cosine[rated_mask])
mse_pca = mean_squared_error(test_data_array[rated_mask], predicted_ratings_pca[rated_mask])
mse_svd = mean_squared_error(test_data_array[rated_mask], predicted_ratings_svd[rated_mask])

# MSE output of ratings predicted by each method
print(f"코사인 유사도를 사용한 예측의 MSE: {mse_cosine}")
print(f"PCA를 사용한 예측의 MSE: {mse_pca}")
print(f"SVD를 사용한 예측의 MSE: {mse_svd}")

코사인 유사도를 사용한 예측의 MSE: 9.788795134502578
PCA를 사용한 예측의 MSE: 14.715668829667662
SVD를 사용한 예측의 MSE: 22.29978761850246
