In [135]:
import pandas as pd
import ast
from sklearn.preprocessing import MultiLabelBinarizer
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split


In [136]:
# Load the user ratings and the movie metadata tables.
ratings = pd.read_csv('./The Movies Dataset/ratings_small.csv')
# low_memory=False lets pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns (and the
# associated DtypeWarning) when loading the metadata file.
movies = pd.read_csv('./The Movies Dataset/movies_metadata.csv',
                     low_memory=False)

  movies = pd.read_csv('./The Movies Dataset/movies_metadata.csv')


In [137]:
# Preview the first rows of the ratings table.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [138]:
# Remove the timestamp column. Rebinding (instead of inplace=True) together
# with errors='ignore' keeps this cell safe to re-run: a second execution is
# a no-op rather than a KeyError.
ratings = ratings.drop(columns=['timestamp'], errors='ignore')


In [139]:
# Preview the first rows of the movie metadata table.
movies.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [140]:
# Give the metadata key the same name as the key column in `ratings`.
movies = movies.rename(columns={'id': 'movieId'})

In [141]:
# Remove the homepage and imdb_id columns. errors='ignore' makes the cell
# idempotent, so re-running it does not raise KeyError on the already
# dropped columns.
movies = movies.drop(columns=['homepage', 'imdb_id'], errors='ignore')
movies.shape

(45466, 22)

In [142]:
# Cast the ratings key to str before it is used as the merge key.
ratings = ratings.astype({'movieId': str})

In [143]:
# Attach the movie metadata to every rating row (inner join on movieId).
ratings_movies = ratings.merge(movies, how='inner', on='movieId')

In [144]:
# Show a single merged row to check the joined columns.
ratings_movies.head(1)

Unnamed: 0,userId,movieId,rating,adult,belongs_to_collection,budget,genres,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,1,1371,2.5,False,"{'id': 1575, 'name': 'Rocky Collection', 'post...",17000000,"[{'id': 18, 'name': 'Drama'}]",en,Rocky III,"Now the world champion, Rocky Balboa is living...",...,1982-05-28,270000000.0,99.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The greatest challenge.,Rocky III,False,6.6,894.0


In [145]:
# Install the sentence-transformers library used for sentence embeddings
# ! pip install sentence-transformers pandas

In [146]:
# Select the pre-trained sentence-embedding model
model_name = "sentence-transformers/paraphrase-MiniLM-L6-v2"
sentence_transformer = SentenceTransformer(model_name)

# Module-level scaler instance shared by normalized()
scaler = MinMaxScaler()

def embedding(dataFrame, category, embedding_dim=32):
    """Replace a text column with per-row sentence-embedding vectors.

    Missing values in ``dataFrame[category]`` are first replaced by empty
    strings, then every text is encoded with the module-level
    ``sentence_transformer``. When the encoder's output width differs from
    ``embedding_dim`` the vectors are projected to ``embedding_dim``
    components with PCA.

    Args:
        dataFrame: DataFrame holding the text column; it is modified in place.
        category: Name of the text column to embed.
        embedding_dim: Desired length of each embedding vector.

    Returns:
        The same DataFrame, with ``category`` now holding one list of floats
        per row.
    """
    # Blank out missing texts before encoding.
    dataFrame[category] = dataFrame[category].fillna('')
    texts = dataFrame[category].tolist()

    vectors = sentence_transformer.encode(texts, show_progress_bar=True)

    # Reduce with PCA whenever the encoder width is not the requested one.
    # NOTE(review): PCA presumably needs at least embedding_dim rows and an
    # encoder width >= embedding_dim - confirm for small inputs.
    width_matches = vectors.shape[1] == embedding_dim
    if not width_matches:
        reducer = PCA(n_components=embedding_dim)
        vectors = reducer.fit_transform(vectors)

    # Store each embedding back into the original column as a plain list.
    dataFrame[category] = vectors.tolist()
    return dataFrame

def normalized(dataFrame, category):
    """Rescale one column with the module-level MinMaxScaler.

    The shared ``scaler`` is re-fitted on this column on every call, so the
    scaling of one column does not carry over to the next.

    Args:
        dataFrame: DataFrame holding the column; it is modified in place.
        category: Name of the column to rescale.

    Returns:
        The same DataFrame with ``category`` replaced by its scaled values.
    """
    # Double brackets keep the column 2-D, as the scaler is given a frame.
    single_column = dataFrame[[category]]
    dataFrame[category] = scaler.fit_transform(single_column)
    return dataFrame

def one_hot_encoding(dataFrame, category):
    """Replace a single-valued categorical column with indicator columns.

    One new column is appended per distinct value of ``dataFrame[category]``
    (named after the value itself), and the original column is removed.

    Args:
        dataFrame: Input DataFrame; it is left unmodified.
        category: Name of the categorical column to expand.

    Returns:
        A new DataFrame with the indicator columns appended on the right.
    """
    indicator_columns = pd.get_dummies(dataFrame[category])
    combined = pd.concat([dataFrame, indicator_columns], axis=1)
    return combined.drop(columns=category)

def data_preprocessing(dataFrame):
    """Apply the categorical encoding and numeric scaling steps to a frame.

    The 'adult' and 'original_language' columns are expanded with
    one_hot_encoding(), and 'budget', 'revenue' and 'runtime' are rescaled
    with normalized().

    Args:
        dataFrame: DataFrame containing the five columns named above.

    Returns:
        The transformed DataFrame.
    """
    for categorical_column in ('adult', 'original_language'):
        dataFrame = one_hot_encoding(dataFrame, categorical_column)

    for numeric_column in ('budget', 'revenue', 'runtime'):
        dataFrame = normalized(dataFrame, numeric_column)

    # Previously the function fell off the end and returned None, so the
    # encoded frame (a new object created by one_hot_encoding) was lost.
    return dataFrame


In [147]:
# One-hot encoding helper for the list-valued genres / spoken_languages columns
def one_hot_encoding_2(dataFrame, category, id_value):
    """Multi-hot encode a column of stringified lists of dicts.

    Each cell of ``dataFrame[category]`` is parsed with ``ast.literal_eval``
    into a list of dicts, and the value stored under ``id_value`` is taken
    from every dict. One 0/1 column named ``f"{category}_{label}"`` is
    produced per distinct label and appended to the right of the frame; the
    original ``category`` column is removed.

    Args:
        dataFrame: Input DataFrame; it is not modified.
        category: Name of the column holding the stringified lists.
        id_value: Dict key whose value identifies each label (e.g. 'id').

    Returns:
        A new DataFrame with ``category`` replaced by the indicator columns.
    """
    def extract_labels(cell):
        # Cells are normally string literals; blank strings and missing
        # values (NaN/None) are treated as "no labels" instead of crashing
        # literal_eval, and already-parsed lists are accepted as-is.
        if isinstance(cell, str):
            cell = ast.literal_eval(cell) if cell.strip() else []
        if not isinstance(cell, (list, tuple)):
            return []
        return [entry[id_value] for entry in cell]

    # Kept as a local Series so the caller's frame gains no helper column.
    labels = dataFrame[category].apply(extract_labels)

    mlb = MultiLabelBinarizer()
    encoded = mlb.fit_transform(labels)

    # Reuse the input index: concat(axis=1) aligns on index labels, so a
    # fresh 0..n-1 index would misalign rows (and introduce NaNs) whenever
    # the input frame has been filtered or otherwise re-indexed.
    encoded_df = pd.DataFrame(
        encoded,
        columns=[f"{category}_{label}" for label in mlb.classes_],
        index=dataFrame.index,
    )

    return pd.concat([dataFrame.drop(columns=[category]), encoded_df], axis=1)


In [148]:
# Keep only the ids, the rating and the raw genres column, then expand
# genres into one indicator column per genre id.
ratings_movies = (
    ratings_movies
    .filter(items=['userId', 'movieId', 'rating', 'genres'])
    .pipe(one_hot_encoding_2, 'genres', 'id')
)
ratings_movies.head()


Unnamed: 0,userId,movieId,rating,genres_12,genres_14,genres_16,genres_18,genres_27,genres_28,genres_35,...,genres_80,genres_99,genres_878,genres_9648,genres_10402,genres_10749,genres_10751,genres_10752,genres_10769,genres_10770
0,1,1371,2.5,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,4,1371,4.0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,7,1371,3.0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,19,1371,4.0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,21,1371,3.0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [149]:
# Separate the target (rating) from the remaining feature columns.
y = ratings_movies['rating']
X = ratings_movies.drop(columns='rating')

In [150]:
# Split into train and test sets (test_size=0.2, fixed random_state).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)