In [None]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from utils import SEED

In [None]:
# Apply the 'seaborn-v0_8' matplotlib style sheet to the figures drawn below.
plt.style.use('seaborn-v0_8')

In [None]:
# Directory prefix of the input CSV files. File names are appended by plain
# string concatenation, so the trailing slash is required.
DATA_DIR = 'data/Movies/'

In [None]:
# Load the ratings table; later cells read its 'userId', 'movieId' and
# 'rating' columns.
ratings = pd.read_csv(DATA_DIR + 'ratings.csv')

In [None]:
# Preview the first rows of the ratings table.
ratings.head()

In [None]:
# Load the movies table; later cells read its 'movieId', 'title' and
# 'genres' columns.
movies = pd.read_csv(DATA_DIR + 'movies.csv')

In [None]:
# Preview the first rows of the movies table.
movies.head()

In [None]:
# Number of distinct movie ids in the movies table.
movies['movieId'].nunique()

In [None]:
# Distinct rating values that occur in the ratings table.
ratings['rating'].unique()

In [None]:
# Take the first parenthesised four-digit number in each title as the release
# year (kept as text for now); titles without such a group get a missing value.
# expand=False makes str.extract return a Series rather than a one-column frame.
movies['year'] = movies['title'].str.extract(r'\((\d{4})\)', expand=False)

In [None]:
# Show the movies whose title did not yield a release year.
movies.loc[movies['year'].isna()]

In [None]:
# Impute missing release years with the mean of the known years, truncated to
# a whole year.
has_year = ~movies['year'].isna()
mean_year = movies.loc[has_year, 'year'].astype(int).mean()

# Assign the filled column back instead of calling fillna(inplace=True) on
# movies['year']: that chained in-place call works on a temporary object and
# leaves `movies` unchanged when pandas Copy-on-Write is active.
# The fill value is written as text, like the extracted years, so the column
# stays homogeneous until it is cast to int.
movies['year'] = movies['year'].fillna(str(int(mean_year)))

In [None]:
# Release years as an integer NumPy array; used for the histogram and as the
# input of the scaler.
year = movies['year'].astype(int).values

In [None]:
# Display the movies table, now including the 'year' column.
movies

In [None]:
# Histogram of the release years, using matplotlib's default binning.
fig, ax = plt.subplots(figsize=(5, 4))
ax.hist(year)

# Bold title and axis labels so the figure can be read on its own.
ax.set_title('Year of Movie Release Histogram', fontweight='bold')
ax.set_xlabel('Year', fontweight='bold')
ax.set_ylabel('Number of Occurrences', fontweight='bold')
fig.tight_layout()

In [None]:
# Min-max scaler, created with its default settings, used to rescale the
# release years.
scaler = MinMaxScaler()

In [None]:
# The scaler works on 2-D input, so the 1-D year vector is passed as a single
# column. The result is a column array with one scaled value per movie.
year_scaled = scaler.fit_transform(year[:, np.newaxis])

In [None]:
# Replace the 'year' column with its min-max scaled values
# (year_scaled is a single-column 2-D array with one row per movie).
movies['year'] = year_scaled

In [None]:
# Turn each movie's '|'-separated genre string into a list of genre names.
movies['genres'] = movies['genres'].str.split('|')

In [None]:
# Occurrences per genre: every genre listed for a movie contributes one count.
movies['genres'].explode().value_counts()

In [None]:
# The same per-genre counts, kept in a variable for the bar chart.
genres_value_counts = movies['genres'].explode().value_counts()

In [None]:
# Bar chart of how often each genre occurs.
fig, ax = plt.subplots(figsize=(10, 4))
ax.bar(genres_value_counts.index, genres_value_counts.values)
ax.set_title('Genre Histogram', fontweight='bold')
ax.set_xlabel('Genre', fontweight='bold')
ax.set_ylabel('Number of Occurrences', fontweight='bold')

# Re-apply the current tick positions and labels unchanged, only rotating the
# labels by 90 degrees so the genre names do not overlap.
ticks = ax.get_xticklabels()
tick_positions = [tick.get_position()[0] for tick in ticks]
tick_labels = [tick.get_text() for tick in ticks]
ax.set_xticks(tick_positions, tick_labels, rotation=90)
fig.tight_layout()

In [None]:
# Token-count encoder, created with its default settings, used to turn the
# genre strings into one count column per genre token.
vectorizer = CountVectorizer()

In [None]:
def get_ingredient(row):
    """Join the genre names of one movie into a single space-separated string.

    Spaces and hyphens inside a genre name are replaced by underscores, so a
    multi-word genre stays one whitespace-delimited token
    (for example 'Sci-Fi' becomes 'Sci_Fi').

    The input is only read, never modified: the genre lists stored in the
    DataFrame keep their original spelling, and re-running earlier cells
    gives the same result. Any iterable of strings is accepted.

    Parameters
    ----------
    row : iterable of str
        Genre names of a single movie.

    Returns
    -------
    str
        The normalised genre names joined by single spaces; '' for an empty
        input.
    """
    return ' '.join(
        genre.replace(' ', '_').replace('-', '_') for genre in row
    )

# One space-separated genre string per movie, the input format used for the
# vectorizer below.
genres = movies['genres'].apply(get_ingredient)

In [None]:
# Learn the genre vocabulary and encode every movie as a row of token counts
# (converted to a dense array further down with .toarray()).
genres_encoded = vectorizer.fit_transform(genres)

In [None]:
# Show the vocabulary learned by the vectorizer; these names become the genre
# column names.
vectorizer.get_feature_names_out()

In [None]:
# Dense genre-count table: one row per movie, one column per vocabulary entry.
genres_df = pd.DataFrame(
    data=genres_encoded.toarray(),
    columns=vectorizer.get_feature_names_out(),
)

In [None]:
# (number of movies, number of genre columns)
genres_df.shape

In [None]:
# Start the item feature table from the movie id and the scaled release year.
movies_processed = movies[['movieId', 'year']]

In [None]:
# Append the genre count columns side by side; rows are matched on the index.
movies_processed = pd.concat([movies_processed, genres_df], axis='columns')

In [None]:
# Keep only ratings of 4 or higher and draw at most 200000 of them at random.
# The draw is seeded with SEED so that re-running the notebook produces the
# same sample, and therefore the same saved dataset.
# (Assumes SEED is a valid pandas `random_state`, as it already is for
# train_test_split further down.)
liked_ratings = ratings[ratings['rating'] >= 4]
ratings_sample = liked_ratings.sample(
    # Never ask for more rows than exist; sample() would raise otherwise.
    n=min(200000, len(liked_ratings)),
    random_state=SEED,
)

In [None]:
# Sorted, duplicate-free array of the movie ids that occur both in the sampled
# ratings and in the item feature table. np.intersect1d removes duplicates
# from its inputs itself, so the id columns are passed in as they are.
common_items_ids = np.intersect1d(
    ratings_sample['movieId'].values,
    movies_processed['movieId'].values,
)

In [None]:
# Restrict both tables to the movies they have in common, so every rated movie
# has a feature row and every feature row belongs to a rated movie.
movies_processed = movies_processed.loc[
    movies_processed['movieId'].isin(common_items_ids)
]
ratings_sample = ratings_sample.loc[
    ratings_sample['movieId'].isin(common_items_ids)
]

In [None]:
# Number of distinct movies in the sampled ratings, as a 1-tuple.
ratings_sample['movieId'].unique().shape

In [None]:
# Number of distinct users in the sampled ratings, as a 1-tuple.
ratings_sample['userId'].unique().shape

In [None]:
# Map raw user and movie ids to contiguous 0-based indices, numbered in order
# of first appearance in the respective table.
user_mapping = {
    raw_user_id: index
    for index, raw_user_id in enumerate(ratings_sample['userId'].unique())
}
item_mapping = {
    raw_movie_id: index
    for index, raw_movie_id in enumerate(movies_processed['movieId'].unique())
}

In [None]:
# Translate the raw ids of every sampled rating into the contiguous indices.
user_ids = torch.LongTensor(
    [user_mapping[raw_user_id] for raw_user_id in ratings_sample['userId']]
)
item_ids = torch.LongTensor(
    [item_mapping[raw_movie_id] for raw_movie_id in ratings_sample['movieId']]
)

# One column per rating: row 0 holds the user index, row 1 the item index.
edge_index = torch.stack([user_ids, item_ids], dim=0)

In [None]:
# Hold out 25% of the edges, chosen by position, for validation. The split is
# seeded with SEED.
train_index, val_index = train_test_split(
    range(len(ratings_sample)),
    test_size=0.25,
    random_state=SEED,
)

# Pick the matching columns of the edge tensor for each part.
train_edge_index, val_edge_index = edge_index[:, train_index], edge_index[:, val_index]

In [None]:
# Attach each movie's contiguous item index ('ID') and order the rows by it.
# The feature matrix is built from these rows in order, so row i must describe
# the item whose index in item_mapping is i. Sorting by 'ID' guarantees that;
# sorting by the raw 'movieId' only gives the same order when the movies file
# happens to be sorted by movieId.
# assign()/sort_values() return new frames, which avoids writing into a
# filtered slice in place.
movies_processed = (
    movies_processed
    .assign(ID=movies_processed['movieId'].apply(lambda i: item_mapping[i]))
    .sort_values(by='ID')
)

In [None]:
# Item feature matrix: every column except the two id columns (i.e. the scaled
# year and the genre counts), with rows in the current order of movies_processed.
items_features = torch.Tensor(movies_processed.drop(columns=['movieId', 'ID']).values)

In [None]:
# No user features are built in this notebook, so every user gets a single
# constant zero feature. There is one row per distinct user (the size of
# user_mapping), not one per rating: user indices in the edge tensor range
# over len(user_mapping), and extra rows would be user nodes with no edges.
users_features = torch.Tensor(np.zeros((len(user_mapping), 1)))

In [None]:
# Bundle everything that is written to disk: node features for both sides and
# the train/validation edge tensors.
movies_dataset = dict(
    users_features=users_features,
    items_features=items_features,
    train_edge_index=train_edge_index,
    val_edge_index=val_edge_index,
)

In [None]:
# Persist the dataset dictionary with pickle.
output_path = 'datasets/movies_dataset.bin'

# Create the output directory on a fresh checkout, so the notebook does not
# fail with FileNotFoundError in its very last cell.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

with open(output_path, 'wb') as f:
    pickle.dump(movies_dataset, f)