<h1>Import</h1>

In [1]:
from collections import Counter
import kaggle as kg
import zipfile as zf
import dask.dataframe as df
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np

from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.feature import Tokenizer, HashingTF, IDF
from pyspark.sql.window import Window
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import col, udf, row_number

import ast
import json

import sklearn 



<h2>Load and process data</h2>

In [27]:
# --- Movies metadata -------------------------------------------------------
# Every column is read as text. Only the columns used in this notebook are
# kept, and rows without a title or a genre list are discarded.
movies_df = (
    df.read_csv('data/movies_metadata.csv', delimiter=',', header=0, dtype=str)
    [['id', 'original_title', 'genres', 'release_date', 'vote_average', 'vote_count']]
    .dropna(subset=['original_title', 'genres'])
)
# errors='coerce' turns release dates that cannot be parsed into NaT instead
# of raising.
movies_df = movies_df.assign(
    release_date=df.to_datetime(movies_df['release_date'], errors='coerce')
)

# --- Ratings ---------------------------------------------------------------
# Read as text, then cast the two id columns to int and the rating to float.
# 'timestamp' is intentionally left as text.
ratings_df = (
    df.read_csv('data/ratings.csv', delimiter=',', header=0, dtype=str)
    [['userId', 'movieId', 'rating', 'timestamp']]
    .astype({'userId': int, 'movieId': int, 'rating': float})
)

# --- Keywords --------------------------------------------------------------
# Only the movie id and its raw keyword string are needed.
keywords_df = df.read_csv(
    'data/keywords.csv', delimiter=',', header=0, dtype=str
)[['id', 'keywords']]

In [26]:
# Preview the first three rows of the movies frame to check the selected columns.
movies_df.head(3)

Unnamed: 0,id,original_title,genres,release_date,vote_average,vote_count
0,862,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",1995-10-30,7.7,5415
1,8844,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",1995-12-15,6.9,2413
2,15602,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",1995-12-22,6.5,92


In [28]:
# Preview the first three rows of the ratings frame (userId, movieId, rating, timestamp).
ratings_df.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523


In [29]:
# Preview the first three rows of the keywords frame (id, keywords).
keywords_df.head(3)

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."


<h2>Pivot Table</h2>

In [32]:
from scipy.sparse import csr_matrix


# Dask's pivot_table needs the `columns` field to be a categorical whose
# categories are known, so cast movieId and resolve its category set up front.
ratings_df['movieId'] = ratings_df['movieId'].astype('category').cat.as_known()

# Build the userId x movieId matrix of mean ratings and evaluate it immediately.
# NOTE(review): .compute() materialises the full matrix in memory -- confirm it
# fits for the complete ratings file.
pivot_table = ratings_df.pivot_table(
    index='userId',
    columns='movieId',
    values='rating',
    aggfunc='mean',
).compute()

