## Library

In [1]:
import os
import re

from pyspark.sql.session import SparkSession
from pyspark.sql.functions import regexp_replace, col, when, split, expr, lower, sum, min, max, concat_ws

In [2]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.sql.functions import udf, lit
from pyspark.sql.types import DoubleType
from pyspark.ml.linalg import DenseVector
 

In [3]:
# Build the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("IMDB ML Project")
    .config("spark.pyspark.python", "python")
    .getOrCreate()
)

## Data Processing

#### Get Data From Local

In [4]:
# Root folder; every sub-folder is scanned for "merged_movies_data_*.csv" files.
data_dir = "../IMBD_ML/Data"

# Collect the CSV paths first. os.listdir() returns entries in arbitrary
# order, so sort them to make the load order reproducible between runs.
csv_paths = []
for folder in sorted(os.listdir(data_dir)):
    folder_path = os.path.join(data_dir, folder)
    if not os.path.isdir(folder_path):
        continue
    for file in sorted(os.listdir(folder_path)):
        if file.startswith("merged_movies_data_") and file.endswith(".csv"):
            csv_paths.append(os.path.join(folder_path, file))

# Fail here with a clear message instead of leaving merged_data as None and
# crashing in a later cell with an unrelated AttributeError.
if not csv_paths:
    raise FileNotFoundError(
        f"No 'merged_movies_data_*.csv' files found under {data_dir!r}"
    )

merged_data = None
for file_path in csv_paths:
    # Read CSV
    data = spark.read.csv(file_path, header=True, inferSchema=True)

    # Match columns by header name, not by position, so a file whose columns
    # are ordered differently cannot silently shift values into the wrong
    # column. A file with a different set of column names raises instead.
    merged_data = data if merged_data is None else merged_data.unionByName(data)
        

In [5]:
# Print the column names and inferred types of the merged raw data.
merged_data.printSchema()

root
 |-- Title: string (nullable = true)
 |-- Movie Link: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Duration: string (nullable = true)
 |-- MPA: string (nullable = true)
 |-- Rating: double (nullable = true)
 |-- Votes: string (nullable = true)
 |-- budget: double (nullable = true)
 |-- grossWorldWide: double (nullable = true)
 |-- gross_US_Canada: double (nullable = true)
 |-- opening_weekend_Gross: double (nullable = true)
 |-- directors: string (nullable = true)
 |-- writers: string (nullable = true)
 |-- stars: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- countries_origin: string (nullable = true)
 |-- filming_locations: string (nullable = true)
 |-- production_companies: string (nullable = true)
 |-- Languages: string (nullable = true)
 |-- wins: string (nullable = true)
 |-- nominations: string (nullable = true)
 |-- oscars: string (nullable = true)
 |-- release_date: string (nullable = true)



#### Cleaning Data

In [6]:
# Clean names
# One projection does both jobs: lower-case every column's values and
# normalise every column name (lower-case, trimmed, spaces -> underscores).
# NOTE: lower() is applied to all columns, including the numeric ones.
merged_data = merged_data.select(
    *[
        lower(col(raw_name)).alias(raw_name.lower().strip().replace(" ", "_"))
        for raw_name in merged_data.columns
    ]
)

In [7]:
# Extract ID , drop movie link
# Cut the trailing "/?ref_=..." part off the link, then use the segment at
# index 4 of the "/"-split link as the movie id (null when there is no "/").
link_without_ref = regexp_replace(col("movie_link"), r"/\?ref_=.*$", "")
movie_id = when(
    link_without_ref.contains("/"),
    split(link_without_ref, "/").getItem(4),
).otherwise(None)

merged_data = merged_data.withColumn("id", movie_id).drop("movie_link")

In [8]:
# Change Duration to numeric: the result is the total number of SECONDS.
# The parsing assumes raw values shaped like "2h 30m", "2h" or "45m"
# (TODO confirm against the CSV files).
duration_raw = col("duration")
duration_digits = regexp_replace(regexp_replace(duration_raw, "h", ""), "m", "")
duration_parts = split(duration_digits, " ")

merged_data = merged_data.withColumn(
    "duration",
    when(
        duration_digits.contains(" "),
        duration_parts.getItem(0).cast("float") * expr("3600")     # hours   -> seconds
        + duration_parts.getItem(1).cast("float") * expr("60"),    # minutes -> seconds
    )
    # Hours-only values ("2h") contain no space once the letters are removed.
    # They used to fall through to the minutes branch and came out 60x too
    # small, so they are recognised on the raw value instead.
    .when(duration_raw.contains("h"), duration_digits.cast("float") * expr("3600"))
    # Minutes-only values (or a bare number) are interpreted as minutes.
    .otherwise(duration_digits.cast("float") * expr("60"))
)


In [9]:
# Change to numeric
# Values may carry a magnitude suffix ("27k", "1.2m", "3b"); expand it and
# store the result as float.
col_names = ['rating', 'budget', 'votes', 'grossworldwide', 'gross_us_canada', 'opening_weekend_gross']

for col_name in col_names:
    raw_value = col(col_name)
    # Do NOT cast to float before inspecting the suffix: casting "27k" yields
    # null, which previously wiped out every suffixed value and made the
    # k/m/b branches below unreachable.
    merged_data = merged_data.withColumn(
        col_name,
        when(raw_value.contains('k'), regexp_replace(raw_value, 'k', '').cast("float") * 1_000)
        .when(raw_value.contains('m'), regexp_replace(raw_value, 'm', '').cast("float") * 1_000_000)
        .when(raw_value.contains('b'), regexp_replace(raw_value, 'b', '').cast("float") * 1_000_000_000)
        .otherwise(raw_value.cast("float"))
    )

In [10]:
# Remove [ ] ' "
# These columns hold list-like text; strip the bracket and quote characters
# so only the comma-separated entries remain.
col_names = ['directors','writers','stars','genres','countries_origin','filming_locations','production_companies','languages','wins']
bracket_quote_pattern = r"[\[\]\'\"]"

for col_name in col_names:
    stripped_value = regexp_replace(col(col_name), bracket_quote_pattern, "")
    merged_data = merged_data.withColumn(col_name, stripped_value)

In [11]:
# Remove the leading "<digits>." numbering (plus any following spaces) from the title.
leading_number_pattern = r"^\d+\.\s*"
title_without_number = regexp_replace(col("title"), leading_number_pattern, "")
merged_data = merged_data.withColumn("title", title_without_number)

In [12]:
# Keep one row per movie id, then remove the release_date column.
merged_data = (
    merged_data
    .dropDuplicates(['id'])
    .drop(col('release_date'))
)

#### Recommendation model

In [13]:
col_for_rec = ['id','title','directors','stars','genres','languages']
rec_data = merged_data.select(*col_for_rec)

# Build one text "document" per movie from its directors, stars, genres and
# languages; this is the text that gets tokenised for TF-IDF.
combined_text = concat_ws(
    " ",
    col("directors"), col("stars"), col("genres"), col("languages"),
)

# Turn the list separators into a single space. Deleting them (the previous
# behaviour) glued neighbouring entries together, e.g. "action, drama" became
# the single token "actiondrama", so movies sharing a genre or an actor no
# longer shared any token.
combined_text = regexp_replace(combined_text, r",\s*", " ")
# Drop any remaining single quotes.
combined_text = regexp_replace(combined_text, "'", "")

rec_data = rec_data.withColumn('combind', combined_text)

In [14]:
# Feature stages: tokenise the combined text, hash the tokens into
# term-frequency vectors, then rescale them with IDF.
token = Tokenizer(inputCol='combind', outputCol='words')
hashingTF = HashingTF(inputCol="words", outputCol="rawfeatures", numFeatures=50000)
idf = IDF(inputCol='rawfeatures', outputCol='features')

# Tokenizer
combind_data = token.transform(rec_data)

# TF-IDF: the IDF weights are fitted on the term-frequency vectors.
tf = hashingTF.transform(combind_data)
tf_idf = idf.fit(tf)

# Final data: keep only what the recommender needs.
tf_idf_data = (
    tf_idf
    .transform(tf)
    .select('id', 'title', 'features')
)

In [15]:
def Get_ID_Movie(movie_name, data):
    """Find the movies whose title contains ``movie_name``.

    The comparison ignores case, so a query such as "End Game" also matches
    the lower-cased titles produced by the cleaning step.

    Args:
        movie_name: Text to look for inside the ``title`` column.
        data: Spark DataFrame with at least ``id`` and ``title`` columns.

    Returns:
        A DataFrame with the columns ``id`` and ``title``. It contains zero
        rows when nothing matches; callers should check the row count.
    """
    # A Spark DataFrame is always truthy, so the former
    # ``res if res else None`` could never return None. Return the DataFrame
    # unconditionally so the contract is explicit.
    search_term = movie_name.lower()
    return (
        data
        .filter(lower(col('title')).contains(search_term))
        .select('id', 'title')
    )

In [16]:
def Recommend_Movie_Based_ID(id_movie, data, top_rec=5):
    """Recommend the movies most similar to the movie with id ``id_movie``.

    Similarity is the cosine similarity between the ``features`` vectors.

    Args:
        id_movie: Value of the ``id`` column identifying the query movie.
        data: Spark DataFrame with ``id``, ``title`` and ``features`` columns,
            where ``features`` holds ``pyspark.ml.linalg`` vectors.
        top_rec: Maximum number of recommendations to return.

    Returns:
        A DataFrame with the columns ``id`` and ``title`` holding at most
        ``top_rec`` rows ordered by decreasing similarity, or ``None`` (after
        printing a message) when ``id_movie`` is not present in ``data``.
        The query movie itself is never part of the result.
    """
    # Fetch the feature vector of the query movie.
    movie_row = data.filter(col("id") == id_movie).select("features").first()

    # Bail out when the movie is not in the dataset.
    if movie_row is None:
        print(f"Can't find: '{id_movie}' in dataset!")
        return None

    movie_vector = movie_row["features"]
    # The query norm is the same for every row, so compute it once here.
    movie_norm = float(movie_vector.norm(2))

    def cosine_similarity(vec):
        # ML vectors provide dot() and norm() directly, so the (possibly
        # sparse) vectors are not converted to dense arrays for every row.
        denominator = movie_norm * float(vec.norm(2))
        if denominator == 0:
            return 0.0
        return float(vec.dot(movie_vector)) / denominator

    cosine_sim_udf = udf(cosine_similarity, DoubleType())

    # Exclude the query movie: it always scores 1.0 against itself and would
    # otherwise take the first recommendation slot. eqNullSafe keeps rows
    # whose id is null instead of silently filtering them out.
    candidates = data.filter(~col("id").eqNullSafe(id_movie))

    return (
        candidates
        .withColumn("similarity_score", cosine_sim_udf(col("features")))
        .orderBy(col("similarity_score").desc())
        .select("id", "title")
        .limit(top_rec)
    )

In [17]:
def Get_Recommend_Movie(name, data):
    """Recommend movies for a title, provided the title is unambiguous.

    Args:
        name: Title text handed to ``Get_ID_Movie`` for the lookup.
        data: DataFrame passed on to ``Get_ID_Movie`` and
            ``Recommend_Movie_Based_ID``.

    Returns:
        The result of ``Recommend_Movie_Based_ID`` when exactly one movie
        matches ``name``; ``None`` when zero or several movies match.
    """
    matched_rows = Get_ID_Movie(name, data).select('id').collect()

    # Anything other than a single match is treated as "no answer".
    if len(matched_rows) != 1:
        return None

    (only_match,) = matched_rows
    return Recommend_Movie_Based_ID(only_match['id'], data)

In [None]:
# Demo: look up one title and show its recommendations.
movie_title = 'end game'
res = Get_Recommend_Movie(movie_title,tf_idf_data)

# Get_Recommend_Movie returns None when the title matches zero or several movies.
if res is None:
    print(f'I cant find this {movie_title} in database. Please provide more detailed information about the name.')
else:
    print(f'I found multiple movies. Here is a recommendation based on: {movie_title}')
    res.show()

I found multiple movies. Here is a recommendation based on: end game
