In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, split, explode, lit
from pyspark.ml.feature import HashingTF, IDF, StringIndexer
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

# 1. Initialize the Spark session
spark = (
    SparkSession.builder
    .appName("YelpHybridRecommendationTFIDF")
    .getOrCreate()
)

# 2. Load the datasets (only the business and review files are read here)
#    and keep just the columns used further down.
businesses = spark.read.json("yelp_academic_dataset_business.json").select(
    "business_id", "categories", "stars", "review_count"
)
reviews = spark.read.json("yelp_academic_dataset_review.json").select(
    "user_id", "business_id", "stars"
)

# 3. Content-based filtering with TF-IDF
# Drop businesses that have no categories: split() returns NULL for a NULL
# input, and HashingTF cannot transform a NULL token array.
businesses = businesses.dropna(subset=["categories"])

# Split the comma-separated categories string into a list of tokens
businesses = businesses.withColumn("categories", split(col("categories"), ", "))

# HashingTF turns the category list into a fixed-size term-frequency vector
hashing_tf = HashingTF(inputCol="categories", outputCol="tf_features", numFeatures=1000)
businesses = hashing_tf.transform(businesses)

# Fit IDF on the term-frequency vectors and rescale them into TF-IDF weights
idf = IDF(inputCol="tf_features", outputCol="tfidf_features")
idf_model = idf.fit(businesses)
businesses = idf_model.transform(businesses)

# 4. Collaborative filtering (CF) with ALS
# Cast the review stars to float; this column is used as the ALS rating.
reviews = reviews.withColumn("stars", col("stars").cast("float"))

# Map the string user_id / business_id values to numeric indices.
# Each indexer is fitted on, and then applied to, the review data itself.
user_indexer = StringIndexer(inputCol="user_id", outputCol="user_index")
reviews = user_indexer.fit(reviews).transform(reviews)

business_indexer = StringIndexer(inputCol="business_id", outputCol="business_index")
reviews = business_indexer.fit(reviews).transform(reviews)

# Configure the ALS model: explicit ratings, non-negative factors, and
# rows with no prediction dropped (coldStartStrategy="drop").
als = ALS(
    ratingCol="stars",
    userCol="user_index",
    itemCol="business_index",
    implicitPrefs=False,
    nonnegative=True,
    coldStartStrategy="drop",
)

# Train the ALS model on all indexed reviews
als_model = als.fit(reviews)

# 5. Hybrid filtering
# Recommendation function based on collaborative filtering
def get_cf_recommendations(user_id, top_n=10):
    """Return the top ALS recommendations for one user.

    The ALS model was trained on the numeric ``user_index`` /
    ``business_index`` columns, so the raw ``user_id`` is first mapped to its
    index through ``reviews`` and the recommended item indices are mapped back
    to ``business_id`` the same way.

    Args:
        user_id: Raw user id string as it appears in ``reviews.user_id``.
        top_n: Maximum number of businesses to recommend.

    Returns:
        DataFrame with columns ``user_id``, ``business_id`` and ``rating``
        (the ALS predicted rating), sorted by ``rating`` descending. It is
        empty when ``user_id`` does not occur in ``reviews``.
    """
    # Indices are cast to int so the join keys have the same type as the
    # id columns in the ALS recommendation output.
    user_lookup = (
        reviews.filter(col("user_id") == user_id)
        .select("user_id", col("user_index").cast("int").alias("user_index"))
        .distinct()
    )
    business_lookup = reviews.select(
        "business_id", col("business_index").cast("int").alias("business_index")
    ).distinct()

    # recommendForUserSubset requires a column named like the model's userCol.
    raw_recs = als_model.recommendForUserSubset(user_lookup.select("user_index"), top_n)

    # One row per recommended item; struct fields are named after itemCol
    # ("business_index") and "rating".
    exploded = raw_recs.select("user_index", explode("recommendations").alias("rec"))
    flattened = exploded.select(
        "user_index",
        col("rec.business_index").alias("business_index"),
        col("rec.rating").alias("rating"),
    )

    return (
        flattened.join(user_lookup, on="user_index")
        .join(business_lookup, on="business_index")
        .select("user_id", "business_id", "rating")
        .orderBy(col("rating").desc())
    )

# Recommendation function based on content-based filtering with TF-IDF
def get_cbf_recommendations(business_id, top_n=10):
    """Return the businesses whose TF-IDF category vectors best match a target.

    Similarity is the dot product between the target business's
    ``tfidf_features`` vector and every other row's vector. The target
    business itself is not excluded from the result.

    Args:
        business_id: Raw business id string as it appears in
            ``businesses.business_id``.
        top_n: Maximum number of rows to return.

    Returns:
        The ``businesses`` DataFrame plus a double ``similarity`` column,
        sorted by ``similarity`` descending and limited to ``top_n`` rows.
        It is empty (same schema) when ``business_id`` is not found.
    """
    # Local imports: these names are not part of the file-level import block.
    from pyspark.sql.functions import udf
    from pyspark.sql.types import DoubleType

    target_rows = (
        businesses.filter(col("business_id") == business_id)
        .select("tfidf_features")
        .take(1)
    )
    if not target_rows:
        # Unknown business: return an empty frame with the expected columns
        # instead of failing with IndexError.
        return businesses.withColumn("similarity", lit(0.0)).limit(0)

    target_tfidf = target_rows[0][0]

    # A vector's .dot() is a driver-side Python method and cannot take a
    # Column, so the dot product is evaluated per row inside a UDF.
    def _dot_with_target(features):
        if features is None:
            return None
        return float(target_tfidf.dot(features))

    dot_with_target = udf(_dot_with_target, DoubleType())

    scored = businesses.withColumn("similarity", dot_with_target(col("tfidf_features")))
    return scored.orderBy(col("similarity").desc()).limit(top_n)

# Recommendation function based on hybrid filtering
def get_hybrid_recommendations(user_id, business_id, alpha=0.5, top_n=10):
    """Blend collaborative and content-based recommendations into one ranking.

    Collaborative candidates contribute ``rating * alpha``; content-based
    candidates contribute ``stars * (1 - alpha)``, where ``stars`` is the
    business's own star value from ``businesses``. Scores of a business that
    appears in both lists are summed.

    Args:
        user_id: User to fetch collaborative-filtering recommendations for.
        business_id: Business to fetch content-similar businesses for.
        alpha: Weight of the collaborative score; the content-based score is
            weighted by ``1 - alpha``.
        top_n: Number of candidates requested from each recommender and the
            maximum number of rows returned.

    Returns:
        DataFrame with columns ``business_id`` and ``sum(score)``, sorted by
        ``sum(score)`` descending and limited to ``top_n`` rows.
    """
    cf_recs = get_cf_recommendations(user_id, top_n)
    cbf_recs = get_cbf_recommendations(business_id, top_n)

    # Weight each source and project both to the same two columns:
    # union() matches columns by position, so the schemas must line up.
    cf_scores = cf_recs.select("business_id", (col("rating") * alpha).alias("score"))
    cbf_scores = cbf_recs.select(
        "business_id", (col("stars") * (1 - alpha)).alias("score")
    )

    # Merge the two candidate lists and add up the scores per business
    hybrid_recs = cf_scores.union(cbf_scores).groupBy("business_id").agg({"score": "sum"})
    hybrid_recs = hybrid_recs.orderBy(col("sum(score)").desc()).limit(top_n)

    return hybrid_recs

# 6. Model evaluation
# RMSE of the collaborative-filtering model.
# NOTE(review): predictions are scored on the same `reviews` frame the model
# was fitted on, so this is a training error, not a held-out estimate.
evaluator = RegressionEvaluator(
    labelCol="stars",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(als_model.transform(reviews))
print(f"RMSE c·ªßa m√¥ h√¨nh CF: {rmse:.4f}")

# Trial run of the hybrid TF-IDF recommender with sample ids
user_sample = "user_id_123"
business_sample = "business_id_456"
recommendations = get_hybrid_recommendations(
    user_sample,
    business_sample,
    top_n=10,
    alpha=0.6,
)
recommendations.show()
