In [54]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, rank, countDistinct, count
from pyspark.sql.window import Window
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder, CrossValidator
import pandas as pd

In [55]:
spark = SparkSession.builder.appName('tourism-destination').getOrCreate()

In [56]:
df_landing = pd.read_csv('dataset/tourism_rating.csv')
# 将缺失值替换为平均值
spark_df_landing =  spark.createDataFrame(df_landing) 
spark_df_landing.show(vertical=True)

-RECORD 0------------
 User_Id       | 1   
 Place_Id      | 179 
 Place_Ratings | 3   
-RECORD 1------------
 User_Id       | 1   
 Place_Id      | 344 
 Place_Ratings | 2   
-RECORD 2------------
 User_Id       | 1   
 Place_Id      | 5   
 Place_Ratings | 5   
-RECORD 3------------
 User_Id       | 1   
 Place_Id      | 373 
 Place_Ratings | 3   
-RECORD 4------------
 User_Id       | 1   
 Place_Id      | 101 
 Place_Ratings | 4   
-RECORD 5------------
 User_Id       | 1   
 Place_Id      | 312 
 Place_Ratings | 2   
-RECORD 6------------
 User_Id       | 1   
 Place_Id      | 258 
 Place_Ratings | 5   
-RECORD 7------------
 User_Id       | 1   
 Place_Id      | 20  
 Place_Ratings | 4   
-RECORD 8------------
 User_Id       | 1   
 Place_Id      | 154 
 Place_Ratings | 2   
-RECORD 9------------
 User_Id       | 1   
 Place_Id      | 393 
 Place_Ratings | 5   
-RECORD 10-----------
 User_Id       | 1   
 Place_Id      | 103 
 Place_Ratings | 3   
-RECORD 11-----------
 User_Id  

In [58]:
df_rec = spark_df_landing.select('User_Id', 'Place_Id', 'Place_Ratings').withColumnRenamed("User_Id","userId")\
                                                                 .withColumnRenamed("Place_Id","itemId")\
                                                                 .withColumnRenamed("Place_Ratings","rating")
df_rec = df_rec.orderBy("userId", "itemId")

In [51]:
popularity_df = df_rec.groupBy('itemId') \
                 .agg(count('*').alias('popularity')) \
                 .orderBy(col('popularity').desc())

In [52]:
user_window = Window.partitionBy("userId").orderBy(col("itemId").desc())
df_rec = df_rec.withColumn("num_items", expr("count(*) over (partition by userId)"))


In [53]:
# For example, 30% of items will be masked
percent_items_to_mask = 0.3
# Determine the number of items to mask for each user
df_rec_final = df_rec.withColumn("num_items_to_mask", (col("num_items") * percent_items_to_mask).cast("int"))
# Masks items for each user
df_rec_final = df_rec_final.withColumn("item_rank", rank().over(user_window))

# Create a StringIndexer model to index the user ID column
indexer_user = StringIndexer(inputCol='userId', outputCol='userIndex').setHandleInvalid("keep")
indexer_item = StringIndexer(inputCol='itemId', outputCol='itemIndex').setHandleInvalid("keep")

# Fit the indexer model to the data and transform the DataFrame
df_rec_final = indexer_user.fit(df_rec_final).transform(df_rec_final)
df_rec_final = indexer_item.fit(df_rec_final).transform(df_rec_final)

# Convert the userIndex column to integer type
df_rec_final = df_rec_final.withColumn('userIndex', df_rec_final['userIndex'].cast('integer'))\
               .withColumn('itemIndex', df_rec_final['itemIndex'].cast('integer'))

train_df_rec = df_rec_final.filter(col("item_rank") > col("num_items_to_mask"))
test_df_rec = df_rec_final.filter(col("item_rank") <= col("num_items_to_mask"))

In [33]:
train_df_rec.show()

+------+------+------+---------+-----------------+---------+---------+---------+
|userId|itemId|rating|num_items|num_items_to_mask|item_rank|userIndex|itemIndex|
+------+------+------+---------+-----------------+---------+---------+---------+
|     1|   307|     4|       30|                9|       10|      199|       82|
|     1|   302|     2|       30|                9|       11|      199|      137|
|     1|   292|     3|       30|                9|       12|      199|      314|
|     1|   265|     5|       30|                9|       13|      199|       21|
|     1|   258|     5|       30|                9|       14|      199|      159|
|     1|   246|     4|       30|                9|       15|      199|       66|
|     1|   222|     3|       30|                9|       16|      199|      336|
|     1|   208|     5|       30|                9|       17|      199|        4|
|     1|   179|     3|       30|                9|       18|      199|      154|
|     1|   154|     2|      

In [34]:
# Configure the ALS model
als = ALS(userCol='userIndex', itemCol='itemIndex', ratingCol='rating',
          coldStartStrategy='drop', nonnegative=True)


param_grid = ParamGridBuilder()\
             .addGrid(als.rank, [1, 20, 30])\
             .addGrid(als.maxIter, [20])\
             .addGrid(als.regParam, [.05, .15])\
             .build()
evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating', predictionCol='prediction')

cv = CrossValidator(
        estimator=als,
        estimatorParamMaps=param_grid,
        evaluator=evaluator,
        numFolds=3)

model = cv.fit(train_df_rec)

best_model = model.bestModel
print('rank: ', best_model.rank)
print('MaxIter: ', best_model._java_obj.parent().getMaxIter())
print('RegParam: ', best_model._java_obj.parent().getRegParam())

                                                                                

rank:  1
MaxIter:  20
RegParam:  0.15


In [35]:
model = als.fit(train_df_rec)

# Generate predictions on the test data
predictions = best_model.transform(test_df_rec)
predictions = predictions.withColumn("prediction", expr("CASE WHEN prediction < 1 THEN 1 WHEN prediction > 5 THEN 5 ELSE prediction END"))

evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating', predictionCol='prediction')
rmse = evaluator.evaluate(predictions)
print(f'Root Mean Squared Error (RMSE): {rmse}')

Root Mean Squared Error (RMSE): 1.6368707225124695


In [36]:
from pyspark.mllib.evaluation import RankingMetrics
from pyspark.sql.functions import col, collect_list

# Convert the predictions DataFrame to include all predictions per user
# Generate top-k recommendations for each user
userRecs = best_model.recommendForAllUsers(100)  # Top-100 recommendations for each user

# Prepare the input for RankingMetrics
user_ground_truth = test_df_rec.groupby('userIndex').agg(collect_list('itemIndex').alias('ground_truth_items'))
user_train_items = train_df_rec.groupby('userIndex').agg(collect_list('itemIndex').alias('train_items'))

# Join the recommendations and ground truth data on the user ID
user_eval = userRecs.join(user_ground_truth, on='userIndex').join(user_train_items, on='userIndex') \
    .select('userIndex', 'recommendations.itemIndex', 'ground_truth_items', 'train_items', 'recommendations.rating')
user_eval = user_eval.toPandas()
user_eval['itemIndex_filtered'] = user_eval.apply(lambda x:[b for (b,z) in zip(x.itemIndex, x.rating) if b not in x.train_items], axis=1)
user_eval['rating_filtered'] = user_eval.apply(lambda x:[z for (b,z) in zip(x.itemIndex, x.rating) if b not in x.train_items], axis=1)

                                                                                

In [37]:
user_eval

Unnamed: 0,userIndex,itemIndex,ground_truth_items,train_items,rating,itemIndex_filtered,rating_filtered
0,0,"[70, 380, 321, 378, 377, 203, 379, 282, 280, 2...","[91, 29, 397, 209, 395, 433, 429, 284, 283, 27...","[165, 164, 102, 426, 162, 21, 339, 67, 159, 13...","[5.2651777267456055, 5.208676815032959, 5.1148...","[70, 380, 321, 378, 377, 203, 379, 282, 280, 2...","[5.2651777267456055, 5.208676815032959, 5.1148..."
1,1,"[70, 380, 321, 378, 377, 203, 379, 282, 280, 2...","[286, 53, 411, 24, 348, 420, 393, 168, 104, 16...","[366, 267, 196, 195, 265, 265, 190, 36, 36, 0,...","[3.907944917678833, 3.8660085201263428, 3.7963...","[70, 380, 321, 378, 377, 203, 379, 282, 280, 2...","[3.907944917678833, 3.8660085201263428, 3.7963..."
2,2,"[70, 380, 321, 378, 377, 203, 379, 282, 280, 2...","[243, 115, 91, 140, 171, 170, 204, 43, 281, 13...","[51, 198, 371, 427, 164, 102, 99, 195, 194, 65...","[5.067782402038574, 5.013399600982666, 4.92312...","[70, 380, 321, 378, 377, 203, 379, 282, 280, 2...","[5.067782402038574, 5.013399600982666, 4.92312..."
3,3,"[70, 380, 321, 378, 377, 203, 379, 282, 280, 2...","[142, 289, 12, 286, 88, 394, 374, 168, 200, 1,...","[337, 130, 311, 363, 64, 64, 230, 35, 261, 261...","[5.758134365081787, 5.696343421936035, 5.59377...","[70, 380, 321, 378, 377, 203, 379, 282, 280, 2...","[5.758134365081787, 5.696343421936035, 5.59377..."
4,4,"[70, 380, 321, 378, 377, 203, 379, 282, 280, 2...","[244, 142, 292, 54, 324, 112, 53, 28, 171, 283...","[374, 13, 167, 372, 9, 197, 162, 418, 6, 192, ...","[4.4151129722595215, 4.367733955383301, 4.2890...","[70, 380, 321, 378, 377, 203, 379, 282, 280, 2...","[4.4151129722595215, 4.367733955383301, 4.2890..."
...,...,...,...,...,...,...,...
295,295,"[70, 380, 321, 378, 377, 203, 379, 282, 280, 2...","[2, 74, 349, 140, 169, 137]","[390, 134, 158, 7, 425, 365, 97, 263, 62, 34, ...","[3.515010118484497, 3.477290153503418, 3.41467...","[70, 380, 321, 378, 377, 203, 379, 282, 280, 2...","[3.515010118484497, 3.477290153503418, 3.41467..."
296,296,"[70, 380, 321, 378, 377, 203, 379, 282, 280, 2...","[286, 413, 380, 53, 420, 394]","[200, 50, 365, 98, 263, 156, 127, 154, 79, 385...","[5.280454635620117, 5.223789691925049, 5.12972...","[70, 380, 321, 378, 377, 203, 379, 282, 280, 2...","[5.280454635620117, 5.223789691925049, 5.12972..."
297,297,"[70, 380, 321, 378, 377, 203, 379, 282, 280, 2...","[74, 73, 421, 325, 70, 420]","[23, 103, 99, 20, 423, 0, 330, 303, 77, 149, 1...","[5.119409084320068, 5.064472198486328, 4.97327...","[70, 380, 321, 378, 377, 203, 379, 282, 280, 2...","[5.119409084320068, 5.064472198486328, 4.97327..."
298,298,"[70, 380, 321, 378, 377, 203, 379, 282, 280, 2...","[243, 91, 54, 412, 203, 84]","[372, 166, 426, 161, 99, 266, 192, 97, 261, 30...","[4.828000068664551, 4.776190280914307, 4.69018...","[70, 380, 321, 378, 377, 203, 379, 282, 280, 2...","[4.828000068664551, 4.776190280914307, 4.69018..."


In [59]:
from pyspark.ml import Pipeline

# using pipeline 
# Create StringIndexer stages
indexer_user = StringIndexer(inputCol='userId', outputCol='userIndex').setHandleInvalid("keep")
indexer_item = StringIndexer(inputCol='itemId', outputCol='itemIndex').setHandleInvalid("keep")
# Create ALS model
als = ALS(userCol='userIndex', itemCol='itemIndex', ratingCol='rating',
          coldStartStrategy='drop', nonnegative=True)
# Create stages for the pipeline
stages = [indexer_user, indexer_item, als]
# Create a pipeline with the defined stages
pipeline = Pipeline(stages=stages)

# Fit the pipeline to the training data
pipeline_model = pipeline.fit(df_rec)
pipeline_path = "models/als_pipeline"
pipeline_model.save(pipeline_path)

24/01/19 14:34:17 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers


In [60]:
# Make predictions on the test data
predictions = pipeline_model.transform(df_rec)
predictions = predictions.withColumn("prediction", expr("CASE WHEN prediction < 1 THEN 1 WHEN prediction > 5 THEN 5 ELSE prediction END"))

# Evaluate the model
evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating', predictionCol='prediction')
rmse = evaluator.evaluate(predictions)
print(f'Root Mean Squared Error (RMSE): {rmse}')

Root Mean Squared Error (RMSE): 0.7822967757116109


In [61]:
# 根据用户 id 推荐 10 个景点
user_id = 1
user_recs = pipeline_model.stages[-1].recommendForAllUsers(10).filter(col('userIndex') == user_id).collect()
user_recs = user_recs[0]['recommendations']
user_recs = [i.itemIndex for i in user_recs]
user_recs

                                                                                

[399, 299, 249, 416, 95, 194, 250, 334, 341, 98]

In [63]:
df = pd.read_csv('dataset/tours_truncated.csv')
df[['location_id', 'display_name']].to_csv('dataset/tours.json', index=False)

In [67]:
import csv

# 将 csv 文件读为 Python 字典
file_path = "dataset/tours_truncated.csv"
import pandas as pd


# 读取 CSV 文件并转换为字典
df = pd.read_csv(file_path)
data_dict = dict(zip(df["location_id"], df["display_name"]))

# 打印字典
print(data_dict)


{1: '黄花城长城', 2: '拉萨大昭寺', 3: '八角街', 4: '亚龙湾', 5: '古北水镇', 6: 'TUNTHEIN MR', 7: '中华恐龙园', 8: '广州大剧院', 9: '成都人民公园', 10: '拙政园', 11: '澳门旅游塔会展娱乐中心', 12: '西安钟楼', 13: '荐福寺小雁塔', 14: '夫子庙景区', 15: '香港动植物公园', 16: '三里屯太古里', 17: '客家土楼', 18: '滕王阁', 19: '华强北路商业区', 20: '人民广场', 21: '香港朗豪坊', 22: '三亚湾', 23: '栈桥', 24: '上海欢乐谷', 25: '中环海滨摩天轮', 26: '钟鼓楼', 27: '厦门菽庄花园', 28: '东川红土地', 29: '长隆水上乐园', 30: '国际金融中心一期', 31: '老龙头', 32: '中山路步行街', 33: '香港杜莎夫人蜡像馆', 34: '上海中华艺术宫', 35: '喜洲镇', 36: '月亮山', 37: '罗布林卡', 38: '长江', 39: '义乌国际商贸城', 40: '上海城隍庙', 41: '丹霞山地质公园', 42: '大连中山广场', 43: '安徽九华山', 44: '壶口瀑布', 45: '圣约翰座堂', 46: '维多利亚公园', 47: '厦门钢琴博物馆', 48: '山顶缆车', 49: '黄龙洞', 50: '北京大学', 51: '甘丹寺', 52: '凤凰虹桥', 53: '亚丁风景区', 54: '长白山天池', 55: '蜈支洲岛', 56: '昆明西山森林公园', 57: '圣索菲亚教堂', 58: '古文化街', 59: '中国茶叶博物馆', 60: '圣方济各圣堂', 61: '伏尔加庄园', 62: '六和塔', 63: '三亚大小洞天景区', 64: '车公庙', 65: '上环', 66: '植物园', 67: '银子岩景区', 68: '金光大道度假区', 69: '龙环葡韵住宅式博物馆', 70: '玫瑰圣母堂', 71: '厦门园林植物园', 72: '深圳世界之窗', 73: 'Happy Dragon Tour', 74: '议事亭前地', 75: '河坊街', 76: '五台山',

In [68]:
[data_dict[i] for i in user_recs]

['平遥古城',
 '北京欢乐谷',
 '南丫岛',
 '西湖',
 '香港海洋公园',
 '龟山公园',
 '朱家角古镇',
 '狮子林',
 '大足石刻旅游景区',
 '华清宫']