In [6]:
from pyspark.sql import SparkSession

# Initialize Spark session
spark = SparkSession.builder \
    .appName("RecommendationSystem") \
    .getOrCreate()

file_path = "/home/lplab/Documents/220962067/movies 1.json"

# Load the JSON file into a DataFrame
df = spark.read.json(file_path)

# Show the DataFrame schema and some rows
df.printSchema()
df.show()



root
 |-- helpfulness: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- profile_name: string (nullable = true)
 |-- review: string (nullable = true)
 |-- score: double (nullable = true)
 |-- summary: string (nullable = true)
 |-- time: long (nullable = true)
 |-- user_id: string (nullable = true)

+-----------+----------+--------------------+--------------------+-----+--------------------+----------+--------------+
|helpfulness|product_id|        profile_name|              review|score|             summary|      time|       user_id|
+-----------+----------+--------------------+--------------------+-----+--------------------+----------+--------------+
|        7/7|B003AI2VGA|Brian E. Erland "...|Synopsis: On the ...|  3.0|"There Is So Much...|1182729600|A141HP4LYPWMSR|
|        4/4|B003AI2VGA|          Grady Harp|THE VIRGIN OF JUA...|  3.0|Worthwhile and Im...|1181952000|A328S9RN3U5M68|
|       8/10|B003AI2VGA|Chrissy K. McVay ...|The scenes in thi...|  5.0|This m

In [21]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.recommendation import ALS
from pyspark.ml.feature import StringIndexer
from pyspark.ml.evaluation import RegressionEvaluator

# Initialize Spark session
spark = SparkSession.builder \
    .appName("ALS Example") \
    .getOrCreate()

# Load data from JSON file
data_path = "/home/lplab/Documents/220962067/movies 1.json"
data = spark.read.json(data_path)

# Show initial schema and data
data.printSchema()
data.show(5)

# Convert columns to appropriate types if needed
data = data.withColumn("score", col("score").cast("double"))

# Filter out rows with null values in `user_id`
data = data.filter(col("user_id").isNotNull())

# Indexing user_id and product_id columns
user_indexer = StringIndexer(inputCol="user_id", outputCol="user_index")
product_indexer = StringIndexer(inputCol="product_id", outputCol="product_index")

data = user_indexer.fit(data).transform(data)
data = product_indexer.fit(data).transform(data)

# Show indexed data
data.select("user_id", "user_index", "product_id", "product_index", "score").show(5)

# Prepare training and test data
(training_data, test_data) = data.randomSplit([0.8, 0.2])

# Initialize ALS model
als = ALS(
    maxIter=10,
    regParam=0.01,
    userCol="user_index",
    itemCol="product_index",
    ratingCol="score",
    coldStartStrategy="drop"
)

# Train the model
model = als.fit(training_data)

# Make predictions on the test data
predictions = model.transform(test_data)

# Evaluate the model
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="score",
    predictionCol="prediction"
)

rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error = {rmse}")

# Show some predictions
predictions.show(5)

# Stop the Spark session
spark.stop()


root
 |-- helpfulness: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- profile_name: string (nullable = true)
 |-- review: string (nullable = true)
 |-- score: double (nullable = true)
 |-- summary: string (nullable = true)
 |-- time: long (nullable = true)
 |-- user_id: string (nullable = true)

+-----------+----------+--------------------+--------------------+-----+--------------------+----------+--------------+
|helpfulness|product_id|        profile_name|              review|score|             summary|      time|       user_id|
+-----------+----------+--------------------+--------------------+-----+--------------------+----------+--------------+
|        7/7|B003AI2VGA|Brian E. Erland "...|Synopsis: On the ...|  3.0|"There Is So Much...|1182729600|A141HP4LYPWMSR|
|        4/4|B003AI2VGA|          Grady Harp|THE VIRGIN OF JUA...|  3.0|Worthwhile and Im...|1181952000|A328S9RN3U5M68|
|       8/10|B003AI2VGA|Chrissy K. McVay ...|The scenes in thi...|  5.0|This m

24/08/20 15:14:24 WARN DAGScheduler: Broadcasting large task binary with size 1418.2 KiB
24/08/20 15:14:24 WARN DAGScheduler: Broadcasting large task binary with size 1449.0 KiB
24/08/20 15:14:24 WARN DAGScheduler: Broadcasting large task binary with size 1451.3 KiB


+--------------+----------+----------+-------------+-----+
|       user_id|user_index|product_id|product_index|score|
+--------------+----------+----------+-------------+-----+
|A141HP4LYPWMSR|      32.0|B003AI2VGA|        731.0|  3.0|
|A328S9RN3U5M68|       3.0|B003AI2VGA|        731.0|  3.0|
|A1I7QGUDP043DG|     312.0|B003AI2VGA|        731.0|  5.0|
|A1M5405JH9THP9|   10917.0|B003AI2VGA|        731.0|  3.0|
| ATXL536YX71TR|     173.0|B003AI2VGA|        731.0|  3.0|
+--------------+----------+----------+-------------+-----+
only showing top 5 rows



24/08/20 15:14:24 WARN DAGScheduler: Broadcasting large task binary with size 1452.8 KiB
24/08/20 15:14:25 WARN DAGScheduler: Broadcasting large task binary with size 1454.1 KiB
24/08/20 15:14:25 WARN DAGScheduler: Broadcasting large task binary with size 1453.1 KiB
24/08/20 15:14:25 WARN DAGScheduler: Broadcasting large task binary with size 1454.4 KiB
24/08/20 15:14:25 WARN DAGScheduler: Broadcasting large task binary with size 1455.1 KiB
24/08/20 15:14:25 WARN DAGScheduler: Broadcasting large task binary with size 1458.2 KiB
24/08/20 15:14:25 WARN DAGScheduler: Broadcasting large task binary with size 1459.6 KiB
24/08/20 15:14:25 WARN DAGScheduler: Broadcasting large task binary with size 1461.0 KiB
24/08/20 15:14:25 WARN DAGScheduler: Broadcasting large task binary with size 1462.3 KiB
24/08/20 15:14:25 WARN DAGScheduler: Broadcasting large task binary with size 1463.7 KiB
24/08/20 15:14:26 WARN DAGScheduler: Broadcasting large task binary with size 1465.1 KiB
24/08/20 15:14:26 WAR

Root-mean-square error = 4.071405688366229


24/08/20 15:14:28 WARN DAGScheduler: Broadcasting large task binary with size 1491.7 KiB


+-----------+----------+--------------------+--------------------+-----+--------------------+----------+--------------+----------+-------------+-----------+
|helpfulness|product_id|        profile_name|              review|score|             summary|      time|       user_id|user_index|product_index| prediction|
+-----------+----------+--------------------+--------------------+-----+--------------------+----------+--------------+----------+-------------+-----------+
|        0/1|B002OHDRF2|        P. Montrevil|Lots of action, b...|  3.0|Good but not like...|1263427200|A2PWAPST6JHJWY|    3749.0|         21.0|  2.9794176|
|       1/10|B002OHDRF2|     Joel P. Fleming|I won't add any s...|  2.0|Ending... (No Spo...|1248480000| A2Y88A36T4NTX|    3997.0|         21.0|  1.9862783|
|        7/7|B0001G6PZC|      David Anderson|"The Last Samarai...|  5.0|Tom Cruise Triump...|1080086400|A1YQ6QB2127AJ4|     471.0|          7.0|  3.2017674|
|        1/1|0790747324|Randy Scarborough...|This is one o

In [22]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

# Initialize SparkSession
spark = SparkSession.builder \
    .appName("ALS Collaborative Filtering") \
    .getOrCreate()

# Load the JSON file into a DataFrame
# Replace 'path/to/your/jsonfile.json' with the actual path to your JSON file
df = spark.read.json('/home/lplab/Documents/220962067/movies 1.json')

# Show the schema and some data
df.printSchema()
df.show(5, truncate=False)

# Prepare the data
# Select the relevant columns and rename them for ALS
data = df.select(col('user_id').alias('user'), 
                 col('product_id').alias('item'), 
                 col('score').alias('rating'))

# Cast user and item columns to numeric indices
from pyspark.ml.feature import StringIndexer

user_indexer = StringIndexer(inputCol='user', outputCol='userIndex')
item_indexer = StringIndexer(inputCol='item', outputCol='itemIndex')

data = user_indexer.fit(data).transform(data)
data = item_indexer.fit(data).transform(data)

# Show the data with indexed columns
data.show(5, truncate=False)

# Split the data into training and test sets
training_data, test_data = data.randomSplit([0.8, 0.2])

# Build the ALS model
als = ALS(
    maxIter=10,
    regParam=0.01,
    rank=10,
    userCol='userIndex',
    itemCol='itemIndex',
    ratingCol='rating',
    coldStartStrategy='drop'
)

model = als.fit(training_data)

# Make predictions on the test data
predictions = model.transform(test_data)

# Evaluate the model using RMSE
evaluator = RegressionEvaluator(
    metricName='rmse',
    labelCol='rating',
    predictionCol='prediction'
)

rmse = evaluator.evaluate(predictions)
print(f'Root-Mean-Square Error (RMSE): {rmse}')

# Generate some example recommendations
user_recs = model.recommendForAllUsers(5)
user_recs.show(5, truncate=False)

item_recs = model.recommendForAllItems(5)
item_recs.show(5, truncate=False)

# Stop the SparkSession
spark.stop()

root
 |-- helpfulness: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- profile_name: string (nullable = true)
 |-- review: string (nullable = true)
 |-- score: double (nullable = true)
 |-- summary: string (nullable = true)
 |-- time: long (nullable = true)
 |-- user_id: string (nullable = true)

+-----------+----------+------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

24/08/20 15:23:57 WARN DAGScheduler: Broadcasting large task binary with size 1417.9 KiB
24/08/20 15:23:57 WARN DAGScheduler: Broadcasting large task binary with size 1445.9 KiB
24/08/20 15:23:57 WARN DAGScheduler: Broadcasting large task binary with size 1448.2 KiB


+--------------+----------+------+---------+---------+
|user          |item      |rating|userIndex|itemIndex|
+--------------+----------+------+---------+---------+
|A141HP4LYPWMSR|B003AI2VGA|3.0   |32.0     |731.0    |
|A328S9RN3U5M68|B003AI2VGA|3.0   |3.0      |731.0    |
|A1I7QGUDP043DG|B003AI2VGA|5.0   |312.0    |731.0    |
|A1M5405JH9THP9|B003AI2VGA|3.0   |10917.0  |731.0    |
|ATXL536YX71TR |B003AI2VGA|3.0   |173.0    |731.0    |
+--------------+----------+------+---------+---------+
only showing top 5 rows



24/08/20 15:23:57 WARN DAGScheduler: Broadcasting large task binary with size 1449.8 KiB
24/08/20 15:23:58 WARN DAGScheduler: Broadcasting large task binary with size 1451.0 KiB
24/08/20 15:23:58 WARN DAGScheduler: Broadcasting large task binary with size 1450.0 KiB
24/08/20 15:23:58 WARN DAGScheduler: Broadcasting large task binary with size 1451.3 KiB
24/08/20 15:23:58 WARN DAGScheduler: Broadcasting large task binary with size 1452.1 KiB
24/08/20 15:23:58 WARN DAGScheduler: Broadcasting large task binary with size 1455.1 KiB
24/08/20 15:23:58 WARN DAGScheduler: Broadcasting large task binary with size 1456.5 KiB
24/08/20 15:23:58 WARN DAGScheduler: Broadcasting large task binary with size 1457.9 KiB
24/08/20 15:23:58 WARN DAGScheduler: Broadcasting large task binary with size 1459.3 KiB
24/08/20 15:23:58 WARN DAGScheduler: Broadcasting large task binary with size 1460.7 KiB
24/08/20 15:23:58 WARN DAGScheduler: Broadcasting large task binary with size 1462.0 KiB
24/08/20 15:23:59 WAR

Root-Mean-Square Error (RMSE): 4.235059683566521


24/08/20 15:24:03 WARN DAGScheduler: Broadcasting large task binary with size 1530.6 KiB
24/08/20 15:24:04 WARN DAGScheduler: Broadcasting large task binary with size 1537.2 KiB


+---------+------------------------------------------------------------------------------------------+
|userIndex|recommendations                                                                           |
+---------+------------------------------------------------------------------------------------------+
|31       |[{513, 31.485573}, {733, 30.679256}, {587, 25.936277}, {413, 24.498417}, {501, 24.439445}]|
|53       |[{513, 28.795568}, {440, 28.114601}, {404, 25.281174}, {362, 22.203552}, {418, 21.875801}]|
|65       |[{543, 21.414267}, {467, 20.633917}, {527, 19.10141}, {426, 19.003853}, {753, 18.295755}] |
|78       |[{753, 17.899164}, {491, 17.048523}, {621, 16.996994}, {426, 16.270195}, {632, 16.147694}]|
|85       |[{513, 37.635693}, {693, 34.273727}, {683, 30.934855}, {684, 30.93485}, {411, 29.802696}] |
+---------+------------------------------------------------------------------------------------------+
only showing top 5 rows





+---------+----------------------------------------------------------------------------------------+
|itemIndex|recommendations                                                                         |
+---------+----------------------------------------------------------------------------------------+
|1        |[{278, 9.62098}, {81, 9.574989}, {11, 8.840216}, {87, 8.491468}, {301, 8.000879}]       |
|3        |[{803, 6.9309187}, {197, 6.314759}, {1034, 6.1647205}, {483, 6.146794}, {195, 6.142402}]|
|5        |[{142, 6.0659676}, {468, 5.913543}, {181, 5.8291025}, {172, 5.6990066}, {294, 5.428657}]|
|6        |[{110, 7.2183495}, {36, 6.525296}, {177, 6.3334436}, {696, 5.2882795}, {54, 5.2805076}] |
|9        |[{84, 8.8535185}, {197, 8.538243}, {110, 8.11378}, {165, 7.863947}, {145, 7.64903}]     |
+---------+----------------------------------------------------------------------------------------+
only showing top 5 rows



24/08/20 15:24:05 WARN DAGScheduler: Broadcasting large task binary with size 1530.2 KiB
                                                                                

In [24]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer

# Initialize SparkSession
spark = SparkSession.builder \
    .appName("ALS Collaborative Filtering") \
    .getOrCreate()

# Load the JSON file into a DataFrame
df = spark.read.json('/home/lplab/Documents/220962067/movies 1.json')

# Display schema and sample data
df.printSchema()
df.show(5)

# Prepare the data
data = df.select(col('user_id').alias('user'), 
                 col('product_id').alias('item'), 
                 col('score').alias('rating'))

# Convert user_id and product_id to numeric indices
user_indexer = StringIndexer(inputCol='user', outputCol='userIndex')
item_indexer = StringIndexer(inputCol='item', outputCol='itemIndex')

data = user_indexer.fit(data).transform(data)
data = item_indexer.fit(data).transform(data)

# Split the data into training and test sets
(training_data, test_data) = data.randomSplit([0.8, 0.2])

# Build the ALS model
als = ALS(
    maxIter=10,
    regParam=0.01,
    rank=10,
    userCol='userIndex',
    itemCol='itemIndex',
    ratingCol='rating',
    coldStartStrategy='drop'
)

# Train the model
model = als.fit(training_data)

# Make predictions on the test data
predictions = model.transform(test_data)

# Evaluate the model using RMSE
rmse_evaluator = RegressionEvaluator(
    metricName='rmse',
    labelCol='rating',
    predictionCol='prediction'
)
rmse = rmse_evaluator.evaluate(predictions)
print(f'Root-Mean-Square Error (RMSE): {rmse}')

# Evaluate the model using MAE
mae_evaluator = RegressionEvaluator(
    metricName='mae',
    labelCol='rating',
    predictionCol='prediction'
)
mae = mae_evaluator.evaluate(predictions)
print(f'Mean Absolute Error (MAE): {mae}')

# Stop the SparkSession
spark.stop()


root
 |-- helpfulness: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- profile_name: string (nullable = true)
 |-- review: string (nullable = true)
 |-- score: double (nullable = true)
 |-- summary: string (nullable = true)
 |-- time: long (nullable = true)
 |-- user_id: string (nullable = true)

+-----------+----------+--------------------+--------------------+-----+--------------------+----------+--------------+
|helpfulness|product_id|        profile_name|              review|score|             summary|      time|       user_id|
+-----------+----------+--------------------+--------------------+-----+--------------------+----------+--------------+
|        7/7|B003AI2VGA|Brian E. Erland "...|Synopsis: On the ...|  3.0|"There Is So Much...|1182729600|A141HP4LYPWMSR|
|        4/4|B003AI2VGA|          Grady Harp|THE VIRGIN OF JUA...|  3.0|Worthwhile and Im...|1181952000|A328S9RN3U5M68|
|       8/10|B003AI2VGA|Chrissy K. McVay ...|The scenes in thi...|  5.0|This m

24/08/20 15:33:23 WARN DAGScheduler: Broadcasting large task binary with size 1445.9 KiB
24/08/20 15:33:23 WARN DAGScheduler: Broadcasting large task binary with size 1448.2 KiB
24/08/20 15:33:23 WARN DAGScheduler: Broadcasting large task binary with size 1449.8 KiB
24/08/20 15:33:23 WARN DAGScheduler: Broadcasting large task binary with size 1451.0 KiB
24/08/20 15:33:23 WARN DAGScheduler: Broadcasting large task binary with size 1450.0 KiB
24/08/20 15:33:23 WARN DAGScheduler: Broadcasting large task binary with size 1451.3 KiB
24/08/20 15:33:23 WARN DAGScheduler: Broadcasting large task binary with size 1452.1 KiB
24/08/20 15:33:23 WARN DAGScheduler: Broadcasting large task binary with size 1455.1 KiB
24/08/20 15:33:23 WARN DAGScheduler: Broadcasting large task binary with size 1456.5 KiB
24/08/20 15:33:23 WARN DAGScheduler: Broadcasting large task binary with size 1457.9 KiB
24/08/20 15:33:24 WARN DAGScheduler: Broadcasting large task binary with size 1459.3 KiB
24/08/20 15:33:24 WAR

Root-Mean-Square Error (RMSE): 4.024470392913011


24/08/20 15:33:26 WARN DAGScheduler: Broadcasting large task binary with size 1488.7 KiB
24/08/20 15:33:27 WARN DAGScheduler: Broadcasting large task binary with size 1537.9 KiB


Mean Absolute Error (MAE): 2.703624600756272


In [25]:
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import col

# Initialize Spark session
spark = SparkSession.builder \
    .appName("RecommendationModel") \
    .getOrCreate()

# Load data from JSON file
input_file = "/home/lplab/Documents/220962067/movies 1.json"  # Update this path to your JSON file
df = spark.read.json(input_file)

# Select relevant columns for the recommendation model
df = df.select(col("user_id"), col("product_id"), col("score"))

# Convert user_id and product_id to integer index (StringIndexer or a similar approach)
from pyspark.ml.feature import StringIndexer

# Indexing user_id
user_indexer = StringIndexer(inputCol="user_id", outputCol="user_index")
user_indexed_df = user_indexer.fit(df).transform(df)

# Indexing product_id
product_indexer = StringIndexer(inputCol="product_id", outputCol="product_index")
product_indexed_df = product_indexer.fit(user_indexed_df).transform(user_indexed_df)

# Prepare data for ALS model
als_data = product_indexed_df.select(col("user_index").alias("user"), col("product_index").alias("item"), col("score"))

# Split data into training and test sets
(training_data, test_data) = als_data.randomSplit([0.8, 0.2], seed=42)

# Create and train ALS model
als = ALS(
    maxIter=10,
    regParam=0.01,
    userCol="user",
    itemCol="item",
    ratingCol="score",
    coldStartStrategy="drop"
)

model = als.fit(training_data)

# Evaluate the model on test data
predictions = model.transform(test_data)

# Show some sample predictions
predictions.show()



# Stop the Spark session
spark.stop()


24/08/20 15:38:10 WARN DAGScheduler: Broadcasting large task binary with size 1445.3 KiB
24/08/20 15:38:10 WARN DAGScheduler: Broadcasting large task binary with size 1447.6 KiB
24/08/20 15:38:10 WARN DAGScheduler: Broadcasting large task binary with size 1449.1 KiB
24/08/20 15:38:11 WARN DAGScheduler: Broadcasting large task binary with size 1450.4 KiB
24/08/20 15:38:11 WARN DAGScheduler: Broadcasting large task binary with size 1449.4 KiB
24/08/20 15:38:11 WARN DAGScheduler: Broadcasting large task binary with size 1450.7 KiB
24/08/20 15:38:11 WARN DAGScheduler: Broadcasting large task binary with size 1451.5 KiB
24/08/20 15:38:11 WARN DAGScheduler: Broadcasting large task binary with size 1454.5 KiB
24/08/20 15:38:11 WARN DAGScheduler: Broadcasting large task binary with size 1455.9 KiB
24/08/20 15:38:11 WARN DAGScheduler: Broadcasting large task binary with size 1457.3 KiB
24/08/20 15:38:11 WARN DAGScheduler: Broadcasting large task binary with size 1458.7 KiB
24/08/20 15:38:11 WAR

+------+-----+-----+-----------+
|  user| item|score| prediction|
+------+-----+-----+-----------+
| 833.0| 83.0|  4.0|   1.363375|
|2366.0| 21.0|  4.0|   3.927279|
|3749.0| 21.0|  3.0|  2.9454594|
|1395.0| 21.0|  5.0|-0.44741273|
|5433.0| 37.0|  5.0|-0.40601954|
|3306.0| 37.0|  5.0| 0.90569335|
| 322.0| 85.0|  4.0|  4.2532935|
|5920.0| 21.0|  4.0|   3.927279|
|4798.0| 21.0|  5.0|   4.909099|
|4452.0| 85.0|  4.0| 0.60977006|
| 193.0| 83.0|  5.0|   6.816571|
|4263.0|  7.0|  2.0|  0.2378944|
|5417.0| 21.0|  5.0|   4.909099|
| 126.0|768.0|  1.0|  10.330318|
|4683.0| 21.0|  4.0|   3.927279|
| 183.0|431.0|  3.0| -3.2916005|
|1415.0| 83.0|  4.0|   2.671285|
| 723.0|163.0|  4.0|   4.106262|
| 914.0| 78.0|  4.0| -1.3822641|
|  27.0| 85.0|  2.0|    2.43526|
+------+-----+-----+-----------+
only showing top 20 rows

