In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

file_path = '/content/drive/MyDrive/Recommender/Beer_taste_Reviews_1M_trainingset.csv'
df = pd.read_csv(file_path)


In [4]:
df.shape

(1047592, 10)

In [6]:
df = df[['review_profilename', 'beer_beerid', 'review_taste']]
df.shape



(1047592, 3)

In [7]:
df.dropna(inplace=True)
df.shape

(1047357, 3)

In [8]:
df.head(2)

Unnamed: 0,review_profilename,beer_beerid,review_taste
0,7,245,4.0
1,187,47364,5.0


In [9]:
from sklearn.preprocessing import LabelEncoder

# Encode user and beer IDs
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

df['user_id'] = user_encoder.fit_transform(df['review_profilename'])
df['beer_id'] = item_encoder.fit_transform(df['beer_beerid'])


In [10]:
df.head(2)

Unnamed: 0,review_profilename,beer_beerid,review_taste,user_id,beer_id
0,7,245,4.0,171,129
1,187,47364,5.0,40,24810


In [None]:
!pip install pyspark

In [13]:
from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession

# Initialize Spark session
spark = SparkSession.builder.appName("ALSRecommendation").getOrCreate()

# Convert pandas dataframe to Spark dataframe
spark_df = spark.createDataFrame(df[['user_id', 'beer_id', 'review_taste']])

# Create ALS model
als = ALS(
    userCol='user_id',
    itemCol='beer_id',
    ratingCol='review_taste',
    implicitPrefs=False,  # Set to True for implicit feedback datasets
    coldStartStrategy='drop'  # Avoids issues with NaN predictions
)

# Train-test split
train, test = spark_df.randomSplit([0.8, 0.2], seed=42)

# Fit the ALS model
model = als.fit(train)


In [14]:
predictions = model.transform(test)
predictions.show()


+-------+-------+------------+----------+
|user_id|beer_id|review_taste|prediction|
+-------+-------+------------+----------+
|    148|    330|         4.0| 3.8447661|
|    148|    481|         4.0| 3.2538626|
|    148|   1130|         4.0| 3.7304356|
|    148|   1847|         4.0| 3.7840972|
|    463|    646|         2.5| 3.3881428|
|   1238|   8749|         4.5|  4.417529|
|   1238|   9311|         4.0|  4.155345|
|   1238|  10132|         5.0| 4.5428843|
|   1238|  16564|         4.5|  4.273036|
|   1238|  17497|         3.0|  4.091258|
|   1238|  20762|         4.0| 3.3804183|
|   1238|  27751|         3.5| 4.0186415|
|   1238|  28211|         4.5|  4.234785|
|   1238|  28568|         4.5| 3.3452659|
|   1238|  32415|         5.0| 4.3562803|
|   1238|  32448|         3.5| 3.9302983|
|   1238|  35504|         4.0| 3.8479834|
|   1342|    139|         3.5| 3.9277146|
|   1342|   3355|         4.0|  4.313206|
|   1580|     19|         4.5| 4.6497993|
+-------+-------+------------+----

In [15]:
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="review_taste",
    predictionCol="prediction"
)

# Calculate RMSE
rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error = {rmse}")


Root-mean-square error = 0.5764026739564645


In [16]:
# Recommend top 5 beers for each user
user_recommendations = model.recommendForAllUsers(5)
user_recommendations.show()


+-------+--------------------+
|user_id|     recommendations|
+-------+--------------------+
|      1|[{17071, 5.493697...|
|      3|[{17071, 5.505174...|
|      5|[{18148, 6.018256...|
|      6|[{19265, 5.092736...|
|      9|[{18148, 4.786301...|
|     12|[{18148, 5.351186...|
|     13|[{18148, 4.662886...|
|     15|[{17071, 5.862620...|
|     16|[{18176, 5.933226...|
|     17|[{17071, 6.13645}...|
|     19|[{17071, 5.906768...|
|     20|[{17071, 3.122571...|
|     22|[{32236, 5.545805...|
|     26|[{17071, 4.671387...|
|     27|[{18148, 4.998515...|
|     28|[{17071, 5.455913...|
|     31|[{17071, 5.751196...|
|     34|[{1806, 5.83587},...|
|     35|[{9248, 5.4323974...|
|     37|[{17071, 5.421525...|
+-------+--------------------+
only showing top 20 rows



In [None]:
# Example parameter tuning
als = ALS(userCol='user_id', itemCol='beer_id', ratingCol='review_taste',
          maxIter=10, regParam=0.1, rank=10)