In [1]:
import findspark
findspark.init()

import pyspark
from pyspark import SparkConf, SparkContext
from pyspark.mllib.recommendation import ALS, Rating
from pyspark.mllib.evaluation import RegressionMetrics, RankingMetrics

import numpy as np
import pandas as pd
from itertools import islice
from collections import defaultdict

In [2]:
def parseline(line):
    fields = line.split(',')
    userId = fields[0]
    movieId = fields[1]
    rating = fields[2]
    return (userId, movieId, rating)
    
# pyspark set-up
sc.setCheckpointDir('checkpoint')

# Build rating object for ALS 
print("\nLoading data...")


Loading data...


In [3]:
lines = sc.textFile("res/sample/ratings.csv")
parsedlines = lines.map(parseline)
header = parsedlines.first()

#filter out the header, make sure the rest looks correct
parsedlines = parsedlines.filter(lambda line: line != header)

ratings = parsedlines.map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2])) ).cache()
print(f"Ratings : \n{ratings.take(3)}")

Ratings : 
[Rating(user=1, product=1, rating=4.0), Rating(user=1, product=3, rating=4.0), Rating(user=1, product=6, rating=4.0)]


In [4]:
lines1 = sc.textFile("res/sample/test.csv")
parsedlines1 = lines1.map(parseline)
header1 = parsedlines1.first()

#filter out the header, make sure the rest looks correct
parsedlines1 = parsedlines1.filter(lambda line: line != header1)

ratings1 = parsedlines1.map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2])) ).cache()
print(f"test Ratings : \n{ratings1.take(3)}")

test Ratings : 
[Rating(user=0, product=1, rating=307.0), Rating(user=1, product=1, rating=1590.0), Rating(user=2, product=1, rating=3424.0)]


In [5]:
testset = ratings1.map(lambda t: (t[0], t[1]))
print(f"Testset : \n{testset.take(3)}")

Testset : 
[(0, 1), (1, 1), (2, 1)]


In [9]:
# Build the recommendation model using Alternating Least Squares
print("\nTraining recommendation model...")
rank = 5
numIterations = 20
model = ALS.train(ratings, rank, numIterations)

predictions = model.predictAll(testset).collect()
#print(f"Predictions : \n{predictions}")

embedding = model.userFeatures().collect()
print("\nDone!")


Training recommendation model...

Done!


In [10]:
pred_df = pd.DataFrame(predictions)
pred_df.head()
#pred_df = pred_df[['user','product', 'title', 'rating']]
#pred_df.to_csv("als_predictions.csv", index=False, header = True)

Unnamed: 0,user,product,rating
0,160,5,2.759478
1,560,42,2.530827
2,240,14,3.895874
3,400,26,3.897675
4,480,36,4.219032
