In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .master('local[*]') \
    .config('spark.driver.memory', '15g') \
    .appName('cafe-build-recommender') \
    .getOrCreate()

sc = spark.sparkContext

In [2]:
import os
datasets_path = os.path.join('..', 'data')

In [3]:
rating_file = os.path.join(datasets_path, 'rating.csv')
ratings_raw_data = sc.textFile(rating_file)
ratings_raw_data_header = ratings_raw_data.take(1)[0]

In [4]:
ratings_data = ratings_raw_data.filter(lambda line: line != ratings_raw_data_header)\
    .map(lambda line: line.split(","))\
    .map(lambda tokens: (tokens[1], tokens[0].replace("-", ""), tokens[2])).cache()

In [5]:
ratings_data.take(3)

[('1', '5610000104201900272', '10'),
 ('1', '5610000104201900212', '10'),
 ('1', '3770000104201900160', '10')]

데이터순서: (유저ID, 카페ID, LIKE)
(3760000-104-2019-00185) 이런식의 카페 ID 에서 -를 삭제해서 사용한다.
(7)-(3)-(4)-(5) 형태의 총 19 자리의 관리번호.

In [16]:
cafe_file = os.path.join(datasets_path, 'data.csv')

cafe_raw_data = sc.textFile(cafe_file)
cafe_raw_data_header = cafe_raw_data.take(1)[0]

headerList = str(cafe_raw_data_header).split(",") # 관리번호 = PK
need_column = ['관리번호', '영업상태구분코드', '영업상태명', '소재지전화',\
        '소재지면적','소재지전체주소', '도로명전체주소', '사업장명', '최종수정시점',\
        '업태구분명', '좌표정보(X)', '좌표정보(Y)', '시설총규모', '홈페이지']
need_column_index = []
column_list = []
for i, value in enumerate(headerList):
    if str(value) in need_column:
        need_column_index.append(i)
        column_list.append(str(value))

cafe_data = cafe_raw_data.filter(lambda line: line!=cafe_raw_data_header) \
    .map(lambda line: line.split(",")) \
    .filter(lambda tokens: tokens[24] == '커피숍' and tokens[7] =='영업/정상')\
    .map(lambda tokens: [tokens[i] for i in need_column_index]) \
    .filter(lambda tokens: tokens[5].split()[0] == '경기도' or tokens[5].split()[0] == '충청북도')\
    .map(lambda tokens: (tokens[0].replace("-", ""), tokens[7]))

cafe_data.take(3)

[('5610000104201800133', '커피에반하다영통구청2호점'),
 ('3770000104201900191', '시에커피(sie coffee)'),
 ('3760000104201900185', '날쌘카페(탑동점)')]

여기까지 데이터 세팅 완료

In [17]:
###### cafe 데이터 integer mapping #######

ratings_with_unique_id = ratings_data.zipWithUniqueId()
print(ratings_with_unique_id.take(10))
ratings_data = ratings_with_unique_id.map(lambda tokens: (tokens[0][0], tokens[1], tokens[0][2]))
ratings_data.take(3)

[(('1', 0, '10'), 0), (('1', 2, '10'), 2), (('1', 4, '10'), 4), (('1', 6, '10'), 6), (('1', 8, '10'), 8), (('1', 10, '10'), 10), (('1', 12, '10'), 12), (('1', 14, '10'), 14), (('1', 16, '10'), 16), (('1', 18, '10'), 18)]


[('1', 0, '10'), ('1', 2, '10'), ('1', 4, '10')]

integer_map_list << ALS 에서 training_RDD 의 값이 integer 만 사용가능해서 mapping 한 데이터를 사용해서 ratings 의 값들을 integer 값으로 변경.

In [18]:
training_RDD, validation_RDD, test_RDD = ratings_data.randomSplit([6, 2, 2], seed=0)
validation_for_predict_RDD = validation_RDD.map(lambda x: (x[0], x[1]))
test_for_predict_RDD = test_RDD.map(lambda x:(x[0], x[1]))

In [25]:
training_RDD.take(2)

[('1', 0, '10'), ('1', 2, '10')]

In [None]:
from pyspark.mllib.recommendation import ALS
import math

seed = 10
iterations = 10
regularization_parameter = 0.1
ranks = [1, 10, 100, 1000]
errors = [0, 0, 0, 0]
err = 0
tolerance = 0.02

min_error = float('inf')
best_rank = -1
best_iteration = -1
for rank in ranks:
    model = ALS.train(training_RDD, rank, seed=seed, iterations=iterations, lambda_=regularization_parameter)
    predictions = model.predictAll(validation_for_predict_RDD).map(lambda r: ((r[0], r[1]), r[2]))
    rates_and_preds = validation_RDD.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
    error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
    errors[err] = error
    err += 1
    print('For rank %s the RMSE is %s' % (rank, error))
    if error < min_error:
        min_error = error
        best_rank = rank

print('The best model was trained with rank %s' % best_rank)

For rank 1 the RMSE is 0.0
For rank 10 the RMSE is 0.0


In [30]:
print(predictions.take(3))
print(rates_and_preds.take(3))

[]
[]


이제 유저의 레이팅을 추가해보자.

In [21]:
new_user_ID = 0

# The format of each line is (userID, movieID, rating)
new_user_ratings = [
     (0,260,10),
     (0,24,10),
     (0,100,10),
     (0,682,10),
     (0,10,10),
     (0,4,10),
     (0,12,10),
     (0,492,10),
     (0,795,10),
     (0,2,10)
    ]
new_user_ratings_RDD = sc.parallelize(new_user_ratings)
print('New user ratings: %s' % new_user_ratings_RDD.take(10))

New user ratings: [(0, 260, 10), (0, 24, 10), (0, 100, 10), (0, 682, 10), (0, 10, 10), (0, 4, 10), (0, 12, 10), (0, 492, 10), (0, 795, 10), (0, 2, 10)]
