In [13]:
from pyspark import SparkConf, SparkContext

# Run Spark in local mode under a descriptive application name.
conf = SparkConf().setMaster("local").setAppName("restaurant-review-average")

# getOrCreate() returns the already-running SparkContext if there is one, so this
# cell can be re-run without failing because a context already exists.
# Note: when a context is reused, `conf` is not re-applied to it.
sc = SparkContext.getOrCreate(conf=conf)

sc

In [12]:
# Manual reset only: run `sc.stop()` by hand when a stale SparkContext must be torn
# down before re-creating it. It is intentionally not executed here, because on a
# top-to-bottom run it would stop the context that the cells below still need.

In [16]:
# Absolute location of the input CSV on the machine this notebook was run on.
# NOTE(review): this path is machine-specific; adjust it when running elsewhere.
filepath = "/home/ubuntu/working/spark/data/restaurant_reviews.csv"

In [18]:
!pwd # expected: /home/ubuntu/working/spark, so that appending data/ gives the CSV directory

/home/ubuntu/working/spark


In [19]:
# List the data directory to confirm the input CSV is present.
!ls data/

restaurant_reviews.csv	xAPI-Edu-Data.csv


In [17]:
# Read the CSV from the local filesystem (file:// scheme) as an RDD with one string per line.
lines = sc.textFile(f"file:///{filepath}")
# collect() brings every element back to the driver; fine here because the file is tiny.
lines.collect()

                                                                                

['id,item,cateogry,reviews,',
 '0,짜장면,중식,125,',
 '1,짬뽕,중식,235,',
 '2,김밥,분식,32,',
 '3,떡볶이,분식,534,',
 '4,라멘,일식,223,',
 '5,돈가스,일식,52,',
 '6,우동,일식,12,',
 '7,쌀국수,아시안,312,',
 '8,햄버거,패스트푸드,12,',
 '9,치킨,패스트푸드,23']

In [27]:
# Remove the header: the first line of the file holds the column names, so keep
# only the lines that differ from it.
header = lines.first()
rows = lines.filter(lambda line: line != header)
rows

PythonRDD[11] at RDD at PythonRDD.scala:53

In [28]:
# Materialize the rows on the driver to verify that the header line is gone.
rows.collect()

['0,짜장면,중식,125,',
 '1,짬뽕,중식,235,',
 '2,김밥,분식,32,',
 '3,떡볶이,분식,534,',
 '4,라멘,일식,223,',
 '5,돈가스,일식,52,',
 '6,우동,일식,12,',
 '7,쌀국수,아시안,312,',
 '8,햄버거,패스트푸드,12,',
 '9,치킨,패스트푸드,23']

In [75]:
# Task function applied through map() to parse each CSV row.
def parse(row):
    """Extract ``(category, review_count)`` from one CSV data row.

    Args:
        row: One data line of the reviews CSV laid out as
            ``id,item,category,reviews[,]``, e.g. ``'5,돈가스,일식,52,'``.

    Returns:
        tuple[str, int]: The category (3rd column, surrounding whitespace
        removed) and the review count (4th column) converted to ``int``.

    Raises:
        IndexError: If the row has fewer than four columns.
        ValueError: If the review-count column is not an integer.
    """
    # Local import keeps this cell self-contained.
    import csv

    # csv.reader (unlike str.split) keeps quoted fields with embedded commas intact.
    # The [] default makes a row that yields no fields fail with IndexError below.
    fields = next(csv.reader([row]), [])

    # Strip the category so that ' 분식' and '분식' end up under the same key.
    category = fields[2].strip()

    # Review count; int() already tolerates surrounding whitespace.
    review_count = int(fields[3])

    return category, review_count

In [76]:
# Quick sanity check: parse a single sample row.
parse('5,돈가스,일식,52,')

('일식', 52)

In [84]:
# Apply parse() to every element (row) of the RDD and keep the transformed results -> map
category_reviews = rows.map(parse)
category_reviews

PythonRDD[19] at RDD at PythonRDD.scala:53

In [85]:
# Inspect the parsed (category, review_count) pairs.
category_reviews.collect()

[('중식', 125),
 ('중식', 235),
 ('분식', 32),
 ('분식', 534),
 ('일식', 223),
 ('일식', 52),
 ('일식', 12),
 ('아시안', 312),
 ('패스트푸드', 12),
 ('패스트푸드', 23)]

In [90]:
# Pair each review count with a 1 so that sums and row counts can be accumulated together:
# (category, review_count) -> (category, (review_count, 1)).
# The result gets its own name: reassigning category_reviews would make this cell
# non-idempotent and would feed already-paired values into the later mapValues step.
category_reviews_map = category_reviews.map(lambda pair: (pair[0], (pair[1], 1)))
category_reviews_map

PythonRDD[22] at RDD at PythonRDD.scala:53

In [92]:
# Inspect the (category, (review_count, 1)) pairs of the map()-based variant.
category_reviews_map.collect()

[('중식', (125, 1)),
 ('중식', (235, 1)),
 ('분식', (32, 1)),
 ('분식', (534, 1)),
 ('일식', (223, 1)),
 ('일식', (52, 1)),
 ('일식', (12, 1)),
 ('아시안', (312, 1)),
 ('패스트푸드', (12, 1)),
 ('패스트푸드', (23, 1))]

In [93]:
# In a pair RDD the 0th element of each tuple acts as the key.
# The transformation has to be expressed on the RDD itself: mapValues rewrites only the
# value of each pair and leaves the key untouched.
# Expects category_reviews to hold (category, review_count) pairs.
category_reviews_count = category_reviews.mapValues(lambda count: (count, 1))

In [94]:
# Inspect the (category, (review_count, 1)) pairs produced by mapValues.
category_reviews_count.collect()

[('중식', (125, 1)),
 ('중식', (235, 1)),
 ('분식', (32, 1)),
 ('분식', (534, 1)),
 ('일식', (223, 1)),
 ('일식', (52, 1)),
 ('일식', (12, 1)),
 ('아시안', (312, 1)),
 ('패스트푸드', (12, 1)),
 ('패스트푸드', (23, 1))]

In [101]:
# Cumulative aggregation: reduceByKey merges all values that share a key, which
# appears to work much like a GROUP BY. Each value is (review_sum, row_count),
# so both parts are added element-wise.
reduce = category_reviews_count.reduceByKey(
    lambda acc, cur: (acc[0] + cur[0], acc[1] + cur[1])
)

In [103]:
# Per-category (total reviews, number of rows) after the reduction.
reduce.collect()

[('중식', (360, 2)),
 ('분식', (566, 2)),
 ('일식', (287, 3)),
 ('아시안', (312, 1)),
 ('패스트푸드', (35, 2))]

In [104]:
# Average reviews per category: divide each summed review count by its row count.
average = reduce.mapValues(
    lambda total_and_count: total_and_count[0] / total_and_count[1]
)
average

PythonRDD[33] at RDD at PythonRDD.scala:53

In [106]:
# Final result: average review count per category.
average.collect()

[('중식', 180.0),
 ('분식', 283.0),
 ('일식', 95.66666666666667),
 ('아시안', 312.0),
 ('패스트푸드', 17.5)]

In [107]:
# Shut down the SparkContext now that the analysis is finished.
sc.stop()