# ETL for Reviews

In [1]:
import petl as etl

# Extract

In [2]:
# Name of the review JSON member stored inside the zip archive.
filename = 'yelp_academic_dataset_review_drop_PA.json'

In [3]:
# Read the review JSON member directly out of the zip archive. The member name
# comes from `filename` (defined in the previous cell) so it is spelled out in
# exactly one place instead of being repeated as a second literal here.
source = etl.sources.ZipSource('Pittsburgh_json.zip', filename)
t1 = etl.fromjson(source)

In [4]:
fields = t1.fieldnames()

# Report, for every column, which Python types occur in it and how often.
for field in fields:
    # A single pre-formatted string keeps this line valid on both Python 2 and
    # Python 3 while emitting the same "<name> <TAB><counter>" text as the
    # former Python-2-only print statement.
    print('%s \t%s' % (field, t1.typecounter(field)))

funny 	Counter({'int': 179774})
user_id 	Counter({'unicode': 179774})
review_id 	Counter({'unicode': 179774})
business_id 	Counter({'unicode': 179774})
stars 	Counter({'int': 179774})
date 	Counter({'unicode': 179774})
useful 	Counter({'int': 179774})
type 	Counter({'unicode': 179774})
cool 	Counter({'int': 179774})


Due to the current types of data, we need to do the following:
 1. Add ascending row numbers and call the column "review_id" (this will be used for the auto-incremented review_ID field); the original Yelp `review_id` is kept as "yelp_review_id".
 2. Aggregate the columns `funny`, `cool` and `useful` into a new "votes_aggregate" column.
 3. Replace `user_id`, `business_id` and `date` with their foreign keys (in the DB). This will be done during the Load.

In [5]:
# Prepend a running row counter (column "row") to the raw review table.
t2 = etl.addrownumbers(t1)
# Preview the first 10 rows.
t2.display(10)

row,funny,user_id,review_id,business_id,stars,date,useful,type,cool
1,0,hzw-qTUVpmLAKjdkoUNh8A,Awq_6cyNjK1-qPZAwnXjjQ,7p6tHUA1Pknh0DVWqz86lA,1,2016-08-27,0,review,0
2,0,mldKxVI59o3LhK3ITG6mnA,96YkAuJzlT54qZZWNebFUg,7p6tHUA1Pknh0DVWqz86lA,5,2015-06-15,0,review,0
3,0,SaedHW9i7k4lHR8tgwtMgQ,OfZRG7RgKA118zDtj6yo-g,7p6tHUA1Pknh0DVWqz86lA,5,2014-01-16,0,review,0
4,0,87CKG39VfXYRupM3VRfReg,YEgNOZDCeLuQimfNVYC2AA,7p6tHUA1Pknh0DVWqz86lA,1,2013-10-09,2,review,0
5,0,kESRYcaODjB6s9p1-alBTw,tEBhqaLMYBvxhkdxr1Pr7g,7p6tHUA1Pknh0DVWqz86lA,5,2015-02-25,1,review,1
6,0,07XUD-8jPwtpgMGYT_veJw,HvlhVHFsjqr1SXiJLqPtNA,7p6tHUA1Pknh0DVWqz86lA,1,2013-01-03,9,review,0
7,0,-9YbyjrujLmFVORF3PA0YQ,S-JbWPzCCxgP288Ao-_qYQ,95s7ZRceq-mYD-a7DZntnQ,1,2015-08-04,0,review,0
8,0,V6IBbUN_bBfzd5s8LlLTlw,L2eNJp3tYOoM9Kr0ouDztg,95s7ZRceq-mYD-a7DZntnQ,1,2014-12-08,1,review,0
9,1,-UPUYET3Pwm99zq-uHl7gg,OPBHCs-lcEVWbEZY7rKNUg,0Wy4gw8krao9nGq-sHWFSQ,3,2013-12-01,1,review,1
10,0,WLrFY6_Z32lb_ifcIvRvkw,SjgayD75MU7yTWEs_cGbwg,0Wy4gw8krao9nGq-sHWFSQ,3,2013-05-06,0,review,0


In [8]:
# Free up the name "review_id" by moving the Yelp-provided id to "yelp_review_id".
t3 = t2.rename('review_id', 'yelp_review_id')

# The generated row counter becomes the new "review_id".
t4 = t3.rename('row', 'review_id')

In [9]:
def votes_aggregate(row):
    """Return the combined vote count of one review row.

    `row` is indexed by field name; the result is the sum of its 'funny',
    'cool' and 'useful' values, added in that order.
    """
    funny, cool, useful = row['funny'], row['cool'], row['useful']
    return funny + cool + useful
# Append the per-row vote total as a new "votes_aggregate" column.
t5 = etl.addfield(t4, 'votes_aggregate', votes_aggregate)
# Preview the first 10 rows, including the new column.
t5.display(10)

review_id,funny,user_id,yelp_review_id,business_id,stars,date,useful,type,cool,votes_aggregate
1,0,hzw-qTUVpmLAKjdkoUNh8A,Awq_6cyNjK1-qPZAwnXjjQ,7p6tHUA1Pknh0DVWqz86lA,1,2016-08-27,0,review,0,0
2,0,mldKxVI59o3LhK3ITG6mnA,96YkAuJzlT54qZZWNebFUg,7p6tHUA1Pknh0DVWqz86lA,5,2015-06-15,0,review,0,0
3,0,SaedHW9i7k4lHR8tgwtMgQ,OfZRG7RgKA118zDtj6yo-g,7p6tHUA1Pknh0DVWqz86lA,5,2014-01-16,0,review,0,0
4,0,87CKG39VfXYRupM3VRfReg,YEgNOZDCeLuQimfNVYC2AA,7p6tHUA1Pknh0DVWqz86lA,1,2013-10-09,2,review,0,2
5,0,kESRYcaODjB6s9p1-alBTw,tEBhqaLMYBvxhkdxr1Pr7g,7p6tHUA1Pknh0DVWqz86lA,5,2015-02-25,1,review,1,2
6,0,07XUD-8jPwtpgMGYT_veJw,HvlhVHFsjqr1SXiJLqPtNA,7p6tHUA1Pknh0DVWqz86lA,1,2013-01-03,9,review,0,9
7,0,-9YbyjrujLmFVORF3PA0YQ,S-JbWPzCCxgP288Ao-_qYQ,95s7ZRceq-mYD-a7DZntnQ,1,2015-08-04,0,review,0,0
8,0,V6IBbUN_bBfzd5s8LlLTlw,L2eNJp3tYOoM9Kr0ouDztg,95s7ZRceq-mYD-a7DZntnQ,1,2014-12-08,1,review,0,1
9,1,-UPUYET3Pwm99zq-uHl7gg,OPBHCs-lcEVWbEZY7rKNUg,0Wy4gw8krao9nGq-sHWFSQ,3,2013-12-01,1,review,1,3
10,0,WLrFY6_Z32lb_ifcIvRvkw,SjgayD75MU7yTWEs_cGbwg,0Wy4gw8krao9nGq-sHWFSQ,3,2013-05-06,0,review,0,0


In [None]:
# Select the fact-table columns from t5, the table that carries both the
# generated numeric "review_id" and "votes_aggregate". The previous version cut
# from t3, where the row counter is still named "row" and no "review_id" field
# exists, and it listed an empty string as a field name, so the selection could
# not be resolved.
# NOTE(review): the column list is inferred from the plan described earlier in
# this notebook (keys plus measures) -- confirm against the target fact table.
out_facts_reviews = t5.cut(
    'review_id', 'user_id', 'business_id', 'date', 'stars', 'votes_aggregate'
)