# 1. Creating the Scheme in the Local Host

### Create MYSQL connector

In [1]:
import MySQLdb as mdb

In [2]:
# creating connection
con = mdb.connect(
               host = '127.0.0.1', user = 'root', passwd = 'root', use_unicode=True, charset="utf8")

In [3]:
# setting a cursor
cursor = con.cursor()     # get the cursor

### DDL - Create Schema

In [4]:
cursor.execute("DROP SCHEMA IF EXISTS Yelp_Pittsburgh ;")
cursor.execute("CREATE SCHEMA IF NOT EXISTS Yelp_Pittsburgh ;")
cursor.execute("USE Yelp_Pittsburgh ;")
cursor.execute('SET SQL_MODE=ANSI_QUOTES') #important for petl operation - use this kind of quotes



0L

### Create Tables

In [5]:
#dim_users table creation
cursor.execute("""
    CREATE TABLE IF NOT EXISTS dim_users (
        `user_id` int(11) NOT NULL AUTO_INCREMENT,
        `yelp_user_id` VARCHAR(20) NOT NULL,
        `review_count` INT(11),
        `fans` INT(11),
        `is_elite` BOOLEAN NOT NULL,
        `yelping_since` DATE,
        `friends_count` INT(11) NOT NULL,
        PRIMARY KEY (`user_id`));
        """)

0L

In [6]:
#dim_business table creation
cursor.execute("""
    CREATE TABLE IF NOT EXISTS dim_business (       
        `business_id` INT(11) NOT NULL AUTO_INCREMENT,
        `Yelp_business_id` VARCHAR(20) NOT NULL,
        `business_name` VARCHAR(20) NOT NULL,
        `is_open` BOOLEAN NOT NULL,
        `neighborhood` VARCHAR(20),
        `latitude` FLOAT(11),
        `longitude` FLOAT(11),
        `stars` FLOAT(11),
        `review_count` INT(11) NOT NULL,
        `food` BOOLEAN NOT NULL,
        `art & enteraitment` BOOLEAN NOT NULL,
        `stores` BOOLEAN NOT NULL,
        `beauty & spa` BOOLEAN NOT NULL,
        `health` BOOLEAN NOT NULL,
        `finance` BOOLEAN NOT NULL,
        `turists` BOOLEAN NOT NULL,
        `cars & transportation` BOOLEAN NOT NULL,
        `bars & alcohol` BOOLEAN NOT NULL,
        `other` BOOLEAN NOT NULL,
        `fashion` BOOLEAN NOT NULL,
        `real estate` BOOLEAN NOT NULL,
        
        PRIMARY KEY (`business_id`));
        """)

0L

In [7]:
#dim_dates table creation
cursor.execute("""
    CREATE TABLE IF NOT EXISTS dim_dates (       
        `date_id` INT(11) NOT NULL AUTO_INCREMENT,
        `date` DATE NOT NULL,
        `day` INT(11) NOT NULL,
        `month` INT(11) NOT NULL,
        `year` INT(11),
        `day_name` VARCHAR(45) NOT NULL,
        `holiday` VARCHAR(45),
        PRIMARY KEY (`date_id`));
        """)

0L

In [8]:
#fact_review table creation
cursor.execute("""
    CREATE TABLE IF NOT EXISTS fact_reviews (
        `review_id` INT(11) NOT NULL AUTO_INCREMENT,
        `yelp_review_id` VARCHAR(20) NOT NULL,
        `user_id` INT(11) NOT NULL,
        `business_id` INT(11) NOT NULL,
        `date_id` int(11) NOT NULL,
        `stars` FLOAT(11),
        `votes_aggregate` INT(11),
        PRIMARY KEY (`review_id`));
        """)

0L

In [9]:
#commit changes
con.commit()

### Closing the connection

In [10]:
#closing the connection
cursor.close()
con.close()

# 2. ETL

In [11]:
import petl as etl
import datetime

## Users

### Extract

In [12]:
# Extract JSON file and display
filename= 'yelp_academic_dataset_users_nofriendlist_PA.json'
t1 = etl.fromjson(filename)
t1.display(10)

yelping_since,useful,compliment_photos,compliment_list,compliment_funny,compliment_plain,user_id,fans,type,compliment_note,funny,compliment_writer,compliment_cute,average_stars,review_count,compliment_more,elite,compliment_hot,cool,name,compliment_profile,compliment_cool
2007-12-10,565,20,10,251,138,v10dHdqf42tUkxbYVnFW6w,53,user,194,987,39,21,3.23,591,17,"[u'2009', u'2008', u'2010', u'2011']",191,743,Chris,5,251
2011-03-27,324,121,10,222,171,UrOofvcBl0gAbKPAEaj83A,37,user,175,312,172,11,3.91,338,38,"[u'2014', u'2015']",213,321,Louise,25,222
2012-11-22,1,0,0,0,0,-ONyyxFUhDatBG3vw6PkUw,0,user,0,0,0,0,3.73,11,0,[u'None'],0,0,Kaitlin,0,0
2010-03-29,1143,12,1,194,173,p03AKnMXXbgetIEHHFP3wQ,43,user,77,481,84,1,3.54,359,8,"[u'2017', u'2012', u'2013', u'2015', u'2016', u'2014']",140,379,Heather,2,194
2010-07-12,66,0,0,0,1,7rKWM_1lPM4oGdXegFjp9g,3,user,1,8,0,0,3.9,42,0,[u'None'],0,11,Angelica,0,0
2012-09-03,4,0,0,2,0,57vdnMXfheEeiVd_kzxUlg,2,user,0,0,1,0,3.81,40,0,[u'None'],3,0,Amber,0,2
2008-06-05,440,1,1,112,45,6DbvYyAvTtXTmVOAG32Djw,9,user,15,469,21,0,4.04,188,7,"[u'2011', u'2010']",46,470,Sharon,2,112
2010-06-12,99,0,0,10,5,3vYsjC90S80legSDDTmqcA,10,user,3,28,8,0,3.97,71,0,"[u'2013', u'2012']",8,12,Donna,0,10
2009-03-30,65,1,1,13,15,CFyvX-AZh6cunAMu_zoF_w,8,user,8,21,10,0,3.61,167,3,"[u'2011', u'2012', u'2013', u'2014', u'2010']",7,3,Rosalie,0,13
2012-07-24,7,0,0,0,0,cYXznfbPK_ShlepnUiLpcw,1,user,0,1,1,0,3.94,14,0,[u'None'],1,0,Je'Amour,0,0


In [13]:
'''creating the users dimension:
user_id= varchar(20)
friends_count= INT
review_count= INT
fans= INT
is_elite= Binary
yelping_since= Date'''

t2 = t1.cut(['user_id','review_count','fans','elite','yelping_since'])
t2.display(10)

user_id,review_count,fans,elite,yelping_since
v10dHdqf42tUkxbYVnFW6w,591,53,"[u'2009', u'2008', u'2010', u'2011']",2007-12-10
UrOofvcBl0gAbKPAEaj83A,338,37,"[u'2014', u'2015']",2011-03-27
-ONyyxFUhDatBG3vw6PkUw,11,0,[u'None'],2012-11-22
p03AKnMXXbgetIEHHFP3wQ,359,43,"[u'2017', u'2012', u'2013', u'2015', u'2016', u'2014']",2010-03-29
7rKWM_1lPM4oGdXegFjp9g,42,3,[u'None'],2010-07-12
57vdnMXfheEeiVd_kzxUlg,40,2,[u'None'],2012-09-03
6DbvYyAvTtXTmVOAG32Djw,188,9,"[u'2011', u'2010']",2008-06-05
3vYsjC90S80legSDDTmqcA,71,10,"[u'2013', u'2012']",2010-06-12
CFyvX-AZh6cunAMu_zoF_w,167,8,"[u'2011', u'2012', u'2013', u'2014', u'2010']",2009-03-30
cYXznfbPK_ShlepnUiLpcw,14,1,[u'None'],2012-07-24


#### Transform

In [14]:
fields = t2.fieldnames()
for f in fields:
    print f,'\t', t2.typecounter(f)

user_id 	Counter({'unicode': 52141})
review_count 	Counter({'int': 52141})
fans 	Counter({'int': 52141})
elite 	Counter({'unicode': 52141})
yelping_since 	Counter({'unicode': 52141})


In [15]:
#convet unicode type to binary (creating is_elite field)
def to_binary(text):
    if text[3:7] == 'None':
        return 0 
    return 1

t3 = t2.convert('elite' , to_binary)
t3.display(10)

user_id,review_count,fans,elite,yelping_since
v10dHdqf42tUkxbYVnFW6w,591,53,1,2007-12-10
UrOofvcBl0gAbKPAEaj83A,338,37,1,2011-03-27
-ONyyxFUhDatBG3vw6PkUw,11,0,0,2012-11-22
p03AKnMXXbgetIEHHFP3wQ,359,43,1,2010-03-29
7rKWM_1lPM4oGdXegFjp9g,42,3,0,2010-07-12
57vdnMXfheEeiVd_kzxUlg,40,2,0,2012-09-03
6DbvYyAvTtXTmVOAG32Djw,188,9,1,2008-06-05
3vYsjC90S80legSDDTmqcA,71,10,1,2010-06-12
CFyvX-AZh6cunAMu_zoF_w,167,8,1,2009-03-30
cYXznfbPK_ShlepnUiLpcw,14,1,0,2012-07-24


In [16]:
#build the friends_count column by join (users and friends)

source = 'Pittsburgh_full_friend_text.json'
t4 = etl.fromjson(source)
t5 = t3.join(t4, # right table
                   lkey='user_id',rkey='user_id', #join equality columns
                   rprefix='t4_') # prefixes of columns from each table (not mandatory)

def friend_count(text):
    if text[0]=='[' and text[-1]==']':
        items =  text[1:-1].split(', ') # turn values to list
        if text[3:7]== 'None':
            return 0
        return len(items)

t6 = t5.convert('t4_friends' , friend_count)
t6.display(10)

user_id,review_count,fans,elite,yelping_since,t4_friends
--26jc8nCJBy4-7r3ZtmiQ,2,0,0,2014-08-03,0
--6CV8BPNofy7jt1JavD-g,32,2,0,2011-08-29,1
--8EvC9O7Ycim8vCP4Cj0w,4,0,0,2014-08-14,33
--CH8yRGXhO2MmbF-4BWXg,12,0,0,2011-07-18,0
--EMqnd727rtC0G5Oc-Mrg,28,0,0,2008-08-05,1
--OECAoqfSTBaZ3biOyzwA,4,0,0,2016-09-20,0
--RBfYfIpx44V5Kux2fPFA,11,0,0,2013-06-24,0
--TvGNywm2I1iwNWZmerBA,1,0,0,2012-04-27,0
--Yrq6JLmQNdbZkWJsHy5Q,145,8,1,2010-03-12,61
--_WHb1Fw3fH6RXbxpExgA,1,0,0,2013-01-14,1


In [17]:
# adding new user ID and rename the table

t7 = t6.addrownumbers()
t8 = t7.rename({'row':'user_id','user_id':'yelp_user_id','elite':'is_elite','t4_friends':'friends_count'})
t8.display(10)

user_id,yelp_user_id,review_count,fans,is_elite,yelping_since,friends_count
1,--26jc8nCJBy4-7r3ZtmiQ,2,0,0,2014-08-03,0
2,--6CV8BPNofy7jt1JavD-g,32,2,0,2011-08-29,1
3,--8EvC9O7Ycim8vCP4Cj0w,4,0,0,2014-08-14,33
4,--CH8yRGXhO2MmbF-4BWXg,12,0,0,2011-07-18,0
5,--EMqnd727rtC0G5Oc-Mrg,28,0,0,2008-08-05,1
6,--OECAoqfSTBaZ3biOyzwA,4,0,0,2016-09-20,0
7,--RBfYfIpx44V5Kux2fPFA,11,0,0,2013-06-24,0
8,--TvGNywm2I1iwNWZmerBA,1,0,0,2012-04-27,0
9,--Yrq6JLmQNdbZkWJsHy5Q,145,8,1,2010-03-12,61
10,--_WHb1Fw3fH6RXbxpExgA,1,0,0,2013-01-14,1


In [18]:
#Validate the fields type
fields = t8.fieldnames()
for f in fields:
    print f,'\t', t8.typecounter(f)

user_id 	Counter({'int': 52141})
yelp_user_id 	Counter({'unicode': 52141})
review_count 	Counter({'int': 52141})
fans 	Counter({'int': 52141})
is_elite 	Counter({'int': 52141})
yelping_since 	Counter({'unicode': 52141})
friends_count 	Counter({'int': 52141})


#### Load

In [19]:
import MySQLdb as mdb

con = mdb.connect(
               host = '127.0.0.1', user = 'root', passwd = 'root', use_unicode=True, charset="utf8") #optional - db="schema_name"  
# setting a cursor
cur = con.cursor()     # get the cursor

In [20]:
# append the data to existing tables
cur.execute('SET SQL_MODE=ANSI_QUOTES')
t8.appenddb(cur,'dim_users',schema='yelp_pittsburgh',commit=True)

  from ipykernel import kernelapp as app
  r = r + self.execute(query, a)


In [21]:
#Closing the Cursor
cur.close()

## Business

### Extract

In [22]:
# Open and display the bussiness json file.
filename= 'yelp_academic_dataset_business_PA.json'
t1 = etl.fromjson(filename)
t1.display(10)

city,review_count,neighborhood,name,business_id,longitude,postal_code,hours,state,is_open,stars,address,latitude,attributes,type,categories
Oakdale,4,,Plush Salon and Spa,cnGIivYRLxpF7tBVR_JwWA,-80.1745398943,15071.0,"[u'Tuesday 10:0-21:0', u'Wednesday 10:0-21:0', u'Thursday 10:0-21:0', u'Friday 10:0-18:0', u'Saturday 9:0-16:0']",PA,1,4.0,7014 Steubenville Pike,40.4445439533,"[u'AcceptsInsurance: False', u'BusinessAcceptsCreditCards: True', u""BusinessParking: {'garage': False, 'street': False, 'validated': False, 'lot': True, 'valet': False}"", u'ByAppointmentOnly: True', u""HairSpecializesIn: {'coloring': False, 'africanamerican': False, 'curly': False, 'perms': False, 'kids': False, 'extensions': False, 'asian': False, 'straightperms': False}"", u'RestaurantsPriceRange2: 3']",business,"[u'Hair Removal', u'Beauty & Spas', u'Blow Dry/Out Services', u'Hair Stylists', u'Hair Extensions', u'Massage', u'Permanent Makeup', u'Waxing', u'Hair Salons']"
Pittsburgh,8,West View,Benjamin Franklin Plumbing,P3LisOj7DktgGa7C5FYpnA,-80.0498873,15237.0,"[u'Monday 9:30-9:0', u'Tuesday 9:30-9:0', u'Wednesday 9:30-9:0', u'Thursday 9:30-9:0', u'Friday 9:30-9:0', u'Saturday 9:30-9:0', u'Sunday 9:30-9:0']",PA,1,4.0,451 Rochester Rd,40.5341627,[u'BusinessAcceptsCreditCards: True'],business,"[u'Plumbing', u'Home Services']"
Pittsburgh,102,North Side,Rivertowne,93otbGHE0s0m-lU1osvg9w,-80.0108802,15212.0,,PA,1,3.0,337 N Shore Dr,40.4459861,"[u'Alcohol: full_bar', u""Ambience: {'romantic': False, 'intimate': False, 'classy': False, 'hipster': False, 'divey': False, 'touristy': False, 'trendy': False, 'upscale': False, 'casual': True}"", u""BestNights: {'monday': False, 'tuesday': False, 'friday': True, 'wednesday': False, 'thursday': False, 'sunday': True, 'saturday': True}"", u'BikeParking: True', u'BusinessAcceptsCreditCards: True', u""BusinessParking: {'garage': False, 'street': True, 'validated': False, 'lot': False, 'valet': False}"", u'Caters: False', u'CoatCheck: False', u'GoodForDancing: False', u'GoodForKids: True', u""GoodForMeal: {'dessert': False, 'latenight': False, 'lunch': True, 'dinner': True, 'breakfast': False, 'brunch': False}"", u'HappyHour: True', u'HasTV: True', u""Music: {'dj': False, 'background_music': False, 'no_music': False, 'karaoke': False, 'live': False, 'video': False, 'jukebox': False}"", u'NoiseLevel: loud', u'OutdoorSeating: True', u'RestaurantsAttire: casual', u'RestaurantsDelivery: False', u'RestaurantsGoodForGroups: True', u'RestaurantsPriceRange2: 2', u'RestaurantsReservations: False', u'RestaurantsTableService: True', u'RestaurantsTakeOut: True', u'Smoking: no', u'WheelchairAccessible: True', u'WiFi: free']",business,"[u'American (New)', u'Pubs', u'Nightlife', u'Bars', u'Restaurants']"
Pittsburgh,5,Greentree,City Vista,csdZolWIWvkIHIqsCmV6sg,-80.037872133,15220.0,"[u'Monday 10:0-18:0', u'Tuesday 10:0-18:0', u'Wednesday 10:0-18:0', u'Thursday 10:0-18:0', u'Friday 10:0-18:0', u'Saturday 10:0-17:0', u'Sunday 13:0-17:0']",PA,1,2.0,325 Elizabeth Dr,40.4263863165,"[u'BusinessAcceptsCreditCards: True', u'ByAppointmentOnly: False']",business,"[u'Hotels & Travel', u'Home Services', u'Hotels', u'Apartments', u'Event Planning & Services', u'Real Estate Services', u'Real Estate']"
Etna,38,Etna,The Scarehouse,kdjrQ2tuY4eqo4JZWIx50Q,-79.944321,15223.0,"[u'Thursday 19:0-22:0', u'Friday 19:0-0:0', u'Saturday 19:0-0:0', u'Sunday 19:0-22:0']",PA,1,3.5,118 Locust St,40.499285,[u'GoodForKids: False'],business,"[u'Performing Arts', u'Arts & Entertainment']"
Pittsburgh,5,Bloomfield,Verizon Wireless Premium Retailer,h7ZW6c1Vi9V6_-gd2Ch_4g,-79.946995,15213.0,"[u'Monday 10:0-21:0', u'Tuesday 10:0-21:0', u'Wednesday 10:0-21:0', u'Thursday 10:0-21:0', u'Friday 10:0-21:0', u'Saturday 10:0-21:0', u'Sunday 11:0-17:0']",PA,1,2.0,4815 Centre Ave,40.453725,"[u'BusinessAcceptsCreditCards: True', u""BusinessParking: {'garage': False, 'street': False, 'validated': False, 'lot': True, 'valet': False}"", u'DogsAllowed: True', u'RestaurantsPriceRange2: 4', u'WheelchairAccessible: True']",business,"[u'Shopping', u'Mobile Phones']"
Bridgeville,8,,My Favorite Sweet Shoppe,CO7TpBuNFr_K0MTRaf9prw,-80.0951019,15017.0,"[u'Monday 10:0-20:0', u'Tuesday 10:0-20:0', u'Wednesday 10:0-20:0', u'Thursday 10:0-20:0', u'Friday 10:0-20:0', u'Saturday 10:0-20:0', u'Sunday 10:0-18:0']",PA,1,5.0,1597 Washington Pike,40.3828646,"[u'BikeParking: False', u'BusinessAcceptsCreditCards: True', u""BusinessParking: {'garage': False, 'street': False, 'validated': False, 'lot': False, 'valet': False}"", u'DogsAllowed: False', u'RestaurantsPriceRange2: 2', u'RestaurantsTakeOut: True', u'WheelchairAccessible: True']",business,"[u'Chocolatiers & Shops', u'Candy Stores', u'Desserts', u'Gelato', u'Specialty Food', u'Food']"
Pittsburgh,3,Strip District,Dulcinea Bakeshop,HWrbZS1mxVRj2Y2VwMmDMg,-79.9777282029,15222.0,,PA,0,4.5,2627 Penn Ave,40.4553505818,"[u'BikeParking: True', u'BusinessAcceptsCreditCards: True', u""BusinessParking: {'garage': False, 'street': False, 'validated': False, 'lot': False, 'valet': False}"", u'RestaurantsPriceRange2: 1']",business,"[u'Food', u'Bakeries']"
Pittsburgh,145,Oakland,Conflict Kitchen,jDqaz1jKdeWk_fTVCzc1Pg,-79.9527347618,15213.0,"[u'Monday 11:0-19:0', u'Tuesday 11:0-19:0', u'Wednesday 11:0-19:0', u'Thursday 11:0-19:0', u'Friday 11:0-19:0', u'Saturday 11:0-19:0', u'Sunday 11:0-19:0']",PA,1,4.5,221 Schenley Dr,40.44258082,"[u'BikeParking: True', u'BusinessAcceptsCreditCards: True', u""BusinessParking: {'garage': False, 'street': True, 'validated': False, 'lot': False, 'valet': False}"", u'ByAppointmentOnly: False', u""GoodForMeal: {'dessert': False, 'latenight': False, 'lunch': False, 'dinner': False, 'breakfast': False, 'brunch': False}"", u'RestaurantsDelivery: False', u'RestaurantsPriceRange2: 1', u'RestaurantsTakeOut: True', u'WheelchairAccessible: True', u'RestaurantsAttire: casual', u'RestaurantsGoodForGroups: True']",business,"[u'Food', u'Imported Food', u'Restaurants', u'Ethnic Food', u'Specialty Food']"
Pittsburgh,25,South Side,Tusca Restaurant,HYwbHY-srAOoH-t337DpMw,-79.964962,15203.0,"[u'Monday 11:30-21:0', u'Tuesday 11:30-21:0', u'Wednesday 11:30-21:0', u'Thursday 11:30-21:0', u'Friday 11:30-23:0', u'Friday 23:0-2:0', u'Saturday 11:30-23:0', u'Saturday 23:0-2:0', u'Sunday 11:30-21:0']",PA,0,2.0,2773 Sidney St,40.4273611,"[u'Alcohol: full_bar', u'BYOB: False', u'BYOBCorkage: no', u'BusinessAcceptsCreditCards: True', u""BusinessParking: {'garage': True, 'street': True, 'validated': False, 'lot': False, 'valet': False}"", u'GoodForKids: False', u""GoodForMeal: {'dessert': False, 'latenight': False, 'lunch': False, 'dinner': True, 'breakfast': False, 'brunch': False}"", u'HasTV: False', u'OutdoorSeating: True', u'RestaurantsAttire: casual', u'RestaurantsDelivery: False', u'RestaurantsGoodForGroups: True', u'RestaurantsPriceRange2: 3', u'RestaurantsReservations: True', u'RestaurantsTableService: True', u'RestaurantsTakeOut: False', u'WheelchairAccessible: True']",business,"[u'Tapas Bars', u'Restaurants']"


In [23]:
#Check the types of the data
fields = t1.fieldnames()
for f in fields:
    print f,'\t', t1.typecounter(f)

city 	Counter({'unicode': 8091})
review_count 	Counter({'int': 8091})
neighborhood 	Counter({'NoneType': 4086, 'unicode': 4005})
name 	Counter({'unicode': 8091})
business_id 	Counter({'unicode': 8091})
longitude 	Counter({'float': 8091})
postal_code 	Counter({'float': 8070, 'NoneType': 21})
hours 	Counter({'unicode': 5599, 'NoneType': 2492})
state 	Counter({'unicode': 8091})
is_open 	Counter({'int': 8091})
stars 	Counter({'float': 8091})
address 	Counter({'unicode': 7975, 'NoneType': 116})
latitude 	Counter({'float': 8091})
attributes 	Counter({'unicode': 7263, 'NoneType': 828})
type 	Counter({'unicode': 8091})
categories 	Counter({'unicode': 8080, 'NoneType': 11})


### Business Catagories
We wish to reduce the number of categories in Yelp to 12 main categories. In order to be able to get insights from those 12 categories, we will implement them as 12 Binary columns in our Business Dimention.


In [24]:
#Extract csv file- for mapping the categories
csv_filename = "category.csv"
map_categories_table = etl.fromcsv(csv_filename)
map_categories_table.display(5)

Text,Category,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11
Plumbing,10,,,,,,,,,,
American (New),1,,,,,,,,,,
Hotels & Travel,7,,,,,,,,,,
Shopping,3,,,,,,,,,,
Food,1,,,,,,,,,,


In [25]:
#Create dictionary to map all categories to 10 main 
categories_dic = {}
for row in map_categories_table:
    categories_dic[row[0]] = row[1]

### Transform

In [26]:
#Choose only businesses from Pittsburgh
t2 = etl.select(t1, "{city} =='Pittsburgh'") 

In [27]:
#Order the fields (and display)
t3 = t2.cut(['business_id','name','is_open','neighborhood','latitude','longitude','stars','review_count','categories'])
t3.display(1)

business_id,name,is_open,neighborhood,latitude,longitude,stars,review_count,categories
P3LisOj7DktgGa7C5FYpnA,Benjamin Franklin Plumbing,1,West View,40.5341627,-80.0498873,4.0,8,"[u'Plumbing', u'Home Services']"


#### order and split the categories

In [28]:
#Create the function for the convertion that reduces the categories.
def reduce_categories(text):
    if text[0]=='[' and text[-1]==']':
        items =  text[1:-1].split(', ') # turn values to list
    else:
        return None
    output = {}
    for i in items:
        key = categories_dic.get(i[2:-1])
        value = 1
        output[key]=int(value)
    return output

#Test
sample = t3[1][8]
print reduce_categories(sample)
print type(reduce_categories(sample))

{u'10': 1}
<type 'dict'>


In [29]:
# Convert the data to reduce the categories
t4 = t3.convert('categories',reduce_categories)
t4.typecounter('categories')
#unpack the dictionary in category column to 12 seperate columns
t5 = t4.unpackdict('categories')
t5.display(1)

business_id,name,is_open,neighborhood,latitude,longitude,stars,review_count,None,1,10,11,12,2,3,4,5,6,7,8,9
P3LisOj7DktgGa7C5FYpnA,Benjamin Franklin Plumbing,1,West View,40.5341627,-80.0498873,4.0,8,,,1,,,,,,,,,,


In [30]:
# Order fileds and change category fields name from number to the real name
t6 = etl.rename(t5, {'1': 'food','2': 'art & enteraitment','3': 'stores','4': 'beauty & spa','5': 'health','6': 'finance','7': 'turists','8': 'cars & transportation', '9': 'bars & alcohol','10': 'other','11': 'fashion','12': 'real estate'})
t7 = t6.cut(['business_id','name','is_open','neighborhood','latitude','longitude','stars','review_count', 'food', 'art & enteraitment', 'stores', 'beauty & spa', 'health', 'finance', 'turists', 'cars & transportation', 'bars & alcohol', 'other', 'fashion', 'real estate'])
t7.display(10)

business_id,name,is_open,neighborhood,latitude,longitude,stars,review_count,food,art & enteraitment,stores,beauty & spa,health,finance,turists,cars & transportation,bars & alcohol,other,fashion,real estate
P3LisOj7DktgGa7C5FYpnA,Benjamin Franklin Plumbing,1,West View,40.5341627,-80.0498873,4.0,8,,,,,,,,,,1.0,,
93otbGHE0s0m-lU1osvg9w,Rivertowne,1,North Side,40.4459861,-80.0108802,3.0,102,1.0,1.0,,,,,,,1.0,,,
csdZolWIWvkIHIqsCmV6sg,City Vista,1,Greentree,40.4263863165,-80.037872133,2.0,5,,1.0,,,,,1.0,,,1.0,,1.0
h7ZW6c1Vi9V6_-gd2Ch_4g,Verizon Wireless Premium Retailer,1,Bloomfield,40.453725,-79.946995,2.0,5,,,1.0,,,,,,,,,
HWrbZS1mxVRj2Y2VwMmDMg,Dulcinea Bakeshop,0,Strip District,40.4553505818,-79.9777282029,4.5,3,1.0,,,,,,,,,,,
jDqaz1jKdeWk_fTVCzc1Pg,Conflict Kitchen,1,Oakland,40.44258082,-79.9527347618,4.5,145,1.0,,,,,,,,,,,
HYwbHY-srAOoH-t337DpMw,Tusca Restaurant,0,South Side,40.4273611,-79.964962,2.0,25,1.0,,,,,,,,,,,
sDl0SxE_AoOAnPCAxfoauA,Southside Laserwash,1,South Side,40.4286206,-79.9926092,2.0,4,,,,,,,,1.0,,,,
ZElIifFslL_948I-hI_ukw,Zovko Bros,1,South Side,40.427648,-79.970147,3.0,6,,,,,,,,1.0,,,,
ZYof0WYlilJ3pgWNSgmYKQ,Pizza Milano,1,The Hill District,40.438453,-79.987978,3.5,56,1.0,,,,,,,,,,,


In [31]:
# change None to 0 in the categories fields
def None_to_0(text):
    if text == None:
        return 0
    else:
        return text
t8 = etl.convert(t7, ('food', 'art & enteraitment', 'stores', 'beauty & spa', 'health', 'finance', 'turists', 'cars & transportation', 'bars & alcohol', 'other', 'fashion', 'real estate'), None_to_0)
t8.display(10)

business_id,name,is_open,neighborhood,latitude,longitude,stars,review_count,food,art & enteraitment,stores,beauty & spa,health,finance,turists,cars & transportation,bars & alcohol,other,fashion,real estate
P3LisOj7DktgGa7C5FYpnA,Benjamin Franklin Plumbing,1,West View,40.5341627,-80.0498873,4.0,8,0,0,0,0,0,0,0,0,0,1,0,0
93otbGHE0s0m-lU1osvg9w,Rivertowne,1,North Side,40.4459861,-80.0108802,3.0,102,1,1,0,0,0,0,0,0,1,0,0,0
csdZolWIWvkIHIqsCmV6sg,City Vista,1,Greentree,40.4263863165,-80.037872133,2.0,5,0,1,0,0,0,0,1,0,0,1,0,1
h7ZW6c1Vi9V6_-gd2Ch_4g,Verizon Wireless Premium Retailer,1,Bloomfield,40.453725,-79.946995,2.0,5,0,0,1,0,0,0,0,0,0,0,0,0
HWrbZS1mxVRj2Y2VwMmDMg,Dulcinea Bakeshop,0,Strip District,40.4553505818,-79.9777282029,4.5,3,1,0,0,0,0,0,0,0,0,0,0,0
jDqaz1jKdeWk_fTVCzc1Pg,Conflict Kitchen,1,Oakland,40.44258082,-79.9527347618,4.5,145,1,0,0,0,0,0,0,0,0,0,0,0
HYwbHY-srAOoH-t337DpMw,Tusca Restaurant,0,South Side,40.4273611,-79.964962,2.0,25,1,0,0,0,0,0,0,0,0,0,0,0
sDl0SxE_AoOAnPCAxfoauA,Southside Laserwash,1,South Side,40.4286206,-79.9926092,2.0,4,0,0,0,0,0,0,0,1,0,0,0,0
ZElIifFslL_948I-hI_ukw,Zovko Bros,1,South Side,40.427648,-79.970147,3.0,6,0,0,0,0,0,0,0,1,0,0,0,0
ZYof0WYlilJ3pgWNSgmYKQ,Pizza Milano,1,The Hill District,40.438453,-79.987978,3.5,56,1,0,0,0,0,0,0,0,0,0,0,0


In [32]:
# add ID
t9 = t8.addrownumbers()
t10 = t9.rename({'row':'business_id', 'business_id':'Yelp_business_id', 'name':'business_name'})
t10.display()

business_id,Yelp_business_id,business_name,is_open,neighborhood,latitude,longitude,stars,review_count,food,art & enteraitment,stores,beauty & spa,health,finance,turists,cars & transportation,bars & alcohol,other,fashion,real estate
1,P3LisOj7DktgGa7C5FYpnA,Benjamin Franklin Plumbing,1,West View,40.5341627,-80.0498873,4.0,8,0,0,0,0,0,0,0,0,0,1,0,0
2,93otbGHE0s0m-lU1osvg9w,Rivertowne,1,North Side,40.4459861,-80.0108802,3.0,102,1,1,0,0,0,0,0,0,1,0,0,0
3,csdZolWIWvkIHIqsCmV6sg,City Vista,1,Greentree,40.4263863165,-80.037872133,2.0,5,0,1,0,0,0,0,1,0,0,1,0,1
4,h7ZW6c1Vi9V6_-gd2Ch_4g,Verizon Wireless Premium Retailer,1,Bloomfield,40.453725,-79.946995,2.0,5,0,0,1,0,0,0,0,0,0,0,0,0
5,HWrbZS1mxVRj2Y2VwMmDMg,Dulcinea Bakeshop,0,Strip District,40.4553505818,-79.9777282029,4.5,3,1,0,0,0,0,0,0,0,0,0,0,0


### Load

In [33]:
fields = t10.fieldnames()
for f in fields:
    print f,'\t', t10.typecounter(f)

business_id 	Counter({'int': 5275})
Yelp_business_id 	Counter({'unicode': 5275})
business_name 	Counter({'unicode': 5275})
is_open 	Counter({'int': 5275})
neighborhood 	Counter({'unicode': 3646, 'NoneType': 1629})
latitude 	Counter({'float': 5275})
longitude 	Counter({'float': 5275})
stars 	Counter({'float': 5275})
review_count 	Counter({'int': 5275})
food 	Counter({'int': 5275})
art & enteraitment 	Counter({'int': 5275})
stores 	Counter({'int': 5275})
beauty & spa 	Counter({'int': 5275})
health 	Counter({'int': 5275})
finance 	Counter({'int': 5275})
turists 	Counter({'int': 5275})
cars & transportation 	Counter({'int': 5275})
bars & alcohol 	Counter({'int': 5275})
other 	Counter({'int': 5275})
fashion 	Counter({'int': 5275})
real estate 	Counter({'int': 5275})


In [34]:
# setting a new cursor
cur = con.cursor()

In [35]:
cur.execute('USE yelp_Pittsburgh')
cur.execute('SET SQL_MODE=ANSI_QUOTES')
# append the data to existing tables
t10.appenddb(cur,'dim_business',schema='yelp_pittsburgh', commit=True)

  r = r + self.execute(query, a)
  r = r + self.execute(query, a)


In [36]:
#Closing the Cursor - Do we need to close and open it every time?
cur.close()

## Date

In [37]:
# setting a cursor
cur = con.cursor()     # get the cursor

#### append data to existing tables

In [43]:
cur.execute("SET @currdate := '2005-01-01';")
cur.execute("SET @enddate := '2018-01-01';")
cur.execute("DROP PROCEDURE IF EXISTS BuildDate")
 
cur.execute(""" CREATE PROCEDURE BuildDate()
         BEGIN
        WHILE @currdate < @enddate DO
            INSERT INTO dim_dates  (date, day, month, year, day_name)
            VALUES (  @currdate,  DAY(@currdate),  MONTH(@currdate), 
              YEAR(@currdate), DAYNAME(@currdate));
            SET @currdate := DATE_ADD(@currdate, INTERVAL 1 DAY);
END WHILE;
    END  """)
 
cur.execute('CALL BuildDate();')
con.commit()


  app.launch_new_instance()


### Add Holidays

In [44]:
# New Year
cur.execute("""
update dim_dates 
set holiday = 'New Year''s Day'
where day = 01 and month = 01;
""")

# Halloween
cur.execute("""
update dim_dates 
set holiday = 'Halloween'
where day = 31 and month = 10;
""")

# Independence Day
cur.execute("""
update dim_dates 
set holiday = 'Independence Day, U.S'
where day = 04 and month = 07;
""")

# Memorial Day
cur.execute("""
update dim_dates 
set holiday = 'Memorial Day'
where date = '2005-05-30' or date = '2006-05-29' or date = '2007-05-28' or date = '2008-05-26' or date = '2009-05-25'
or date = '2010-05-31' or date = '2011-05-30' or date = '2012-05-28' or date = '2013-05-27' or date = '2014-05-26'
or date = '2015-05-25' or date = '2016-05-30' or date = '2017-05-29';
""")

# Thanksgiving
cur.execute("""
update dim_dates 
set holiday = 'Thanksgiving Day'
where date = '2005-11-23' or date = '2006-11-22' or date = '2007-11-28' or date = '2008-11-27' or date = '2009-11-25'
or date = '2010-11-24' or date = '2011-11-23' or date = '2012-11-22' or date = '2013-11-27' or date = '2014-11-26'
or date = '2015-11-25' or date = '2016-11-24' or date = '2017-11-23';
""")

# Christmas Day
cur.execute("""
update dim_dates 
set holiday = 'Christmas Day'
where day = 25 and month = 12;
""")

# Christmas Eve
cur.execute("""
update dim_dates 
set holiday = 'Christmas Eve'
where day = 24 and month = 12;
""")

# Black Friday
cur.execute("""
update dim_dates 
set holiday = 'Black Friday'
where day_name = 'Friday' and month = 11 and day between 21 and 29;
""")


16L

In [45]:
con.commit()
cur.close()

## Reviews

### Extract

In [47]:
filename= 'yelp_academic_dataset_review_drop_PA.json'

In [48]:
t1 = etl.fromjson(filename)

fields = t1.fieldnames()

for f in fields:
    print f,'\t', t1.typecounter(f)

funny 	Counter({'int': 179774})
user_id 	Counter({'unicode': 179774})
review_id 	Counter({'unicode': 179774})
business_id 	Counter({'unicode': 179774})
stars 	Counter({'int': 179774})
date 	Counter({'unicode': 179774})
useful 	Counter({'int': 179774})
type 	Counter({'unicode': 179774})
cool 	Counter({'int': 179774})


Due to the current types of data, we need to do the following:
 1. Add ascending numbers to the rows and call it "review_id" (this will be used for the auto-increamented review_ID field).
 2. Aggragate the columns `funny`, `cool` and `useful` to the "votes_aggregate" column.
 3. Change the user_id, business_id, date_id to their Foreign keys (in the DB). This will be done during the Load.


In [49]:
t2 = t1.addrownumbers()
t2.display(10)

row,funny,user_id,review_id,business_id,stars,date,useful,type,cool
1,0,hzw-qTUVpmLAKjdkoUNh8A,Awq_6cyNjK1-qPZAwnXjjQ,7p6tHUA1Pknh0DVWqz86lA,1,2016-08-27,0,review,0
2,0,mldKxVI59o3LhK3ITG6mnA,96YkAuJzlT54qZZWNebFUg,7p6tHUA1Pknh0DVWqz86lA,5,2015-06-15,0,review,0
3,0,SaedHW9i7k4lHR8tgwtMgQ,OfZRG7RgKA118zDtj6yo-g,7p6tHUA1Pknh0DVWqz86lA,5,2014-01-16,0,review,0
4,0,87CKG39VfXYRupM3VRfReg,YEgNOZDCeLuQimfNVYC2AA,7p6tHUA1Pknh0DVWqz86lA,1,2013-10-09,2,review,0
5,0,kESRYcaODjB6s9p1-alBTw,tEBhqaLMYBvxhkdxr1Pr7g,7p6tHUA1Pknh0DVWqz86lA,5,2015-02-25,1,review,1
6,0,07XUD-8jPwtpgMGYT_veJw,HvlhVHFsjqr1SXiJLqPtNA,7p6tHUA1Pknh0DVWqz86lA,1,2013-01-03,9,review,0
7,0,-9YbyjrujLmFVORF3PA0YQ,S-JbWPzCCxgP288Ao-_qYQ,95s7ZRceq-mYD-a7DZntnQ,1,2015-08-04,0,review,0
8,0,V6IBbUN_bBfzd5s8LlLTlw,L2eNJp3tYOoM9Kr0ouDztg,95s7ZRceq-mYD-a7DZntnQ,1,2014-12-08,1,review,0
9,1,-UPUYET3Pwm99zq-uHl7gg,OPBHCs-lcEVWbEZY7rKNUg,0Wy4gw8krao9nGq-sHWFSQ,3,2013-12-01,1,review,1
10,0,WLrFY6_Z32lb_ifcIvRvkw,SjgayD75MU7yTWEs_cGbwg,0Wy4gw8krao9nGq-sHWFSQ,3,2013-05-06,0,review,0


In [50]:
t3 = etl.rename(t2, 'review_id', 'yelp_review_id')
t4 = etl.rename(t3, 'row', 'review_id')
t5 = etl.rename(t4, 'date','date_id')

In [51]:
def votes_aggregate(row):
    return row['funny']+row['cool']+row['useful']

t6=t5.addfield('votes_aggregate', votes_aggregate)
t6.display(10)

review_id,funny,user_id,yelp_review_id,business_id,stars,date_id,useful,type,cool,votes_aggregate
1,0,hzw-qTUVpmLAKjdkoUNh8A,Awq_6cyNjK1-qPZAwnXjjQ,7p6tHUA1Pknh0DVWqz86lA,1,2016-08-27,0,review,0,0
2,0,mldKxVI59o3LhK3ITG6mnA,96YkAuJzlT54qZZWNebFUg,7p6tHUA1Pknh0DVWqz86lA,5,2015-06-15,0,review,0,0
3,0,SaedHW9i7k4lHR8tgwtMgQ,OfZRG7RgKA118zDtj6yo-g,7p6tHUA1Pknh0DVWqz86lA,5,2014-01-16,0,review,0,0
4,0,87CKG39VfXYRupM3VRfReg,YEgNOZDCeLuQimfNVYC2AA,7p6tHUA1Pknh0DVWqz86lA,1,2013-10-09,2,review,0,2
5,0,kESRYcaODjB6s9p1-alBTw,tEBhqaLMYBvxhkdxr1Pr7g,7p6tHUA1Pknh0DVWqz86lA,5,2015-02-25,1,review,1,2
6,0,07XUD-8jPwtpgMGYT_veJw,HvlhVHFsjqr1SXiJLqPtNA,7p6tHUA1Pknh0DVWqz86lA,1,2013-01-03,9,review,0,9
7,0,-9YbyjrujLmFVORF3PA0YQ,S-JbWPzCCxgP288Ao-_qYQ,95s7ZRceq-mYD-a7DZntnQ,1,2015-08-04,0,review,0,0
8,0,V6IBbUN_bBfzd5s8LlLTlw,L2eNJp3tYOoM9Kr0ouDztg,95s7ZRceq-mYD-a7DZntnQ,1,2014-12-08,1,review,0,1
9,1,-UPUYET3Pwm99zq-uHl7gg,OPBHCs-lcEVWbEZY7rKNUg,0Wy4gw8krao9nGq-sHWFSQ,3,2013-12-01,1,review,1,3
10,0,WLrFY6_Z32lb_ifcIvRvkw,SjgayD75MU7yTWEs_cGbwg,0Wy4gw8krao9nGq-sHWFSQ,3,2013-05-06,0,review,0,0


In [52]:
t7 = t6.cut(['yelp_review_id', 'user_id', 'business_id', 'date_id', 'stars', 'votes_aggregate' ])

In [53]:
t7.display(10)

yelp_review_id,user_id,business_id,date_id,stars,votes_aggregate
Awq_6cyNjK1-qPZAwnXjjQ,hzw-qTUVpmLAKjdkoUNh8A,7p6tHUA1Pknh0DVWqz86lA,2016-08-27,1,0
96YkAuJzlT54qZZWNebFUg,mldKxVI59o3LhK3ITG6mnA,7p6tHUA1Pknh0DVWqz86lA,2015-06-15,5,0
OfZRG7RgKA118zDtj6yo-g,SaedHW9i7k4lHR8tgwtMgQ,7p6tHUA1Pknh0DVWqz86lA,2014-01-16,5,0
YEgNOZDCeLuQimfNVYC2AA,87CKG39VfXYRupM3VRfReg,7p6tHUA1Pknh0DVWqz86lA,2013-10-09,1,2
tEBhqaLMYBvxhkdxr1Pr7g,kESRYcaODjB6s9p1-alBTw,7p6tHUA1Pknh0DVWqz86lA,2015-02-25,5,2
HvlhVHFsjqr1SXiJLqPtNA,07XUD-8jPwtpgMGYT_veJw,7p6tHUA1Pknh0DVWqz86lA,2013-01-03,1,9
S-JbWPzCCxgP288Ao-_qYQ,-9YbyjrujLmFVORF3PA0YQ,95s7ZRceq-mYD-a7DZntnQ,2015-08-04,1,0
L2eNJp3tYOoM9Kr0ouDztg,V6IBbUN_bBfzd5s8LlLTlw,95s7ZRceq-mYD-a7DZntnQ,2014-12-08,1,1
OPBHCs-lcEVWbEZY7rKNUg,-UPUYET3Pwm99zq-uHl7gg,0Wy4gw8krao9nGq-sHWFSQ,2013-12-01,3,3
SjgayD75MU7yTWEs_cGbwg,WLrFY6_Z32lb_ifcIvRvkw,0Wy4gw8krao9nGq-sHWFSQ,2013-05-06,3,0


In [77]:
# setting a cursor
cur = con.cursor()

cur.execute('USE yelp_Pittsburgh')
cur.execute('SET SQL_MODE=ANSI_QUOTES')

# append the data to existing tables - loading by "chunks"
start = 0
end = 20000
row_num = etl.nrows(t7)
while end < row_num:
    t7.rowslice(start, end).appenddb(cur,'fact_reviews',schema='yelp_pittsburgh', commit=True)
    start = end
    end += 20000

    #Add the last chunk of data
t7.rowslice(start, row_num).appenddb(cur,'fact_reviews',schema='yelp_pittsburgh', commit=True)

creating the foreign key by 'join' action with the other dims

In [None]:
cur.execute('''UPDATE fact_reviews, dim_dates
SET fact_reviews.date_id = dim_dates.date_id 
WHERE DATE(fact_reviews.date_id) = dim_dates.date;''')

In [57]:
cur.execute('''UPDATE fact_reviews, dim_users
SET fact_reviews.user_id = dim_users.user_id 
WHERE fact_reviews.user_id = dim_users.yelp_user_id;''')

0L

In [58]:
cur.execute('''UPDATE fact_reviews, dim_business
SET fact_reviews.business_id = dim_business.business_id 
WHERE fact_reviews.business_id = dim_business.Yelp_business_id;''')

0L

In [60]:
con.commit()
cur.close()