# 1. Creating the Schema in the Local Host

### Create MYSQL connector

In [None]:
import MySQLdb as mdb

In [None]:
# Open the connection to the local MySQL server.
# Two root passwords are tried in turn: 'root' first, then '1234'.
try:
    con = mdb.connect(
        host='127.0.0.1', user='root', passwd='root',
        use_unicode=True, charset="utf8")
    print("The password is `root`")
except mdb.Error:
    # Only database errors trigger the fallback. Anything else (a typo in this
    # cell, a keyboard interrupt) now surfaces instead of being masked.
    print("The password is not `root`. trying `1234`")
    con = mdb.connect(
        host='127.0.0.1', user='root', passwd='1234',
        use_unicode=True, charset="utf8")

In [None]:
# Create a cursor on the open connection; the DDL cells below execute through it.
cursor = con.cursor()     # get the cursor

### DDL - Create Schema

In [None]:
# Rebuild the schema from scratch: an existing Yelp_Pittsburgh schema (and its data) is dropped first.
cursor.execute("DROP SCHEMA IF EXISTS Yelp_Pittsburgh ;")
cursor.execute("CREATE SCHEMA IF NOT EXISTS Yelp_Pittsburgh ;")
# Make the new schema the default for the CREATE TABLE statements that follow.
cursor.execute("USE Yelp_Pittsburgh ;")
cursor.execute('SET SQL_MODE=ANSI_QUOTES') #important for petl operation - use this kind of quotes

### Create Tables

In [None]:
# dim_users table creation.
# `user_id` is the surrogate key generated for the warehouse; `yelp_user_id`
# keeps the identifier from the source file so reviews can be matched to users.
# Yelp identifiers are 22 characters long, so the column is VARCHAR(22);
# VARCHAR(20) would lose the last two characters of every id.
cursor.execute("""
    CREATE TABLE IF NOT EXISTS dim_users (
        `user_id` INT(11) NOT NULL AUTO_INCREMENT,
        `yelp_user_id` VARCHAR(22) NOT NULL,
        `review_count` INT(11),
        `fans` INT(11),
        `is_elite` BOOLEAN NOT NULL,
        `yelping_since` DATE,
        `friends_count` INT(11) NOT NULL,
        PRIMARY KEY (`user_id`));
        """)

In [None]:
# dim_business table creation.
# `business_id` is the surrogate key; `Yelp_business_id` keeps the source id.
# Column widths: Yelp ids are 22 characters, and business / neighbourhood
# names are routinely longer than 20 characters, so VARCHAR(20) would truncate them.
# The twelve BOOLEAN columns are the reduced business categories. Their names
# (including spelling) must match the petl field names produced in the ETL
# section, because petl inserts by column name.
cursor.execute("""
    CREATE TABLE IF NOT EXISTS dim_business (
        `business_id` INT(11) NOT NULL AUTO_INCREMENT,
        `Yelp_business_id` VARCHAR(22) NOT NULL,
        `business_name` VARCHAR(255) NOT NULL,
        `is_open` BOOLEAN NOT NULL,
        `neighborhood` VARCHAR(100),
        `latitude` FLOAT(11),
        `longitude` FLOAT(11),
        `stars` FLOAT(11),
        `review_count` INT(11) NOT NULL,
        `food` BOOLEAN NOT NULL,
        `art & enteraitment` BOOLEAN NOT NULL,
        `stores` BOOLEAN NOT NULL,
        `beauty & spa` BOOLEAN NOT NULL,
        `health` BOOLEAN NOT NULL,
        `finance` BOOLEAN NOT NULL,
        `turists` BOOLEAN NOT NULL,
        `cars & transportation` BOOLEAN NOT NULL,
        `bars & alcohol` BOOLEAN NOT NULL,
        `other` BOOLEAN NOT NULL,
        `fashion` BOOLEAN NOT NULL,
        `real estate` BOOLEAN NOT NULL,
        PRIMARY KEY (`business_id`));
        """)

In [None]:
# dim_dates table creation.
# One row per calendar day, keyed by the surrogate `date_id`.
# `holiday` is nullable: the rows are inserted without it and it is filled in
# afterwards by the UPDATE statements of the "Add Holidays" section.
cursor.execute("""
    CREATE TABLE IF NOT EXISTS dim_dates (       
        `date_id` INT(11) NOT NULL AUTO_INCREMENT,
        `date` DATE NOT NULL,
        `day` INT(11) NOT NULL,
        `month` INT(11) NOT NULL,
        `year` INT(11),
        `day_name` VARCHAR(45) NOT NULL,
        `holiday` VARCHAR(45),
        PRIMARY KEY (`date_id`));
        """)

In [None]:
# fact_reviews table creation.
# `review_id` is the surrogate key; `yelp_review_id` keeps the source id.
# Yelp identifiers are 22 characters long, hence VARCHAR(22) rather than (20).
# NOTE(review): `user_id`, `business_id` and `date_id` are INT columns, yet the
# Reviews load appends the source-file values and only later rewrites them to
# the dimension keys. Confirm MySQL stores those source values losslessly in an
# INT column; otherwise a staging column of matching type is needed.
cursor.execute("""
    CREATE TABLE IF NOT EXISTS fact_reviews (
        `review_id` INT(11) NOT NULL AUTO_INCREMENT,
        `yelp_review_id` VARCHAR(22) NOT NULL,
        `user_id` INT(11) NOT NULL,
        `business_id` INT(11) NOT NULL,
        `date_id` INT(11) NOT NULL,
        `stars` FLOAT(11),
        `votes_aggregate` INT(11),
        PRIMARY KEY (`review_id`));
        """)

In [None]:
# Commit the work done on this connection before it is closed.
con.commit()

### Closing the connection

In [None]:
# Release the cursor first, then the connection it belongs to.
# The ETL section below opens a fresh connection of its own.
cursor.close()
con.close()

# 2. ETL

In [None]:
import petl as etl
import datetime

## Users

### Extract

In [None]:
# Extract: read the users JSON file into a petl table and preview it (limit 10)
filename= 'yelp_academic_dataset_users_nofriendlist_PA.json'
t1 = etl.fromjson(filename)
t1.display(10)

In [None]:
'''creating the users dimension:
user_id= varchar(20)
friends_count= INT
review_count= INT
fans= INT
is_elite= Binary
yelping_since= Date'''

# Keep only the source columns needed for dim_users.
# friends_count is not in this file; it is joined in from a second file further down.
t2 = t1.cut(['user_id','review_count','fans','elite','yelping_since'])
t2.display(10)

#### Transform

In [None]:
# Print, for every column, the count of each value type found in it
fields = t2.fieldnames()
for f in fields:
    print f,'\t', t2.typecounter(f)

In [None]:
# Convert the textual `elite` value to a 0/1 flag (creates the is_elite field)
def to_binary(text):
    """Return 1 if the user's `elite` text lists any elite year, else 0.

    The value is expected to be the text form of a list such as
    "[u'None']" (never elite) or "[2012, 2013]".

    Parameters
    ----------
    text : str or None
        Textual list taken from the `elite` column.

    Returns
    -------
    int
        0 when the value is missing, empty, an empty list ("[]") or the
        "None" marker; 1 otherwise.
    """
    # A missing value or an empty list carries no elite year at all.
    if not text or text == '[]':
        return 0
    # Characters 3..6 hold the first item's payload in "[u'None']".
    if text[3:7] == 'None':
        return 0
    return 1

# Replace every value of the 'elite' column with its 0/1 flag and preview the result
t3 = t2.convert('elite' , to_binary)
t3.display(10)

In [None]:
# Build the friends_count column by joining the users with a second file
# that carries each user's friend list.

source = 'Pittsburgh_full_friend_text.json'
t4 = etl.fromjson(source)
t5 = t3.join(t4, # right table
                   lkey='user_id',rkey='user_id', #join equality columns
                   rprefix='t4_') # prefix for the right table's columns: 'friends' becomes 't4_friends'

def friend_count(text):
    """Count the friends listed in the textual friend list of a user.

    Parameters
    ----------
    text : str or None
        Text form of a list, e.g. "[u'id1', u'id2']". "[u'None']" marks a
        user without friends.

    Returns
    -------
    int or None
        Number of listed friends; 0 for "[]" and for the "None" marker;
        None when the value is missing or is not a bracketed list.
    """
    if not text or text[0] != '[' or text[-1] != ']':
        return None
    inner = text[1:-1].strip()
    # ''.split(', ') yields [''], which would count an empty list as one friend.
    if not inner or text[3:7] == 'None':
        return 0
    return len(inner.split(', '))

# Turn the joined friend-list text into a number of friends and preview the result
t6 = t5.convert('t4_friends' , friend_count)
t6.display(10)

In [None]:
# Add a running row number to serve as the new user ID, then rename the
# columns to the dim_users column names (the source id becomes yelp_user_id).

t7 = t6.addrownumbers()
t8 = t7.rename({'row':'user_id','user_id':'yelp_user_id','elite':'is_elite','t4_friends':'friends_count'})
t8.display(10)

In [None]:
# Validate the field types of the final users table before loading it
fields = t8.fieldnames()
for f in fields:
    print f,'\t', t8.typecounter(f)

#### Load

In [None]:
import MySQLdb as mdb

# Open the connection used by all the load steps of the ETL section.
# Two root passwords are tried in turn: 'root' first, then '1234'.
try:
    con = mdb.connect(
        host='127.0.0.1', user='root', passwd='root',
        use_unicode=True, charset="utf8")
except mdb.Error:
    # Fall back only on database errors; other failures are not masked.
    con = mdb.connect(
        host='127.0.0.1', user='root', passwd='1234',
        use_unicode=True, charset="utf8")
# setting a cursor
cur = con.cursor()     # get the cursor

In [None]:
# Append the users to the existing dim_users table.
# ANSI_QUOTES is required because petl double-quotes the identifiers it generates.
cur.execute('SET SQL_MODE=ANSI_QUOTES')
# The schema name is spelled exactly as it was created (Yelp_Pittsburgh):
# schema names are case-sensitive on MySQL servers running on case-sensitive file systems.
t8.appenddb(cur,'dim_users',schema='Yelp_Pittsburgh',commit=True)

In [None]:
# Close the cursor only; the connection `con` stays open and is reused by the following sections.
cur.close()

## Business

### Extract

In [None]:
# Open and display the business JSON file.
filename= 'yelp_academic_dataset_business_PA.json'
t1 = etl.fromjson(filename)
t1.display(10)

In [None]:
# Check the value types found in each column of the business data
fields = t1.fieldnames()
for f in fields:
    print f,'\t', t1.typecounter(f)

### Business Categories
We wish to reduce the number of Yelp categories to 12 main categories. To be able to get insights from those 12 categories, we implement them as 12 binary columns in our Business dimension.


In [None]:
# Extract the CSV file used for mapping the Yelp categories to the main categories
csv_filename = "category.csv"
map_categories_table = etl.fromcsv(csv_filename)
map_categories_table.display(5)

In [None]:
# Build the lookup from each Yelp category (first CSV column) to its main
# category (second CSV column); there are 12 main categories.
# Iterating a petl table also yields its header row, so the header pair ends
# up in the dictionary as well.
categories_dic = {entry[0]: entry[1] for entry in map_categories_table}

### Transform

In [None]:
# Keep only the rows whose `city` field equals 'Pittsburgh'
t2 = etl.select(t1, "{city} =='Pittsburgh'") 

In [None]:
# Select the fields needed for dim_business, in the target order, and display one row.
# 'categories' is the 9th field (index 8), which the category test below relies on.
t3 = t2.cut(['business_id','name','is_open','neighborhood','latitude','longitude','stars','review_count','categories'])
t3.display(1)

#### order and split the categories

In [None]:
#Create the function for the convertion that reduces the categories.
def reduce_categories(text, mapping=None):
    """Map the textual category list of a business to its main categories.

    Parameters
    ----------
    text : str or None
        Text form of a list of unicode literals, e.g. "[u'Food', u'Bars']".
    mapping : dict, optional
        Lookup from Yelp category name to main-category key. Defaults to the
        notebook-level `categories_dic`.

    Returns
    -------
    dict or None
        {main_category_key: 1} for every main category the business belongs
        to; None when the value is missing or is not a bracketed list.
        Categories absent from the mapping are ignored.
    """
    if mapping is None:
        mapping = categories_dic
    if not text or text[0] != '[' or text[-1] != ']':
        return None
    output = {}
    for item in text[1:-1].split(', '):
        # item looks like u'Name': drop the leading u' and the closing quote.
        key = mapping.get(item[2:-1])
        # An unmapped category used to create a None key, which later
        # surfaced as a stray column when the dictionary was unpacked.
        if key is not None:
            output[key] = 1
    return output

# Quick manual check: take the 'categories' value (field index 8) of row index 1
# of t3 and print the reduced result together with its type
sample = t3[1][8]
print reduce_categories(sample)
print type(reduce_categories(sample))

In [None]:
# Convert the 'categories' column to the reduced-category dictionaries
t4 = t3.convert('categories',reduce_categories)
t4.typecounter('categories')
# Unpack the dictionary in the 'categories' column into separate columns, one per dictionary key
t5 = t4.unpackdict('categories')
t5.display(1)

In [None]:
# Rename the category fields from their number to their real name, then order the fields.
# The names (including their spelling) must stay identical to the dim_business column names.
t6 = etl.rename(t5, {'1': 'food','2': 'art & enteraitment','3': 'stores','4': 'beauty & spa','5': 'health','6': 'finance','7': 'turists','8': 'cars & transportation', '9': 'bars & alcohol','10': 'other','11': 'fashion','12': 'real estate'})
t7 = t6.cut(['business_id','name','is_open','neighborhood','latitude','longitude','stars','review_count', 'food', 'art & enteraitment', 'stores', 'beauty & spa', 'health', 'finance', 'turists', 'cars & transportation', 'bars & alcohol', 'other', 'fashion', 'real estate'])
t7.display(10)

In [None]:
# change None to 0 in the categories fields
def None_to_0(text):
    """Return 0 for a missing (None) value and the value itself otherwise.

    Used on the category flag columns, where None means the business does
    not belong to that category.
    """
    # Identity test: only the None singleton is replaced; 0 and '' pass through.
    if text is None:
        return 0
    return text
# Apply None_to_0 to all twelve category columns so each holds 0 or 1 (the DB columns are NOT NULL)
t8 = etl.convert(t7, ('food', 'art & enteraitment', 'stores', 'beauty & spa', 'health', 'finance', 'turists', 'cars & transportation', 'bars & alcohol', 'other', 'fashion', 'real estate'), None_to_0)
t8.display(10)

In [None]:
# Add a running row number as the new business ID and rename the columns to
# the dim_business column names (the source id becomes Yelp_business_id)
t9 = t8.addrownumbers()
t10 = t9.rename({'row':'business_id', 'business_id':'Yelp_business_id', 'name':'business_name'})
t10.display()

### Load

In [None]:
# Validate the field types of the final business table before loading it
fields = t10.fieldnames()
for f in fields:
    print f,'\t', t10.typecounter(f)

In [None]:
# Set a new cursor on the connection `con` opened for the users load
cur = con.cursor()

In [None]:
# The schema name is spelled exactly as it was created (Yelp_Pittsburgh):
# schema names are case-sensitive on MySQL servers running on case-sensitive file systems.
cur.execute('USE Yelp_Pittsburgh')
# ANSI_QUOTES is required because petl double-quotes the identifiers it generates.
cur.execute('SET SQL_MODE=ANSI_QUOTES')
# append the data to existing tables
t10.appenddb(cur,'dim_business',schema='Yelp_Pittsburgh', commit=True)

In [None]:
# Close the cursor; the connection `con` stays open for the next sections.
# (Open question from the author: does the cursor need to be closed and reopened every time?)
cur.close()

## Date

In [None]:
# Set a cursor for building the date dimension
cur = con.cursor()     # get the cursor

#### append data to existing tables

In [None]:
# Fill dim_dates with one row per day from 2005-01-01 up to (not including)
# 2018-01-01, using a stored procedure that loops over the days.

# The procedure and the INSERT use unqualified names, so select the schema
# explicitly instead of relying on a default left behind by an earlier cell.
cur.execute('USE Yelp_Pittsburgh')

# Session variables read by the procedure: first day and exclusive end day.
cur.execute("SET @currdate := '2005-01-01';")
cur.execute("SET @enddate := '2018-01-01';")
cur.execute("DROP PROCEDURE IF EXISTS BuildDate")

cur.execute("""CREATE PROCEDURE BuildDate()
    BEGIN
        WHILE @currdate < @enddate DO
            INSERT INTO dim_dates (date, day, month, year, day_name)
            VALUES (@currdate, DAY(@currdate), MONTH(@currdate),
                    YEAR(@currdate), DAYNAME(@currdate));
            SET @currdate := DATE_ADD(@currdate, INTERVAL 1 DAY);
        END WHILE;
    END""")

cur.execute('CALL BuildDate();')
con.commit()


### Add Holidays

In [None]:
# Mark the holidays in dim_dates.
# Every holiday is described by a rule on (day, month, day_name) rather than a
# hand-typed list of dates, so it is correct for every year in the table:
#   * Memorial Day     - last Monday of May: the only Monday within May 25-31.
#   * Thanksgiving Day - fourth Thursday of November: the only Thursday within
#                        November 22-28.
#   * Black Friday     - the day after Thanksgiving: the only Friday within
#                        November 23-29 (a 21-29 window can match two Fridays).
# Each tuple is (holiday text as an SQL literal body, WHERE condition).
holiday_rules = [
    ("New Year''s Day", "day = 1 and month = 1"),
    ("Halloween", "day = 31 and month = 10"),
    ("Independence Day, U.S", "day = 4 and month = 7"),
    ("Memorial Day", "day_name = 'Monday' and month = 5 and day between 25 and 31"),
    ("Thanksgiving Day", "day_name = 'Thursday' and month = 11 and day between 22 and 28"),
    ("Christmas Day", "day = 25 and month = 12"),
    ("Christmas Eve", "day = 24 and month = 12"),
    ("Black Friday", "day_name = 'Friday' and month = 11 and day between 23 and 29"),
]

# The values are fixed constants defined just above (no external input),
# so plain string formatting of the statement is safe here.
for holiday_name, condition in holiday_rules:
    cur.execute(
        "update dim_dates set holiday = '%s' where %s;" % (holiday_name, condition))


In [None]:
con.commit()
cur.close()

## Reviews

### Extract

In [None]:
# Source file of the reviews
filename= 'yelp_academic_dataset_review_drop_PA.json'

In [None]:
# Extract the reviews and print the value types found in each column
t1 = etl.fromjson(filename)

fields = t1.fieldnames()

for f in fields:
    print f,'\t', t1.typecounter(f)

Given the current data types, we need to do the following:
 1. Add ascending numbers to the rows and call the new column "review_id" (this will be used for the auto-incremented review_id field).
 2. Aggregate the columns `funny`, `cool` and `useful` into the "votes_aggregate" column.
 3. Change user_id, business_id and date_id to their foreign keys (in the DB). This will be done during the Load.


In [None]:
# Add the running row number that becomes the review_id
t2 = t1.addrownumbers()
t2.display(10)

In [None]:
# Order matters: the source 'review_id' must be moved to 'yelp_review_id'
# before the row number can take over the name 'review_id'.
t3 = etl.rename(t2, 'review_id', 'yelp_review_id')
t4 = etl.rename(t3, 'row', 'review_id')
# The column keeps the source 'date' values under the name 'date_id' at this point.
t5 = etl.rename(t4, 'date','date_id')

In [None]:
def votes_aggregate(row):
    """Return the combined vote total of a review row.

    `row` must support lookup by the field names 'funny', 'cool' and
    'useful'; the three values are added in that order.
    """
    funny_votes = row['funny']
    cool_votes = row['cool']
    useful_votes = row['useful']
    return funny_votes + cool_votes + useful_votes

# Add the 'votes_aggregate' column, computed per row by votes_aggregate()
t6=t5.addfield('votes_aggregate', votes_aggregate)
t6.display(10)

In [None]:
# Keep only the fact_reviews columns; the row-number 'review_id' is left out,
# so the table's AUTO_INCREMENT key is generated by the database
t7 = t6.cut(['yelp_review_id', 'user_id', 'business_id', 'date_id', 'stars', 'votes_aggregate' ])

In [None]:
# Preview the table that will be loaded into fact_reviews
t7.display(10)

In [None]:
# setting a cursor
cur = con.cursor()

# The schema name is spelled exactly as it was created (Yelp_Pittsburgh):
# schema names are case-sensitive on MySQL servers running on case-sensitive file systems.
cur.execute('USE Yelp_Pittsburgh')
# ANSI_QUOTES is required because petl double-quotes the identifiers it generates.
cur.execute('SET SQL_MODE=ANSI_QUOTES')

# Append the data to the existing table in chunks, committing after each
# chunk, so that no single insert has to carry the whole review file.
# NOTE(review): user_id, business_id and date_id still hold the source-file
# values here, while the target columns are INT - confirm they are stored
# losslessly before the key-rewriting UPDATE statements run.
chunk_size = 20000
row_num = etl.nrows(t7)
start = 0
end = chunk_size
while end < row_num:
    t7.rowslice(start, end).appenddb(cur,'fact_reviews',schema='Yelp_Pittsburgh', commit=True)
    start = end
    end += chunk_size

# Add the last (possibly partial) chunk of data
t7.rowslice(start, row_num).appenddb(cur,'fact_reviews',schema='Yelp_Pittsburgh', commit=True)

Create the foreign keys by joining the fact table with each of the dimension tables.

In [None]:
# Replace the loaded date value of every review with the matching dim_dates key.
# NOTE(review): fact_reviews.date_id is an INT column; confirm that DATE() of
# the stored value still equals the original review date, otherwise no row matches.
cur.execute('''UPDATE fact_reviews, dim_dates
SET fact_reviews.date_id = dim_dates.date_id 
WHERE DATE(fact_reviews.date_id) = dim_dates.date;''')

In [None]:
# Replace the loaded user value of every review with the matching dim_users key.
# NOTE(review): this compares an INT column (fact_reviews.user_id) with a
# VARCHAR column (dim_users.yelp_user_id); verify the match is exact for textual ids.
cur.execute('''UPDATE fact_reviews, dim_users
SET fact_reviews.user_id = dim_users.user_id 
WHERE fact_reviews.user_id = dim_users.yelp_user_id;''')

In [None]:
# Replace the loaded business value of every review with the matching dim_business key.
# NOTE(review): this compares an INT column (fact_reviews.business_id) with a
# VARCHAR column (dim_business.Yelp_business_id); verify the match is exact for textual ids.
cur.execute('''UPDATE fact_reviews, dim_business
SET fact_reviews.business_id = dim_business.business_id 
WHERE fact_reviews.business_id = dim_business.Yelp_business_id;''')

In [None]:
# Commit the key updates and release the cursor.
# NOTE(review): `con` itself is not closed in the visible part of the file.
con.commit()
cur.close()