In [1]:
import numpy as np
import pandas as pd
import ast
#import psycopg2

In [2]:
from tqdm import tqdm
tqdm.pandas()

In [3]:
!ls csv_files

yelp_academic_dataset_business.csv  yelp_academic_dataset_tip_transposed.csv
yelp_academic_dataset_review.csv    yelp_academic_dataset_user.csv
yelp_academic_dataset_tip.csv


In [4]:
generic = lambda x: ast.literal_eval(x)
conv = {'friends': generic}

In [5]:
business = pd.read_csv('csv_files/yelp_academic_dataset_business.csv')
review = pd.read_csv('csv_files/yelp_academic_dataset_review.csv')
tip = pd.read_csv('csv_files/yelp_academic_dataset_tip_transposed.csv')
user = pd.read_csv('csv_files/yelp_academic_dataset_user.csv',converters=conv)

In [6]:
business.index += 1
review.index += 1
tip.index += 1
user.index += 1

# Parse user

In [7]:
user_ids = user['user_id'].reset_index().set_index('user_id')['index'].to_dict()

In [8]:
friends = user['friends'].progress_map(lambda friends: list(map(lambda x: user_ids[x],friends)))

100%|██████████| 778651/778651 [00:12<00:00, 62044.79it/s] 


In [9]:
elite = user[user['elite'].progress_map(lambda x:type(x))==type("")]['elite'].map(lambda e:list(map(lambda x:int(x),e.split(","))))

100%|██████████| 778651/778651 [00:01<00:00, 609847.23it/s]


In [10]:
del user["user_id"]
del user["friends"]
del user["elite"]
user_table = user.reset_index().rename(columns={'index':'id'})

In [11]:
user_table.head()

Unnamed: 0,id,average_stars,compliment_cool,compliment_cute,compliment_funny,compliment_hot,compliment_list,compliment_more,compliment_note,compliment_photos,compliment_plain,compliment_profile,compliment_writer,cool,fans,funny,name,review_count,useful,yelping_since
0,1,4.03,1,0,1,2,0,0,1,0,1,0,2,25,5,17,Rashmi,95,84,2013-10-08 23:11:33
1,2,3.63,1,0,1,1,0,0,0,0,0,0,0,16,4,22,Jenna,33,48,2013-02-21 22:29:06
2,3,3.71,0,0,0,0,0,0,1,0,0,0,0,10,0,8,David,16,28,2013-10-04 00:16:10
3,4,4.85,0,0,0,1,0,0,0,0,2,0,1,14,5,4,Angela,17,30,2014-05-22 15:57:30
4,5,4.08,80,0,80,28,1,1,16,5,57,0,25,665,39,279,Nancy,361,1114,2013-10-23 07:02:50


In [12]:
user_table.to_csv('generated/user.csv', index=False)

In [13]:
friends_temp=friends.reset_index().rename(columns={'index':'id'})

In [14]:
chunks = np.array_split(friends_temp, 100000)

processed = []
for chunk in tqdm(chunks):
    processed.append(chunk['friends']
        .apply(lambda x: pd.Series(x))
        .stack()
        .reset_index(level=1, drop=True)
        .to_frame('friends')
        .join(chunk[['id']], how='left')
    )

friends_table = pd.concat(processed)

100%|██████████| 100000/100000 [15:15<00:00, 109.22it/s]


In [15]:
friends_table["friends"] = friends_table["friends"].astype(int)
friends_table=friends_table.rename(columns={'id':'user_id_1'})
friends_table=friends_table.rename(columns={'friends':'user_id_2'})

In [16]:
friends_table.to_csv('generated/are_friends.csv', index=False)

In [17]:
friends_table.head()

Unnamed: 0,user_id_2,user_id_1
0,292,1
0,126239,1
0,272957,1
0,30954,1
0,18898,1


In [18]:
elite_temp=elite.reset_index().rename(columns={'index':'id'})

In [19]:
elite_temp.head()

Unnamed: 0,id,elite
0,1,"[2015, 2016, 2017]"
1,5,"[2015, 2016, 2017, 2018]"
2,6,"[2015, 2016, 2017, 2018]"
3,7,"[2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013]"
4,9,"[2006, 2007, 2008, 2009, 2010, 2011, 2012]"


In [20]:
elite_table=(elite_temp['elite'].progress_apply(lambda x: pd.Series(x))
   .stack()
   .reset_index(level=1, drop=True)
   .to_frame('elite')
   .join(elite_temp[['id']], how='left'))

100%|██████████| 70559/70559 [00:17<00:00, 3977.68it/s]


In [21]:
elite_table["elite"] = elite_table["elite"].astype(int)
elite_table=elite_table.rename(columns={'elite':'annee'})

In [22]:
elite_table.head()

Unnamed: 0,annee,id
0,2015,1
0,2016,1
0,2017,1
1,2015,5
1,2016,5


In [23]:
elite_table.to_csv('generated/elite_years.csv', index=False)

# Parse business

In [24]:
business_ids = business['business_id'].reset_index().set_index('business_id')['index'].to_dict()

# Parse review

In [25]:
review_ids = review['review_id'].reset_index().set_index('review_id')['index'].to_dict()

In [26]:
excluded_review = review[(review.user_id.isin(user_ids.keys()) & review.business_id.isin(business_ids.keys())).map(lambda x:not x)]

In [27]:
included_review = review[review.user_id.isin(user_ids.keys()) & review.business_id.isin(business_ids.keys())]

In [28]:
excluded_review

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id
199242,41b2SLmjLcxTGLVRxASiDA,0,2006-04-11 09:05:18,0.0,WWYQ1ce6mNt7AvRHu8w-jQ,3.0,Rating purely on food and 18th hole view: 5 stars,,
199243,,1,53bZ_EsXH71L7iFs5MP9_w,,,,,,


In [29]:
included_review.loc[:,'user_id'] = included_review.apply(lambda x: user_ids[x.user_id], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [30]:
included_review.loc[:,'business_id'] = included_review.progress_apply(lambda x: business_ids[x.business_id], axis=1)

100%|██████████| 918678/918678 [00:23<00:00, 39822.46it/s]


In [31]:
review_table = included_review

In [32]:
review_table.head()

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id
1,107045,0,2016-11-09 20:09:03,0.0,2TzJjDVDEuAW6MR5Vuc1ug,5.0,I have to say that this office really has it t...,3.0,65413
2,30138,0,2018-01-30 23:07:38,0.0,11a8sVPMUFtaC7_ABRkmtw,1.0,Today was my second out of three sessions I ha...,7.0,8027
3,112123,5,2016-05-07 01:21:02,4.0,G7XHMxG0bx9oBJNECG4IFg,3.0,Tracy dessert had a big name in Hong Kong and ...,5.0,11467
4,87448,0,2017-12-15 23:27:08,1.0,-I5umRTkhw15RqpKMl_o1Q,1.0,"Walked in around 4 on a Friday afternoon, we s...",0.0,29616
5,15683,0,2014-12-17 19:04:33,0.0,0AsmPiAQduxh5jE_si8cLA,5.0,ended up here because Raku was closed and it r...,0.0,21932


# Parse tip

In [33]:
excluded_tip = tip[(tip.user_id.isin(user_ids.keys()) & tip.business_id.isin(business_ids.keys())).map(lambda x:not x)]

In [34]:
included_tip = tip[tip.user_id.isin(user_ids.keys()) & tip.business_id.isin(business_ids.keys())]

In [35]:
excluded_tip

Unnamed: 0,business_id,compliment_count,date,text,user_id


In [36]:
included_tip.loc[:,'user_id'] = included_tip.progress_apply(lambda x: user_ids[x.user_id], axis=1)

100%|██████████| 1029047/1029047 [00:26<00:00, 39368.86it/s]


In [37]:
included_tip.loc[:,'business_id'] = included_tip.progress_apply(lambda x: business_ids[x.business_id], axis=1)

100%|██████████| 1029047/1029047 [00:25<00:00, 39983.46it/s]


In [38]:
tip_table = included_tip

In [39]:
tip_table.head()

Unnamed: 0,business_id,compliment_count,date,text,user_id
1,19903,0,2014-03-27 03:51:24,"Great for watching games, ufc, and whatever el...",2221
2,121777,0,2013-05-25 06:00:56,Happy Hour 2-4 daily with 1/2 price drinks and...,38143
3,152691,0,2011-12-26 01:46:17,Good chips and salsa. Loud at times. Good serv...,27342
4,10040,0,2014-03-23 21:32:49,The setting and decoration here is amazing. Co...,3217
5,109424,0,2012-10-06 00:19:27,Molly is definately taking a picture with Sant...,30346


# Database insert

## Open database connection

In [40]:
con = psycopg2.connect(database="introdb")
print("Connection opened")
cur = con.cursor()

NameError: name 'psycopg2' is not defined

## Insert users

In [None]:
user_table.head()

In [None]:
user_table.to_csv('generated/user.csv')

In [None]:
# stores states
cur = con.cursor()
for index, u in user_table.iterrows():
    cur.execute("""insert into "user" (id,
                                        name,
                                        yelping_since,
                                        compliment_cool,
                                        compliment_cute,
                                        compliment_funny,
                                        compliment_hot,
                                        compliment_list,
                                        compliment_more,
                                        compliment_note,
                                        compliment_photos,
                                        compliment_plain,
                                        compliment_profile,
                                        compliment_writer,
                                        cool,
                                        fans,
                                        funny,
                                        useful,
                                        average_stars,
                                        review_count) 
        values ({id},
                {name},
                '{yelping_since}',
                {compliment_cool},
                {compliment_cute},
                {compliment_funny},
                {compliment_hot},
                {compliment_list},
                {compliment_more},
                {compliment_note},
                {compliment_photos},
                {compliment_plain},
                {compliment_profile},
                {compliment_writer},
                {cool},
                {fans},
                {funny},
                {useful},
                {average_stars},
                {review_count});"""
                .format(id=index,
                        name=u.name,
                        yelping_since=str(u.yelping_since),
                        compliment_cool=u.compliment_cool,
                        compliment_cute=u.compliment_cute,
                        compliment_funny=u.compliment_funny,
                        compliment_hot=u.compliment_hot,
                        compliment_list=u.compliment_list,
                        compliment_more=u.compliment_more,
                        compliment_note=u.compliment_note,
                        compliment_photos=u.compliment_photos,
                        compliment_plain=u.compliment_plain,
                        compliment_profile=u.compliment_profile,
                        compliment_writer=u.compliment_writer,
                        cool=u.cool,
                        fans=u.fans,
                        funny=u.funny,
                        useful=u.useful,
                        average_stars=u.average_stars,
                        review_count=u.review_count))
con.commit()

#cur.execute("select * from state")
#print(cur.fetchall())

In [None]:
cur.execute("""select count(*) from "user";""")
print(cur.fetchall())

In [None]:
con.rollback()

In [None]:
# average_stars 	compliment_cool 	compliment_cute 	compliment_funny 	compliment_hot 	compliment_list 	compliment_more 	compliment_note 	compliment_photos 	compliment_plain 	... 	cool 	elite 	fans 	friends 	funny 	name 	review_count 	useful 	user_id 	yelping_since

In [None]:
con.close()
print("Connection closed")