In [None]:
import numpy as np
import pandas as pd
import ast
#import psycopg2

In [None]:
from tqdm import tqdm
tqdm.pandas()

In [None]:
!ls csv_files

In [None]:
def generic(x):
    """Parse the text of one CSV cell as a Python literal (list, dict, number, ...)."""
    return ast.literal_eval(x)


# Column-name -> parser mapping handed to pd.read_csv(converters=...).
conv = {'friends': generic}

In [None]:
# Raw CSV exports, read with pandas' default parsing.
business, review, tip = (
    pd.read_csv(csv_path)
    for csv_path in (
        'csv_files/yelp_academic_dataset_business.csv',
        'csv_files/yelp_academic_dataset_review.csv',
        'csv_files/yelp_academic_dataset_tip_transposed.csv',
    )
)
# The user file is read last; its 'friends' column goes through the `conv`
# converter so each cell is parsed into a Python object instead of kept as text.
user = pd.read_csv(
    'csv_files/yelp_academic_dataset_user.csv',
    converters=conv,
)

In [None]:
# Give every table a 1-based row label; these labels are later used as the
# integer ids written to the generated CSV files.
# The index is set to an absolute range rather than incremented, so re-running
# this cell cannot shift the ids a second time. On a fresh load (default
# 0..n-1 index from read_csv) the result is identical to `index += 1`.
for _frame in (business, review, tip, user):
    _frame.index = pd.RangeIndex(start=1, stop=len(_frame) + 1)
del _frame

# Parse user

In [None]:
excluded_user = user[(user.name.isna())]

In [None]:
excluded_user

In [None]:
included_user = user[~(user.name.isna())]

In [None]:
included_user.head()

In [None]:
user_ids = included_user['user_id'].reset_index().set_index('user_id')['index'].to_dict()

In [None]:
friends = included_user['friends'].progress_map(lambda friends: list(filter(lambda x: not x is None, map(lambda x: user_ids.get(x,None),friends))))

In [None]:
elite = included_user[included_user['elite'].progress_map(lambda x:type(x))==str]['elite'].map(lambda e:list(map(lambda x:int(x),e.split(","))))

In [None]:
del included_user["user_id"]
del included_user["friends"]
del included_user["elite"]
user_table = included_user.reset_index().rename(columns={'index':'id'})

In [None]:
user_table.head()

In [None]:
user_table.to_csv('generated/user.csv', index=False)

## Parse friends

In [None]:
# One row per user: 'id' (the former row label) and 'friends' (list of integer friend ids).
friends_temp = friends.reset_index().rename(columns={'index': 'id'})

# Sanity check: largest number of friends held by a single user.
# `validate_query` must be a separate frame: binding it to `friends_temp`
# directly only creates an alias, and overwriting its 'friends' column then
# replaces the friend lists in `friends_temp` with their lengths, corrupting
# the friends table built from it afterwards.
validate_query = friends_temp.assign(friends=friends_temp["friends"].map(len))
validate_query["friends"].max()

In [None]:
# Flatten the per-user friend lists into one row per (friend, user) pair.
# explode() does this in a single vectorised pass, replacing the previous
# 100000-chunk loop that built a pd.Series for every row.
# - Column order ['friends', 'id'] is kept so the exported CSV layout is unchanged.
# - Users with an empty list yield a missing value, which is dropped (the
#   stack()-based version dropped them too).
# - reset_index(drop=True) returns a fresh frame with unique row labels, so
#   the column assignments in the following cells act on an independent object.
friends_table = (
    friends_temp[['friends', 'id']]
    .explode('friends')
    .dropna(subset=['friends'])
    .reset_index(drop=True)
)

In [None]:
friends_table["friends"] = friends_table["friends"].astype(int)
friends_table=friends_table.rename(columns={'id':'user_id_1'})
friends_table=friends_table.rename(columns={'friends':'user_id_2'})

In [None]:
friends_table['user_id_1'], friends_table['user_id_2'] = friends_table.min(axis=1), friends_table.max(axis=1)
friends_table.drop_duplicates(inplace=True)

In [None]:
friends_table.head()

In [None]:
friends_table.to_csv('generated/are_friends.csv', index=False)

## Parse Elite years

In [None]:
elite_temp=elite.reset_index().rename(columns={'index':'user_id'})

In [None]:
elite_temp.head()

In [None]:
elite_table=(elite_temp['elite'].progress_apply(lambda x: pd.Series(x))
   .stack()
   .reset_index(level=1, drop=True)
   .to_frame('elite')
   .join(elite_temp[['user_id']], how='left'))

In [None]:
elite_table["elite"] = elite_table["elite"].astype(int)
elite_table=elite_table.rename(columns={'elite':'year'})

In [None]:
elite_table.head()

In [None]:
elite_table.to_csv('generated/elite_years.csv', index=False)

# Parse business

In [None]:
# Lookup table: raw business_id value -> integer row label of that business.
business_ids = dict(zip(business['business_id'], business.index))

# Parse review

In [None]:
# Lookup table: raw review_id value -> integer row label of that review.
review_ids = dict(zip(review['review_id'], review.index))

In [None]:
excluded_review = review[~((review.user_id.isin(user_ids.keys()) & review.business_id.isin(business_ids.keys())))]

In [None]:
included_review = review[review.user_id.isin(user_ids.keys()) & review.business_id.isin(business_ids.keys())]

In [None]:
excluded_review

In [None]:
included_review['user_id'] = included_review["user_id"].progress_map(lambda x: user_ids[x])

In [None]:
included_review['business_id'] = included_review["business_id"].progress_map(lambda x: business_ids[x])

In [None]:
included_review = included_review.reset_index().rename(columns={'index':'id'})

In [None]:
review_table = included_review.astype({"funny":'int', "stars":'int', "useful":'int'})
del review_table["review_id"]

In [None]:
pd.unique(review_table["stars"])
review_table.head()

In [None]:
review_table.to_csv('generated/review.csv', index=False)

# Parse tip

In [None]:
excluded_tip = tip[~((tip.user_id.isin(user_ids.keys()) & tip.business_id.isin(business_ids.keys())))]

In [None]:
included_tip = tip[tip.user_id.isin(user_ids.keys()) & tip.business_id.isin(business_ids.keys())]

In [None]:
included_tip = included_tip.reset_index().rename(columns={'index':'id'})

In [None]:
excluded_tip

In [None]:
included_tip['user_id'] = included_tip["user_id"].progress_map(lambda x: user_ids[x])

In [None]:
included_tip['business_id'] = included_tip["business_id"].progress_map(lambda x: business_ids[x])

In [None]:
# Alias, not a copy: `tip_table` and `included_tip` are the same object until
# the dropna() below rebinds `tip_table` to a new frame.
tip_table = included_tip

In [None]:
tip_table.head()

In [None]:
# Store the 'date' column as strings (this also changes `included_tip`, see the alias above).
tip_table["date"] = tip_table["date"].astype(str)

In [None]:
# Only two tips without text out of 1029045, so we just drop them.
# NOTE(review): dropna() is called without `subset`, so a row with a missing
# value in ANY column is removed, not only rows lacking text -- confirm that
# no other column contains missing values.
tip_table = tip_table.dropna()

In [None]:
tip_table.to_csv('generated/tip.csv', index=False)

In [None]:
# Sanity check: list any pair in which a user is recorded as their own friend.
friends_table.loc[lambda pairs: pairs["user_id_1"].eq(pairs["user_id_2"])]