### Import packages

In [2]:
import pandas as pd
from faker import Faker
from faker.providers import DynamicProvider
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

### Define helper functions

In [3]:
def add_provider(fake, name, elements):
    """Register a dynamic provider on ``fake``.

    A ``DynamicProvider`` named ``name`` is built over ``elements`` and handed
    to ``fake.add_provider``. Nothing is returned; the effect is the
    registration on ``fake``.

    Args:
        fake: object exposing ``add_provider`` (a Faker instance in this notebook).
        name: provider name under which the elements are registered.
        elements: collection of values the provider is built from.
    """
    #build the provider and attach it to the faker instance in one step
    fake.add_provider(DynamicProvider(provider_name=name, elements=elements))

def df_to_fake_dict(df, fake, sample):
    """Map every column of ``df`` to a provider function registered on ``fake``.

    For each column, at most ``sample`` distinct values are taken (the first
    ones returned by ``unique()``, not a random draw) and registered on
    ``fake`` as a dynamic provider. The provider is then looked up on ``fake``
    by name and stored under the original column label.

    Args:
        df: source DataFrame whose columns define the structure to imitate.
        fake: Faker instance (or a proxy such as ``fake.unique``) that accepts
            ``add_provider`` and exposes registered providers as attributes.
        sample: maximum number of distinct values kept per column.

    Returns:
        dict keyed by the original column labels, in column order. Each value
        is whatever ``fake`` exposes under the sanitized provider name; callers
        in this notebook invoke it with no arguments.
    """
    fake_dict = {}
    for col in df.columns:
        #escape forbidden characters from column name
        #str() so that non-string labels (e.g. integer column names) do not raise
        provider_name = str(col).replace('.', '_')
        #take sample of the unique values of the column
        values = df[col].unique()[:sample].tolist()
        add_provider(fake, provider_name, values)
        #plain attribute lookup instead of eval(): eval fails (or evaluates an
        #unintended expression) for labels that are not valid identifiers, and
        #it would execute text that comes straight from the CSV header
        fake_dict[col] = getattr(fake, provider_name)
    return fake_dict

def gen_df_dict (args, num):
    """Generate a DataFrame by calling one generator function per column.

    Args:
        args: mapping of column name -> zero-argument callable producing one value.
        num: number of rows, i.e. how many times each callable is invoked.

    Returns:
        DataFrame with one column per key of ``args`` (in iteration order).
        Columns are filled one after another, so a column's callable is
        invoked ``num`` times before the next column is started.
    """
    generated = pd.DataFrame()
    for column, make_value in args.items():
        #one call per row; the Series is materialized before the next column starts
        generated[column] = pd.Series((make_value() for _ in range(num)))
    return generated

### Create Fake objects

In [4]:
#fixed seed so that Restart & Run All regenerates the same synthetic rows
SEED = 42
#maximum number of distinct source values kept per column as the value pool
SAMPLE_SIZE = 10

#seed Faker's shared random source before any value is drawn
#NOTE(review): class-level Faker.seed() is used rather than fake.seed_instance()
#so that providers created separately from `fake` are covered too - confirm
#against the installed Faker version
Faker.seed(SEED)

#instantiate Faker
fake = Faker()

#read csv into dataframe
df_business = pd.read_csv('../../data/datafiles/yelp_academic_dataset_business_transformed_filtered.csv')
df_review = pd.read_csv('../../data/datafiles/yelp_academic_dataset_review_transformed_filtered.csv')
df_user = pd.read_csv('../../data/datafiles/yelp_academic_dataset_user_transformed_filtered.csv')

#generate dictionary based on structure of dataframe
#each column will be assigned to a fake function
fake_dict_business = df_to_fake_dict(df_business, fake, SAMPLE_SIZE)
fake_dict_review = df_to_fake_dict(df_review, fake, SAMPLE_SIZE)
#user functions are looked up on fake.unique instead of fake
#NOTE(review): per the Faker docs this should stop a value from repeating
#within a column, and asking for more rows than there are sampled values
#would then raise UniquenessException - confirm this is intended
fake_dict_user = df_to_fake_dict(df_user, fake.unique, SAMPLE_SIZE)

### Generate Fake Data

In [6]:
#number of synthetic rows to generate per table
N_BUSINESS_ROWS = 10000
N_REVIEW_ROWS = 10
N_USER_ROWS = 2

#generate a new dataframe with structure of source file
fake_df_business = gen_df_dict(fake_dict_business, N_BUSINESS_ROWS)
fake_df_review = gen_df_dict(fake_dict_review, N_REVIEW_ROWS)
fake_df_user = gen_df_dict(fake_dict_user, N_USER_ROWS)

#index=False: the default would write the row index as an extra unnamed first
#column, so the synthetic files would no longer have the source column layout
#NOTE(review): any consumer that reads these files with index_col=0 must be adjusted
fake_df_business.to_csv('../../data/datafiles/out/faker/yelp_academic_dataset_business_transformed_synthetic.csv', index=False)
fake_df_review.to_csv('../../data/datafiles/out/faker/yelp_academic_dataset_review_transformed_synthetic.csv', index=False)
fake_df_user.to_csv('../../data/datafiles/out/faker/yelp_academic_dataset_user_transformed_synthetic.csv', index=False)

#print generated fake data
print (fake_df_business)
print (fake_df_review)
print (fake_df_user)

              business_id                      name            city  stars  \
0  qkRM_2X51Yqxk3btlwAQIg  Perkiomen Valley Brewery       Nashville    3.0   
1  qkRM_2X51Yqxk3btlwAQIg        St Honore Pastries  St. Petersburg    2.0   
2  CF33F8-E6oudUQ46HnavjQ                    Target  St. Petersburg    4.0   
3  qkRM_2X51Yqxk3btlwAQIg     Tsevi's Pub And Grill      Green Lane    1.5   
4  k0hlBqXX-Bt0vf1op7Jr1w            Sonic Drive-In      Green Lane    5.0   
5  mWMc6_wTdE0EUBKIGXDVfA  Abby Rappoport, LAC, CMQ          Tucson    5.0   
6  mWMc6_wTdE0EUBKIGXDVfA            Sonic Drive-In   Land O' Lakes    3.5   
7  Pns2l4eNsfO8kk83dixA6A        St Honore Pastries    Ashland City    3.5   
8  mWMc6_wTdE0EUBKIGXDVfA            Sonic Drive-In   Land O' Lakes    4.5   
9  mWMc6_wTdE0EUBKIGXDVfA                    Target    Philadelphia    3.0   

   review_count  is_open                categories0  \
0             5        0  Ice Cream & Frozen Yogurt   
1            80        1       