### Import packages

In [2]:
import pandas as pd
from faker import Faker
from faker.providers import DynamicProvider
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

### Define helper functions

In [3]:
def add_provider(fake, name, elements):
    """Build a ``DynamicProvider`` and register it on ``fake``.

    Args:
        fake: object exposing ``add_provider`` (the notebook passes a
            ``Faker`` instance or its ``.unique`` proxy).
        name: value forwarded as the provider's ``provider_name``.
        elements: collection forwarded as the provider's ``elements``.

    Returns:
        None. The only effect is the ``fake.add_provider`` call.
    """
    # construct the provider inline and hand it straight to the faker object
    fake.add_provider(DynamicProvider(provider_name=name, elements=elements))

def df_to_fake_dict(df, fake, sample):
    """Map every column of ``df`` to a callable looked up on ``fake``.

    For each column a provider is registered on ``fake`` (through
    ``add_provider``) under the column name with every ``.`` replaced by
    ``_``. The provider's elements are the first ``sample`` distinct values
    of the column, in order of first appearance.

    Args:
        df: ``pandas.DataFrame`` whose columns define the structure to mimic.
        fake: object the providers are registered on and fetched from (the
            notebook passes a ``Faker`` instance or its ``.unique`` proxy).
        sample: upper bound on the number of distinct values kept per column.

    Returns:
        dict keyed by the original (unescaped) column names, in column
        order, whose values are the attributes fetched from ``fake`` under
        the escaped names.
    """
    fake_dict = {}
    for col in df.columns:
        #escape forbidden characters from column name
        _col = col.replace('.', '_')
        #keep only the first `sample` distinct values of the column
        _values = df[col].unique()[:sample].tolist()
        add_provider(fake, _col, _values)
        #getattr performs the same attribute lookup as evaluating
        #'fake.<name>', but never executes the column name as Python source
        #and does not fail when the name is not a valid identifier
        fake_dict[col] = getattr(fake, _col)
    return fake_dict

def gen_df_dict(args, num):
    """Build a dataframe by calling one generator function per column.

    Args:
        args: dict mapping column name -> zero-argument callable.
        num: number of values (rows) to produce for every column.

    Returns:
        ``pandas.DataFrame`` with one column per key of ``args``. Columns are
        filled one after another, each from ``num`` consecutive calls of its
        callable.
    """
    frame = pd.DataFrame()
    for column, maker in args.items():
        # draw all values for this column before moving on to the next one
        drawn = [maker() for _ in range(num)]
        frame[column] = pd.Series(drawn)
    return frame

### Create Fake objects

In [5]:
#instantiate Faker
#this one instance is handed (directly or through fake.unique) to every
#df_to_fake_dict call below
fake = Faker()

#read csv into dataframe
#the paths are relative, so they are resolved against the current working directory
df_business = pd.read_csv('../../data/datafiles/yelp_academic_dataset_business_transformed_filtered.csv')
df_review = pd.read_csv('../../data/datafiles/yelp_academic_dataset_review_transformed_filtered.csv')
df_user = pd.read_csv('../../data/datafiles/yelp_academic_dataset_user_transformed_filtered.csv')

#generate dictionary based on structure of dataframe
#each column will be assigned to a fake function
#the last argument (10) is the `sample` size: at most the first 10 distinct
#values of every column are used as the pool for that column
#NOTE(review): the user table is built from fake.unique instead of fake,
#presumably so generated user values never repeat - confirm; if so, the
#10-value pool also limits how many rows can be generated for it
fake_dict_business = df_to_fake_dict(df_business, fake, 10)
fake_dict_review = df_to_fake_dict(df_review, fake, 10)
fake_dict_user = df_to_fake_dict(df_user, fake.unique, 10)

### Generate Fake Data

In [6]:
#build one synthetic dataframe per source table
#the second argument is the row count: 10 for business and review, 2 for user
fake_df_business = gen_df_dict(fake_dict_business, 10)
fake_df_review = gen_df_dict(fake_dict_review, 10)
fake_df_user = gen_df_dict(fake_dict_user, 2)

#dump the three generated frames as plain text, one below the other
print(fake_df_business, fake_df_review, fake_df_user, sep='\n')

              business_id                      name            city  stars  \
0  mWMc6_wTdE0EUBKIGXDVfA            Sonic Drive-In    Ashland City    1.0   
1  n_0UpQx1hsNbnPUSlodU8w  Abby Rappoport, LAC, CMQ          Affton    4.5   
2  bBDDEgkFA1Otx9Lfe7BZUQ  Perkiomen Valley Brewery   Land O' Lakes    1.5   
3  tUFrWirKiKi_TAnsVWINQQ            Temple Beth-El      Green Lane    1.5   
4  MTSW4McQd7CbVtyjqoe9mw        St Honore Pastries    Ashland City    2.0   
5  MTSW4McQd7CbVtyjqoe9mw                 Marshalls  St. Petersburg    4.5   
6  k0hlBqXX-Bt0vf1op7Jr1w  Abby Rappoport, LAC, CMQ          Affton    1.5   
7  tUFrWirKiKi_TAnsVWINQQ     Tsevi's Pub And Grill       Brentwood    5.0   
8  qkRM_2X51Yqxk3btlwAQIg                    Target       Nashville    1.0   
9  qkRM_2X51Yqxk3btlwAQIg            Temple Beth-El          Tucson    2.0   

   review_count  is_open       categories0   categories1  \
0            28        0           Doctors   Restaurants   
1            22      