In [1]:
import pandas as pd
import numpy as np

In [2]:
# Raw inputs. The address file has no header row, so column names are supplied
# here; dtype=str keeps every address column as text (ZIP codes are not parsed
# as integers).
addresses = pd.read_csv('data/raw/addresses.txt', names=['street', 'city', 'state', 'zip'], dtype=str)
first_names = pd.read_csv('data/raw/first_names.txt', header=None)
last_names = pd.read_csv('data/raw/last_names.txt', header=None)

# One product name per line. Current pandas releases reject delimiter='\n'
# with a ValueError (the line terminator cannot double as the separator), so
# the lines are read directly and wrapped in a single-column frame. The column
# label stays 0, which is what header=None produced, and blank lines are
# skipped as read_csv does by default.
with open('data/raw/products.txt', encoding='utf-8') as products_file:
    product_lines = [line.rstrip('\r\n') for line in products_file if line.strip()]
products = pd.DataFrame({0: product_lines})

## Addresses

In [10]:
# Keep the third- and second-from-last characters of every raw state string,
# i.e. drop the final character and take the two characters before it.
# NOTE: the column is overwritten, so re-running this cell slices the already
# cleaned values again.
states = addresses['state']
preprocessed_states = [raw_state[-3:-1] for raw_state in states]
addresses.loc[:, 'state'] = preprocessed_states

In [12]:
# Every address gets the same constant country code.
addresses = addresses.assign(country='US')

In [15]:
# Preview the first ten cleaned address rows.
addresses.iloc[:10]

Unnamed: 0,street,city,state,zip,country
0,1705 W 7th Street Ter,Knob Noster,MO,65336,US
1,45030 Revere St,Callahan,FL,32011,US
2,1127 N 2nd Ave #1,Iron River,MI,49935,US
3,32282 Francise St,White Castle,LA,70788,US
4,1460 Back Valley Rd,Speedwell,TN,37870,US
5,4 Stuyvesant Ave,Rye,NY,10580,US
6,10822 County Q Rd,Downing,WI,54734,US
7,58911 E Main Pr NE,Benton City,WA,99320,US
8,1606 Redwood Dr,Atlantic,IA,50022,US
9,603 14th St,Huntingdon,PA,16652,US


## Names

In [25]:
# Flatten the single column (label 0) of each names frame into a plain list.
first_arr, last_arr = (frame[0].tolist() for frame in (first_names, last_names))

In [43]:
# Seed NumPy's global generator so the sampled users (and every random draw
# made after this cell) are reproducible on Restart & Run All.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Probability that a generated user is a seller. Draws come from [0, 1), so
# 1.0 marks every user as 'Seller' (the same outcome as the previous
# hard-coded threshold); lower it to mix in buyers.
SELLER_PROBABILITY = 1.0

# One user per address row.
n_users = len(addresses.index)

# Names are drawn without replacement so each user gets a distinct first name;
# fail early with a clear message if the name pool is too small.
if n_users > len(first_arr):
    raise ValueError(
        f"Need {n_users} distinct first names but only {len(first_arr)} are available"
    )
first = np.random.choice(first_arr, n_users, replace=False)

# Only first names are used as user names; last_arr is not used here.
user_names = first

# One uniform draw per user decides the user type.
seller_mask = np.random.rand(n_users) < SELLER_PROBABILITY
user_types = np.where(seller_mask, 'Seller', 'Buyer').tolist()

In [44]:
# Column layout of the users table: the sampled name first, then the address
# fields copied straight from `addresses`, then the user type.
users_dict = {
    'name': user_names,
    **{field: addresses[field] for field in ('street', 'city', 'state', 'zip', 'country')},
    'user_type': user_types,
}

In [45]:
# Persist the users table; the row index is written as the first CSV column.
users_frame = pd.DataFrame(users_dict)
users_frame.to_csv('data/users.csv')

## Products

In [34]:
# Preview the first ten product names.
products.iloc[:10]

Unnamed: 0,0
0,Chair
1,Lift chair
2,Bean bag
3,Chaise longue
4,Fauteuil
5,Ottoman
6,Recliner
7,Stool
8,Bar Stool
9,Footstool or ottoman


In [40]:
def random_dimension(n, min_val=10, max_val=100):
    """Draw `n` random whole-number measurements as a float32 array.

    Values are uniform integers in [min_val, max_val) -- the upper bound is
    exclusive -- converted to float32.

    Parameters
    ----------
    n : int
        Number of values to generate.
    min_val : int, optional
        Smallest value that can be produced (inclusive).
    max_val : int, optional
        Upper bound (exclusive).

    Returns
    -------
    numpy.ndarray
        One-dimensional array of length `n`.
    """
    span = max_val - min_val
    # Draw offsets in [0, span) and shift them up to start at min_val.
    offsets = np.random.randint(span, size=(n,))
    return offsets.astype('float32') + min_val

In [52]:
# Upper bound on how many product names are taken from the raw list.
n_products = 20
products_list = products[0].tolist()[:n_products]

# The raw file may hold fewer rows than the cap. Size every generated column
# from the names actually available so all columns have the same length and
# the DataFrame can be built.
n_products = len(products_list)

# Every product is tagged 'furniture'; dimensions use random_dimension's
# default range and weight uses the narrower 10-20 range.
products_dict = {
    'name' : products_list,
    'category' : ['furniture'] * n_products,
    'width' : random_dimension(n_products),
    'height' : random_dimension(n_products),
    'length' : random_dimension(n_products),
    'weight' : random_dimension(n_products, min_val=10, max_val=20)
}

In [53]:
# Persist the product catalogue; the row index is written as the first CSV column.
products_frame = pd.DataFrame(products_dict)
products_frame.to_csv('data/products.csv')

## Items

In [60]:
# Build (product, seller, price) records: every seller lists a small random
# subset of the catalogue, each at its own random price.
items_list = []
base_price = 10

MIN_PRODUCTS_PER_SELLER = 2    # each seller lists 2, 3 or 4 products
PRODUCT_COUNT_SPREAD = 3
MAX_PRICE_MARKUP = 50          # price = base_price + uniform draw in [0, 50)

for name, user_type in zip(user_names, user_types):
    if user_type != 'Seller':
        continue

    # Sampling without replacement cannot draw more products than exist,
    # so cap the count at the catalogue size.
    n_products_per_seller = min(
        np.random.randint(PRODUCT_COUNT_SPREAD) + MIN_PRODUCTS_PER_SELLER,
        len(products_list),
    )
    seller_products = np.random.choice(products_list, n_products_per_seller, replace=False)

    for prod in seller_products:
        price = np.round(base_price + np.random.rand() * MAX_PRICE_MARKUP, decimals=2)
        items_list.append((prod, name, price))

In [63]:
# Transpose the (product, seller, price) records into three parallel columns.
# zip(*[]) yields nothing, so fall back to three empty columns when no items
# were generated; otherwise indexing items[0..2] would raise IndexError.
items = list(zip(*items_list)) or [(), (), ()]

In [66]:
# Label the three transposed columns for the items table.
items_dict = dict(
    product=items[0],
    seller=items[1],
    price=items[2],
)

In [67]:
# Persist the items table; the row index is written as the first CSV column.
items_frame = pd.DataFrame(items_dict)
items_frame.to_csv('data/items.csv')