In [99]:
import pandas as pd
import numpy as np
import pycountry

import faker
from uuid import uuid4

from pathlib import Path

# create a random generator with numpy
rgn = np.random.default_rng(42)

In [100]:
# create locale for France, Germany, Italy, Spain, UK, US
locales = ['fr_FR', 'de_DE', 'it_IT', 'es_ES', 'en_GB', 'en_US']
locales_probs = rgn.dirichlet(np.ones(len(locales)))

In [101]:
# with faker, create a list of 5000 users with the following fields:
# first_name, last_name, email, phone_number, birthdate (max the year 2000), city, country


def create_users():
    users = []
    for _ in range(10000):
        locale = rgn.choice(locales, p=locales_probs)
        country_locale = locale.split('_')[-1]
        country = pycountry.countries.get(alpha_2=country_locale).name
        fake = faker.Faker(locale)
        # a random city in the country
        long, lat, city, c, continent = fake.local_latlng(country_code=country_locale) 

        users.append({
            'id': uuid4().hex,
            'first_name': fake.first_name(),
            'last_name': fake.last_name(),
            'birthdate': fake.date_of_birth(minimum_age=35, maximum_age=60).strftime('%Y-%m-%d'),
            'long': long,
            'lat': lat,
            'city': city,
            'country': country
        })
    return users

users = create_users()
users_df = pd.DataFrame(users)

users_df['birthdate'] = pd.to_datetime(users_df['birthdate'])
users_df.set_index('id', inplace=True)
users_df.head()


Unnamed: 0_level_0,first_name,last_name,birthdate,long,lat,city,country
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
7dc76a90a6ee41a78bf67ada5ad9529c,Benvenuto,Buscetta,1982-08-14,45.50369,11.412,Montecchio Maggiore-Alte Ceccato,Italy
74e048a3ef1c42a986b1a168076d9334,Elvira,Cendron,1966-02-22,45.53069,9.40531,Gorgonzola,Italy
6fa815491c09416b8b2987975a4f583e,Gabriel,Paris,1965-01-10,43.61092,3.87723,Montpellier,France
1c807b5383194da98973034b11c9e1d0,Armin,Schweitzer,1972-08-16,49.68369,8.61839,Bensheim,Germany
13a6c5f168794eaa8e75b0cbe4cba116,Brigitte,Kitzmann,1987-02-15,52.86147,9.5926,Walsrode,Germany


In [102]:
indexes = users_df.index

In [103]:
# create some places with the following fields:
# id, long, lat, city, country
# from a list of exotic countries: Japan, Korea, Thailand, Indonesia, Brazil, Danemark, Sweden, Norway, Finland, Iceland

exotic_country_locales = ['ja_JP', 'ko_KR', 'th_TH', 'id_ID', 'pt_BR', 'da_DK', 'sv_SE', 'no_NO', 'fi_FI']
# create random probabilities for each country
probs = rgn.dirichlet(np.ones(len(exotic_country_locales)))


def create_places():
    places = []
    for _ in range(200):
        locale = rgn.choice(exotic_country_locales, p=probs)
        country_locale = locale.split('_')[-1]
        country = pycountry.countries.get(alpha_2=country_locale).name
        fake = faker.Faker(locale)
        # a random city in the country
        long, lat, city, c, continent = fake.local_latlng(country_code=country_locale)

        continent, state = continent.split('/')

        places.append({
            'id': uuid4().hex,
            'long': long,
            'lat': lat,
            'city': city,
            'country': country,
            'continent': continent
        })
    return places

places = create_places()
places_df = pd.DataFrame(places)
places_df.set_index('id', inplace=True)
places_df.shape
places_df.drop_duplicates(subset=['long', 'lat'], inplace=True)

In [104]:
# convert long and lat to float
places_df['long'] = places_df['long'].astype(float)
places_df['lat'] = places_df['lat'].astype(float)

In [105]:
# for each user, create a list of 1 to 5 places he visited
# with the following fields:
# user_id, place_id, date_arrival, date_departure

# create random probs for places_df indexes
places_probs = probs = rgn.dirichlet(np.ones(len(places_df)))

def create_visits():
    visits = []
    for user_id in indexes:
        for _ in range(rgn.integers(1, 6)):
            place_id = rgn.choice(places_df.index, p=places_probs)
            # difference between the birthdate and the current date in days
            max_date = (pd.Timestamp('2020-12-31') - users_df.loc[user_id, 'birthdate']).days
            # date arrival is the birthdate of the user plus a random number of years between 18 and 2020-12-31
            date_arrival = users_df.loc[user_id, 'birthdate'] + pd.Timedelta(days=rgn.integers(18*365, max_date))
            # date departure is the date arrival plus a random number of days between 3 and 31, max date is 2020-12-31
            date_departure = date_arrival + pd.Timedelta(days=rgn.integers(3, 31))
            visits.append({
                'user_id': user_id,
                'place_id': place_id,
                'date_arrival': date_arrival,
                'date_departure': date_departure
            })
    return visits

visits = create_visits()

In [106]:
visits_df = pd.DataFrame(visits)
visits_df.set_index('user_id', inplace=True)
# convert the date arrival and date departure to datetime
visits_df['date_arrival'] = pd.to_datetime(visits_df['date_arrival'])
visits_df['date_departure'] = pd.to_datetime(visits_df['date_departure'])
visits_df.head()

Unnamed: 0_level_0,place_id,date_arrival,date_departure
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
7dc76a90a6ee41a78bf67ada5ad9529c,afab44659c5246f182dec038f9d74e04,2010-10-13,2010-10-21
7dc76a90a6ee41a78bf67ada5ad9529c,196c715bfd5949bea49998e2b692288f,2006-10-30,2006-11-17
74e048a3ef1c42a986b1a168076d9334,2ba656a57b614356abc193bd09d1fe0f,2014-04-04,2014-04-08
74e048a3ef1c42a986b1a168076d9334,354543857b134b52836f1d6d9874740e,2001-04-27,2001-05-20
74e048a3ef1c42a986b1a168076d9334,5695709065db473cbba623ac7ffdb852,1994-04-01,1994-04-20


In [107]:
visits_df.index.value_counts()

user_id
043b069c0712404a8e406a90060cdee0    5
dbd1d28a7a754bda959064a50da52be2    5
56361ace035540ae9deb30f1551c999f    5
dd153924c7e7442aa1232c09be56a022    5
2a146d335e034f259a14d6e23172f071    5
                                   ..
8b9d0e5413bb446eb5ba67e09cdbd0ea    1
e16d41bdf2a04b928c63ae0c197cd033    1
49c74513e2b747e58695eac4777f000b    1
8f07631ff6114076948f842e872c93db    1
646b2b98e29b43c68f19dd0dd6ec4ddd    1
Name: count, Length: 10000, dtype: int64

In [108]:
# save all the data in a directory data, to parquet files
data_dir = Path('data')
data_dir.mkdir(exist_ok=True)

users_df.to_parquet(data_dir / 'users.parquet')
places_df.to_parquet(data_dir / 'places.parquet')
visits_df.to_parquet(data_dir / 'visits.parquet')