### Import Packages

In [1]:
import logging
import random
import csv
import pandas as pd
from faker import Faker
from trumania.core import circus, operations
from trumania.core.random_generators import SequencialGenerator, FakerGenerator, NumpyRandomGenerator, ConstantDependentGenerator, ConstantGenerator
import trumania.core.util_functions as util_functions

### Creating a Trumania Circus

In [2]:
example_circus = circus.Circus(name="Review", 
                               master_seed=12345,
                               start=pd.Timestamp("1 Jan 2023 00:00"),
                               step_duration=pd.Timedelta("1h"))

### Creating a Person entity

In [3]:
# define generators for the different data types.
id_gen = FakerGenerator(method="pystr", min_chars=22, max_chars=22, seed=next(example_circus.seeder))
name_gen = FakerGenerator(method="name", seed=next(example_circus.seeder))
review_count_gen = FakerGenerator(method="pyint", min_value=0, max_value=10, seed=next(example_circus.seeder))
date_gen = FakerGenerator(method="date", seed=next(example_circus.seeder))

# create a population and assign a generator to each attribute
person = example_circus.create_population(name="person", size=1000, ids_gen=id_gen)
person.create_attribute("name", init_gen=name_gen)
person.create_attribute("review_count", init_gen=review_count_gen)
person.create_attribute("yelping_since", init_gen=date_gen)

<trumania.core.attribute.Attribute at 0x7fce83e84850>

### Create a Business entity

In [6]:
# the default 'city' Faker generates other cities than the original file
# therefore infer a 'city' list based on the original file
df_business = pd.read_csv('../../data/datafiles/yelp_academic_dataset_business_transformed_filtered.csv')
city_values = random.choices(df_business["city"].unique().tolist(), k=10000)

# define generators for the different data types.
company_gen = FakerGenerator(method="company", seed=next(example_circus.seeder))
stars_gen = FakerGenerator(method="pydecimal", min_value=0, max_value=5, right_digits=0, seed=next(example_circus.seeder))
is_open_gen = FakerGenerator(method="pyint", min_value=0, max_value=1, seed=next(example_circus.seeder))

# create a population and assign a generator to each attribute
business = example_circus.create_population(name="business", size=10000, ids_gen=id_gen)
business.create_attribute("name", init_gen=company_gen)
business.create_attribute("city",init_values=city_values)
business.create_attribute("stars", init_gen=stars_gen)
business.create_attribute("review_count", init_gen=review_count_gen)
business.create_attribute("is_open", init_gen=is_open_gen)
business.create_attribute("categories0", init_gen=name_gen)
business.create_attribute("categories1", init_gen=name_gen)
business.create_attribute("categories2", init_gen=name_gen)
business.create_attribute("categories3", init_gen=name_gen)
business.create_attribute("categories4", init_gen=name_gen)

<trumania.core.attribute.Attribute at 0x7fce83e67670>

### Define Story

In [8]:
# define a story that is initiated by a population.
# in this case a review is initiated by a 'person'.
review_visit = example_circus.create_story(
    name="review_visit",
    initiating_population=example_circus.populations["person"],
    member_id_field="user_id",
    timer_gen=ConstantDependentGenerator(value=1)
)

# add an operation that will be executed at every clock step.
# in this case, create a review for a business by a person.
review_visit.set_operations(
    # generate an unique id for every review
    id_gen.ops.generate(named_as="review_id"),
    # pick a random business
    example_circus.populations["business"].ops.select_one(named_as="business_id"),
    # generate a score between 0 and 5
    FakerGenerator(method="pyint", min_value=0, max_value=5, seed=next(example_circus.seeder)).ops.generate(named_as="stars"),
    FakerGenerator(method="pyint", min_value=0, max_value=5, seed=next(example_circus.seeder)).ops.generate(named_as="useful"),
    FakerGenerator(method="pyint", min_value=0, max_value=5, seed=next(example_circus.seeder)).ops.generate(named_as="funny"),                                                                                                            
    FakerGenerator(method="pyint", min_value=0, max_value=5, seed=next(example_circus.seeder)).ops.generate(named_as="cool"),
    # generate a timestamp between 2015 and 2023 (see https://www.unixtimestamp.com/)
    FakerGenerator(method="unix_time", end_datetime=1674658816, start_datetime=1434898086, seed=next(example_circus.seeder)).ops.generate(named_as="date"),                                                                                                            

    # define the output
    operations.FieldLogger(log_id="reviews", cols=["review_id","user_id", "business_id","stars","useful","funny","cool","date" ])
 )

ValueError: Cannot add story review_visit: another story with identical name is already in the circus

### Run the Circus

In [9]:
# run the circus for 48h and save the output
example_circus.run(
    duration=pd.Timedelta("48h"),
    log_output_folder="../../data/datafiles/out/trumania",
    delete_existing_logs=True
)

# convert the entities to dataframes
person_df = person.to_dataframe()
business_df = business.to_dataframe()

# save the entities
person_df.to_csv("../../data/datafiles/out/trumania/user.csv", index_label='user_id')
business_df.to_csv("../../data/datafiles/out/trumania/business.csv", index_label='business_id',quoting=csv.QUOTE_NONNUMERIC)

  self.forced_to_act_next = pd.Series()
  self.forced_to_act_next = pd.Series()
  self.forced_to_act_next = pd.Series()
  self.forced_to_act_next = pd.Series()
  self.forced_to_act_next = pd.Series()
  self.forced_to_act_next = pd.Series()
  self.forced_to_act_next = pd.Series()
  self.forced_to_act_next = pd.Series()
  self.forced_to_act_next = pd.Series()
  self.forced_to_act_next = pd.Series()
  self.forced_to_act_next = pd.Series()
  self.forced_to_act_next = pd.Series()
  self.forced_to_act_next = pd.Series()
  self.forced_to_act_next = pd.Series()
  self.forced_to_act_next = pd.Series()
  self.forced_to_act_next = pd.Series()
  self.forced_to_act_next = pd.Series()
  self.forced_to_act_next = pd.Series()
  self.forced_to_act_next = pd.Series()
  self.forced_to_act_next = pd.Series()
  self.forced_to_act_next = pd.Series()
  self.forced_to_act_next = pd.Series()
  self.forced_to_act_next = pd.Series()
  self.forced_to_act_next = pd.Series()
  self.forced_to_act_next = pd.Series()
