In [1]:
### START of Spark initial setup ###

# NOTE: Set up and initialize the Spark session here and bind it to the name `spark`.
# The rest of the code assumes that pyspark is accessible and that `spark` is defined.

### END of Spark initial setup ###

In [2]:
import numpy as np
import pandas as pd
from numpy import random

from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, IntegerType, StringType, DoubleType

import importlib
import pdb  # kept for interactive debugging; not referenced by the visible cells
import warnings

import datagen

# Reload *before* the star import. importlib.reload() re-executes the module in
# place but does not rebind names that were already copied into this namespace
# by `from datagen import *`, so reloading afterwards would leave the notebook
# calling the stale, pre-reload helpers whenever datagen.py is edited.
importlib.reload(datagen)
from datagen import *  # presumably supplies gen_people_table, add_n_impressions, ...

# Silence all Python warnings for the remainder of the notebook session.
warnings.filterwarnings('ignore')

# Generating a synthetic table of impressions
The following steps show how to generate a synthetic impression table which can be used by the virtual ID assignment algorithm. This is based on a census file that is provided in the repository as `census.csv`.

The process starts with generating a list of people whose rates follow a power-law distribution. Here we use the Lomax (Pareto type II) distribution with shape $\alpha$ and scale $m$:
$$
    L(x) = \frac{\alpha}{m} \left(1 + \frac{x}{m}\right)^{-(\alpha + 1)}
$$
We then build the table `df_people_n` by assigning each person a randomly chosen number of impressions based on the rate generated in the previous step. Finally, we create the `df_impressions` table by assigning a random timestamp to each impression.

In [3]:
# Read the synthetic census CSV into pandas first, then hand the resulting
# frame to the Spark session (`spark` comes from the setup cell).
census_path = "census.csv"
census = pd.read_csv(census_path)
df_census = spark.createDataFrame(census)

In [4]:
# Demographic attribute columns passed to the generator helpers.
demo_cols = [
    "gender",
    "ethnicity",
    "age_range",
    "education",
    "income_code",
]

# Build the people table from the census; per the helper's contract this also
# assigns a rate to each person. The result is cached because later cells
# read it again.
df_people = (
    gen_people_table(df_census, demo_cols)
    .cache()
)

In [None]:
# Approximate total number of impressions to distribute over the population.
TOTAL_IMPRESSIONS = 10**9

# Sum the census `population` column. The aggregate is collected as a single
# row, so [0][0] pulls out the scalar total.
# `F` is pyspark.sql.functions, imported explicitly in the imports cell rather
# than relied upon as a side effect of `from datagen import *`.
population_total = df_census.agg(F.sum("population")).collect()[0][0]

# Add a number of impressions to each person based on the approximate total
# number of impressions provided.
df_people_n = add_n_impressions(
    df_people,
    I=TOTAL_IMPRESSIONS,
    population_total=population_total,
)

In [None]:
# Preview the people table with its per-person impression counts.
# Assuming `df_people_n` is a Spark DataFrame, `show()` prints its leading
# rows -- TODO confirm against the return type of `add_n_impressions`.
df_people_n.show()

In [None]:
# Time window from which the random impression timestamps are drawn.
start_ts, end_ts = "2020-03-01 00:00:00", "2020-07-01 00:00:00"

# Expand the per-person impression counts into an impression table, giving
# every impression a random timestamp between `start_ts` and `end_ts`.
df_impressions = gen_synthetic_impressions(
    df_people_n,
    start_ts,
    end_ts,
    demo_cols,
)

In [None]:
# Preview the generated impression table.
# Assuming `df_impressions` is a Spark DataFrame, `show()` prints its leading
# rows -- TODO confirm against the return type of `gen_synthetic_impressions`.
df_impressions.show()