# Chapter 9 - Data Science
## Data Preparation

## 0 - Setting up the notebook

In [1]:
import json
import random
from datetime import date, timedelta

import faker

## 1 - Preparing the Data

In [2]:
# Create the single Faker instance that the cells below use (through the
# global name `fake`) to generate usernames, names, emails, ages and addresses.
fake = faker.Faker()

In [3]:
usernames_no = 1000  # how many distinct usernames we want
usernames = set()    # a set silently discards repeated usernames

# keep drawing until the set holds `usernames_no` different usernames
while usernames_no > len(usernames):
    usernames.add(fake.user_name())

In [4]:
def get_random_name_and_gender():
    """Return a random ``(name, gender)`` pair.

    ``gender`` is ``'F'`` when the random draw is at most 0.6 and ``'M'``
    otherwise, so roughly 60% of the generated users are female. The name
    comes from the matching Faker generator (`name_female` / `name_male`).
    """
    female_share = .6
    if random.random() <= female_share:
        return fake.name_female(), 'F'
    return fake.name_male(), 'M'

def get_users(usernames):
    """Create one complete user profile per username.

    This simulates user data coming from an API: the result is a list of
    JSON strings, one per user, in the iteration order of `usernames`.
    Each profile holds the keys 'username', 'name', 'gender', 'email',
    'age' and 'address', in that order.
    """
    def _profile_as_json(username):
        # Draw name and gender first, then the remaining Faker fields,
        # so the sequence of random draws stays the same for every user.
        name, gender = get_random_name_and_gender()
        profile = dict(username=username, name=name, gender=gender)
        profile['email'] = fake.email()
        profile['age'] = fake.random_int(min=18, max=90)
        profile['address'] = fake.address()
        return json.dumps(profile)

    return [_profile_as_json(username) for username in usernames]

# Build the list of JSON user strings (one per generated username)
# and preview the first three of them.
users = get_users(usernames)
users[:3]

['{"username": "susan42", "name": "Emily Smith", "gender": "F", "email": "vmckinney@leon.com", "age": 53, "address": "66537 Riley Mission Apt. 337\\nNorth Jennifer, NH 95781"}',
 '{"username": "sarahcarpenter", "name": "Michael Kane", "gender": "M", "email": "tamara51@yahoo.com", "age": 58, "address": "7129 Patrick Walks Suite 215\\nLaurenside, LA 97179"}',
 '{"username": "kevin37", "name": "Nathaniel Miller", "gender": "M", "email": "maria21@gmail.com", "age": 36, "address": "8247 Manning Burgs Suite 806\\nLopezshire, MS 06606"}']

In [5]:
# campaign name format:
# InternalType_StartDate_EndDate_TargetAge_TargetGender_Currency
def get_type():
    """Return one of the made-up internal campaign type codes at random."""
    return random.choice(['AKX', 'BYU', 'GRZ', 'KTR'])

def get_start_end_dates():
    """Return random ``(start, end)`` campaign dates as 'YYYYMMDD' strings.

    The campaign lasts between 1 and 730 days. Its start is today's date
    shifted by up to 365 days into the past or the future, and the end is
    the start plus the duration.
    """
    date_format = "%Y%m%d"
    # draw the duration before the offset to keep the random sequence stable
    duration = timedelta(days=random.randint(1, 2 * 365))
    offset = timedelta(days=random.randint(-365, 365))

    start = date.today() - offset
    end = start + duration
    return start.strftime(date_format), end.strftime(date_format)

def get_age():
    """Return a random target age range formatted as 'low-high'.

    The lower bound is a multiple of 5 between 20 and 45; the upper bound
    lies 5 to 25 years above it (also in steps of 5).
    """
    lower = random.randrange(20, 46, 5)
    span = random.randrange(5, 26, 5)
    upper = lower + span
    return '-'.join((str(lower), str(upper)))

def get_gender():
    """Return a random target gender code: 'M', 'F' or 'B'.

    NOTE(review): 'B' presumably stands for "both" -- confirm.
    """
    gender_codes = ('M', 'F', 'B')
    return random.choice(gender_codes)

def get_currency():
    """Return a random campaign currency code: 'GBP', 'EUR' or 'USD'."""
    currency_codes = ('GBP', 'EUR', 'USD')
    return random.choice(currency_codes)

def get_campaign_name():
    """Assemble a random campaign name.

    The name follows the format
    InternalType_StartDate_EndDate_TargetAge_TargetGender_Currency,
    with every field produced by its dedicated ``get_*`` helper.
    """
    kind = get_type()
    begins, finishes = get_start_end_dates()
    # the remaining helpers are called left to right, i.e. age, gender, currency
    fields = [kind, begins, finishes, get_age(), get_gender(), get_currency()]
    return '_'.join(fields)

In [6]:
def get_campaign_data():
    """Return a dict describing one random campaign.

    Keys, in order:
      cmp_name   -- name built by get_campaign_name()
      cmp_bgt    -- budget, random integer between 10**3 and 10**6
      cmp_spent  -- amount spent, random integer between 10**2 and the budget
      cmp_clicks -- clicks, triangular distribution between 10**2 and 10**5
                    with mode 0.2 * 10**5, truncated to int
      cmp_impr   -- impressions, gaussian with mean 0.5 * 10**6 and
                    sigma 2, truncated to int
    """
    campaign = {'cmp_name': get_campaign_name()}
    campaign['cmp_bgt'] = random.randint(10**3, 10**6)
    # the amount spent never exceeds the budget drawn just above
    campaign['cmp_spent'] = random.randint(10**2, campaign['cmp_bgt'])
    clicks_mode = 0.2 * 10**5
    campaign['cmp_clicks'] = int(random.triangular(10**2, 10**5, clicks_mode))
    campaign['cmp_impr'] = int(random.gauss(0.5 * 10**6, 2))
    return campaign

In [7]:
def get_data(users):
    """Assemble the final version of the rough data.

    Returns a list with one dict per user, each shaped like
    ``{'user': user_json, 'campaigns': [c1, c2, ...]}``, where `user_json`
    is the item taken from `users` (the JSON string version of a user data
    dict) and c1, c2, ... are campaign dicts as returned by
    get_campaign_data. Every user gets between 2 and 8 campaigns.
    """
    def _campaigns_for_one_user():
        how_many = random.randint(2, 8)
        return [get_campaign_data() for _ in range(how_many)]

    return [
        {'user': user, 'campaigns': _campaigns_for_one_user()}
        for user in users
    ]

## 2 - Cleaning the data

In [8]:
# Fetch the simulated rough data: a list with one
# {'user': ..., 'campaigns': [...]} dict per user in `users`.
rough_data = get_data(users)

rough_data[:2]  # let's take a peek at the first two entries

[{'user': '{"username": "susan42", "name": "Emily Smith", "gender": "F", "email": "vmckinney@leon.com", "age": 53, "address": "66537 Riley Mission Apt. 337\\nNorth Jennifer, NH 95781"}',
  'campaigns': [{'cmp_name': 'GRZ_20210131_20210411_30-40_F_GBP',
    'cmp_bgt': 253951,
    'cmp_spent': 17953,
    'cmp_clicks': 52573,
    'cmp_impr': 500001},
   {'cmp_name': 'BYU_20210109_20221204_30-35_M_GBP',
    'cmp_bgt': 150314,
    'cmp_spent': 125884,
    'cmp_clicks': 24575,
    'cmp_impr': 499999},
   {'cmp_name': 'GRZ_20211124_20220921_20-35_B_EUR',
    'cmp_bgt': 791397,
    'cmp_spent': 480963,
    'cmp_clicks': 39668,
    'cmp_impr': 499999},
   {'cmp_name': 'GRZ_20210727_20220211_35-45_B_EUR',
    'cmp_bgt': 910204,
    'cmp_spent': 339997,
    'cmp_clicks': 16698,
    'cmp_impr': 500000},
   {'cmp_name': 'BYU_20220216_20220407_20-25_F_EUR',
    'cmp_bgt': 393134,
    'cmp_spent': 158930,
    'cmp_clicks': 46631,
    'cmp_impr': 500000}]},
 {'user': '{"username": "sarahcarpenter", "n

In [9]:
# Let's start from having a different version of the data
# I want a list whose items will be dicts. Each dict is
# the original campaign dict plus the user JSON.
#
# A new dict is built for every campaign instead of updating the
# campaign in place, so rough_data is left exactly as the previous
# cell displayed it and this cell can be re-run safely.

data = []
for datum in rough_data:
    for campaign in datum['campaigns']:
        # copy of the campaign fields (same key order) with 'user' appended
        data.append(dict(campaign, user=datum['user']))
data[:2]  # let's take another peek

[{'cmp_name': 'GRZ_20210131_20210411_30-40_F_GBP',
  'cmp_bgt': 253951,
  'cmp_spent': 17953,
  'cmp_clicks': 52573,
  'cmp_impr': 500001,
  'user': '{"username": "susan42", "name": "Emily Smith", "gender": "F", "email": "vmckinney@leon.com", "age": 53, "address": "66537 Riley Mission Apt. 337\\nNorth Jennifer, NH 95781"}'},
 {'cmp_name': 'BYU_20210109_20221204_30-35_M_GBP',
  'cmp_bgt': 150314,
  'cmp_spent': 125884,
  'cmp_clicks': 24575,
  'cmp_impr': 499999,
  'user': '{"username": "susan42", "name": "Emily Smith", "gender": "F", "email": "vmckinney@leon.com", "age": 53, "address": "66537 Riley Mission Apt. 337\\nNorth Jennifer, NH 95781"}'}]

In [10]:
# Warning: Uncommenting and executing this cell will overwrite data.json
#with open('data.json', 'w') as stream:
#     stream.write(json.dumps(data))