# Exercise 04: Enrichment and Transformations

In [None]:
import pandas as pd
import numpy as np
import json

In [None]:
# 1. Load data
# Show floats with two decimal places whenever a frame is displayed.
pd.set_option('display.float_format', '{:.2f}'.format)
# The input file is read as a JSON array of row records.
df = pd.read_json(path_or_buf='../ex02/auto.json', orient='records')
df.head()

In [None]:
# 2. Enrich with sample
# Draw 200 rows with a fixed seed so the selection is reproducible.
sample = df.sample(n=200, random_state=21)
# Stack the sampled rows under the original ones; the sampled rows keep
# their original index labels, so labels repeat in the result.
concat_rows = pd.concat([df, sample], axis=0)
concat_rows.head()

In [None]:
# 3. Create Year column
# Re-seed NumPy's global generator so the drawn years are reproducible.
np.random.seed(21)
# One random year per row; the upper bound is exclusive, so values fall
# in 1980..2019.
years = np.random.randint(low=1980, high=2020, size=len(concat_rows))
# `fines` is a second name for the same frame (no copy is made), so the
# new column is visible through `concat_rows` as well.
fines = concat_rows
fines['Year'] = years
fines.head()

In [None]:
# 4. Enrich with owners
# Load surnames. The encoding is given explicitly so the file is decoded
# the same way on every platform instead of with the locale default.
with open('../../datasets/surname.json', encoding='utf-8') as f:
    surnames_json = json.load(f)

# Extract surnames from list of lists (skipping header)
# Structure: [["NAME",...], ["ADAMS",...], ...]
surnames = [row[0] for row in surnames_json[1:]]

# One owner per distinct car number currently present in fines.
unique_cars = fines['CarNumber'].unique()
# Re-seed so the surname assignment is reproducible.
np.random.seed(21)
# Sampling is with replacement (NumPy's default), so a surname may be
# assigned to more than one car.
sampled_surnames = np.random.choice(surnames, size=len(unique_cars))

owners = pd.DataFrame({
    'CarNumber': unique_cars,
    'SURNAME': sampled_surnames
})
owners.head()

In [None]:
# Append 5 observations to fines
# Five hand-written rows stacked under the existing fines.
new_fines = pd.DataFrame(
    {
        'CarNumber': ['NEW1', 'NEW2', 'NEW3', 'NEW4', 'NEW5'],
        'Refund': [1, 2, 3, 4, 5],
        'Fines': [100, 200, 300, 400, 500],
        'Make': ['NewMake'] * 5,
        'Model': ['NewModel'] * 5,
        'Year': [2020] * 5,
    }
)
fines = pd.concat([fines, new_fines])

# Delete last 20 from owners, add 3 new
# A negative count keeps every row except the final 20.
owners = owners.head(-20)
new_owners = pd.DataFrame(
    [
        ('NEW_OWNER1', 'SURNAME1'),
        ('NEW_OWNER2', 'SURNAME2'),
        ('NEW_OWNER3', 'SURNAME3'),
    ],
    columns=['CarNumber', 'SURNAME'],
)
owners = pd.concat([owners, new_owners])

# Join
# The same key is joined four ways; the two tables no longer cover the
# same car numbers, so the resulting shapes can be compared.
logger_inner = pd.merge(fines, owners, on='CarNumber', how='inner')
logger_outer = pd.merge(fines, owners, on='CarNumber', how='outer')
logger_left = pd.merge(fines, owners, on='CarNumber', how='left')
logger_right = pd.merge(fines, owners, on='CarNumber', how='right')

# Report (rows, columns) for each join type.
print(f"Inner: {logger_inner.shape}")
print(f"Outer: {logger_outer.shape}")
print(f"Left: {logger_left.shape}")
print(f"Right: {logger_right.shape}")

In [None]:
# 5. Pivot table
# Total fines for each (Make, Model) row, with one column per Year.
# The aggregation is named by string: passing the callable np.sum is
# deprecated in pandas 2.x (it emits a FutureWarning), while 'sum' selects
# the same built-in group-wise sum.
pivot = pd.pivot_table(
    fines,
    values='Fines',
    index=['Make', 'Model'],
    columns=['Year'],
    aggfunc='sum',
)
pivot.head()

In [None]:
# 6. Save files
# Write both tables to the working directory; the row index is left out
# of the output files.
fines.to_csv(path_or_buf='fines.csv', index=False)
owners.to_csv(path_or_buf='owners.csv', index=False)