## 👨‍🔬 Enrichment and transformations

#### 1. Загрузка данных о штрафах

In [102]:
import pandas as pd
import numpy as np
import requests

# Read the fines dataset and use the CarNumber column as the row index.
df_fines = pd.read_json("../ex02/auto.json").set_index("CarNumber")
df_fines.head()

Unnamed: 0_level_0,Refund,Fines,Make,Model
CarNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
E432XX77RUS,1,6500.0,Toyota,Camry
7184TT36RUS,1,2100.0,Ford,Focus
X582HE161RUS,2,2000.0,Ford,Focus
X4128H125RUS,1,7458.5,Ford,Focus
H234YH197RUS,2,6000.0,Ford,Focus


#### 2. Настройка отображения чисел

In [103]:
# Show floats with one decimal place in every DataFrame rendered from here on.
pd.set_option("display.float_format", '{:.1f}'.format)
df_fines.head()


Unnamed: 0_level_0,Refund,Fines,Make,Model
CarNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
E432XX77RUS,1,6500.0,Toyota,Camry
7184TT36RUS,1,2100.0,Ford,Focus
X582HE161RUS,2,2000.0,Ford,Focus
X4128H125RUS,1,7458.5,Ford,Focus
H234YH197RUS,2,6000.0,Ford,Focus


#### 3. Создание выборки из 200 новых записей и добавление случайных годов

In [104]:
# Seed NumPy's global generator so the generated years are reproducible.
np.random.seed(21)

# One random year per fine, drawn from 1980..2019 (randint's upper bound is exclusive).
df_fines["Year"] = np.random.randint(low=1980, high=2020, size=len(df_fines))

# Draw 200 rows with replacement (so rows may repeat) ...
df_sample = df_fines.sample(n=200, replace=True, random_state=21)

# ... and stack them underneath the original rows.
df_concat_rows = pd.concat([df_fines, df_sample], axis=0)

#### 5. Загрузка списка фамилий

In [105]:
file_path = "../../datasets/surname.json"

surnames_raw = pd.read_json(file_path)

# The first record of the file holds the column names: promote it to the
# header, drop it from the data, and keep only the NAME column as "Surname".
header_row = surnames_raw.iloc[0]
df_surnames = (
    surnames_raw.iloc[1:]
    .set_axis(header_row, axis=1)
    .reset_index(drop=True)
    .loc[:, ["NAME"]]
    .rename(columns={"NAME": "Surname"})
)

df_surnames.head()

Unnamed: 0,Surname
0,ADAMS
1,ALLEN
2,ALVAREZ
3,ANDERSON
4,BAILEY


#### 6. Создание базы владельцев машин

In [106]:
# Take the distinct car numbers once and derive the count from that same
# array. Index.nunique() skips missing values while Index.unique() keeps them,
# so computing the two separately could produce columns of different lengths
# and make the DataFrame constructor fail.
car_numbers = df_concat_rows.index.unique()
unique_cars = len(car_numbers)

# Surnames are drawn with replacement, so the same surname may be assigned to
# several cars; random_state keeps the assignment reproducible.
top_surnames = (
    df_surnames["Surname"]
    .sample(n=unique_cars, replace=True, random_state=21)
    .to_numpy()
)

# One owner row per distinct car number.
df_owners = pd.DataFrame({"CarNumber": car_numbers, "SURNAME": top_surnames})

df_owners.head()

Unnamed: 0,CarNumber,SURNAME
0,E432XX77RUS,RICHARDSON
1,7184TT36RUS,ROSS
2,X582HE161RUS,MORGAN
3,X4128H125RUS,BAILEY
4,H234YH197RUS,LOPEZ


#### 7. Добавление новых и удаление старых записей

In [107]:
# Five hand-written fines: (CarNumber, Refund, Fines, Make, Model).
new_fine_records = [
    ("A111AA777RUS", 1, 5000, "Tesla", "Model S"),
    ("B222BB199RUS", 2, 3000, "BMW", "X5"),
    ("C333CC77RUS", 1, 7000, "Audi", "A6"),
    ("D444DD97RUS", 2, 2000, "Mercedes", "E-Class"),
    ("E555EE99RUS", 1, 1000, "Lexus", "RX"),
]
new_fines = pd.DataFrame(
    new_fine_records,
    columns=["CarNumber", "Refund", "Fines", "Make", "Model"],
)
# Years come from NumPy's global generator, so the values depend on the
# random draws made by the cells executed before this one.
new_fines["Year"] = np.random.randint(1980, 2020, 5)
new_fines = new_fines.set_index("CarNumber")

# NOTE: df_fines is rebound here, so re-running this cell on its own appends
# the same five car numbers a second time.
df_fines = pd.concat([df_fines, new_fines])
df_fines.head()


Unnamed: 0_level_0,Refund,Fines,Make,Model,Year
CarNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
E432XX77RUS,1,6500.0,Toyota,Camry,1989
7184TT36RUS,1,2100.0,Ford,Focus,1995
X582HE161RUS,2,2000.0,Ford,Focus,1984
X4128H125RUS,1,7458.5,Ford,Focus,2015
H234YH197RUS,2,6000.0,Ford,Focus,2014


In [108]:
# Remove the last five owners so that some fined cars end up without an owner.
# NOTE: df_owners is rebound, so re-running this cell on its own trims five
# more rows and appends the three new owners again.
df_owners = df_owners.iloc[:-5]

# Three extra owners (presumably for cars that are absent from the fines
# table - verify against the source data).
new_owners = pd.DataFrame({
    "CarNumber": ["X999XX197RUS", "Y888YY77RUS", "Z777ZZ99RUS"],
    "SURNAME": ["SMITH", "JOHNSON", "BROWN"]
})

# ignore_index=True renumbers the rows; without it the appended rows keep
# their own labels 0, 1, 2, which already exist in df_owners, and label-based
# lookups such as df_owners.loc[0] would return two rows.
df_owners = pd.concat([df_owners, new_owners], ignore_index=True)
df_owners.head()

Unnamed: 0,CarNumber,SURNAME
0,E432XX77RUS,RICHARDSON
1,7184TT36RUS,ROSS
2,X582HE161RUS,MORGAN
3,X4128H125RUS,BAILEY
4,H234YH197RUS,LOPEZ


#### 8. Объединяем данные:

1. Только номера, которые есть в обоих датасетах

In [109]:
# Inner join: keep only the car numbers that occur in both fines and owners.
df_inner = pd.merge(df_fines, df_owners, how="inner", on="CarNumber")
print(df_inner.head())

      CarNumber  Refund  Fines    Make  Model  Year     SURNAME
0   E432XX77RUS       1 6500.0  Toyota  Camry  1989  RICHARDSON
1   7184TT36RUS       1 2100.0    Ford  Focus  1995        ROSS
2  X582HE161RUS       2 2000.0    Ford  Focus  1984      MORGAN
3  X4128H125RUS       1 7458.5    Ford  Focus  2015      BAILEY
4  H234YH197RUS       2 6000.0    Ford  Focus  2014       LOPEZ


2. Все номера из обоих датасетов

In [110]:
# Outer join: keep every car number from either table; columns coming from
# the side that has no match are filled with NaN.
df_outer = pd.merge(df_fines, df_owners, how="outer", on="CarNumber")
print(df_outer.head())

      CarNumber  Refund  Fines  Make  Model   Year SURNAME
0  704687163RUS     2.0 1400.0  Ford  Focus 2003.0    ROSS
1  704787163RUS     2.0 2800.0  Ford  Focus 1996.0   MOORE
2  704987163RUS     2.0 7458.5  Ford  Focus 2019.0    RUIZ
3  705287163RUS     2.0 2000.0  Ford  Focus 2015.0  BROOKS
4  705387163RUS     2.0  700.0  Ford  Focus 1987.0   YOUNG


3. Только номера из штрафов

In [111]:
# Left join: keep every row of fines; SURNAME is NaN where no owner is known.
df_left = pd.merge(df_fines, df_owners, how="left", on="CarNumber")
print(df_left.head())

      CarNumber  Refund  Fines    Make  Model  Year     SURNAME
0   E432XX77RUS       1 6500.0  Toyota  Camry  1989  RICHARDSON
1   7184TT36RUS       1 2100.0    Ford  Focus  1995        ROSS
2  X582HE161RUS       2 2000.0    Ford  Focus  1984      MORGAN
3  X4128H125RUS       1 7458.5    Ford  Focus  2015      BAILEY
4  H234YH197RUS       2 6000.0    Ford  Focus  2014       LOPEZ


4. Только номера из владельцев

In [112]:
# Right join: keep every row of owners; the fines columns are NaN for owners
# whose car has no fine, and an owner is repeated once per matching fine.
df_right = pd.merge(df_fines, df_owners, how="right", on="CarNumber")
print(df_right.head())

      CarNumber  Refund   Fines    Make  Model   Year     SURNAME
0   E432XX77RUS     1.0  6500.0  Toyota  Camry 1989.0  RICHARDSON
1   E432XX77RUS     2.0 13000.0  Toyota  Camry 1983.0  RICHARDSON
2   7184TT36RUS     1.0  2100.0    Ford  Focus 1995.0        ROSS
3  X582HE161RUS     2.0  2000.0    Ford  Focus 1984.0      MORGAN
4  X4128H125RUS     1.0  7458.5    Ford  Focus 2015.0      BAILEY


#### 9. Создаём сводную таблицу и сохраняем результат

In [113]:
# Total fines per (Make, Model) row and per Year column; combinations with
# no fines stay NaN.
df_pivot = pd.pivot_table(
    df_fines,
    index=["Make", "Model"],
    columns="Year",
    values="Fines",
    aggfunc="sum",
)
# Make sure the year labels are plain integers.
df_pivot.columns = df_pivot.columns.astype(int)
df_pivot.head()

Unnamed: 0_level_0,Year,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
Make,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Audi,A6,,,,,,,,,7000.0,,...,,,,,,,,,,
BMW,X5,,,,,,,,3000.0,,,...,,,,,,,,,,
Ford,Focus,34400.0,73700.0,57058.5,43058.5,44258.5,72258.5,117758.5,113500.0,142300.0,108658.5,...,65100.0,85800.0,182800.0,260117.1,61917.1,68917.1,66000.0,99858.5,53100.0,47917.1
Ford,Mondeo,,,,,,,,8600.0,,,...,,,,,,1100.0,,,,34400.0
Lexus,RX,,,,,,,,,,,...,,,,,,,,,,


In [114]:
# The pivot table keeps its (Make, Model) index in the file because those
# labels are what identify each row.
df_pivot.to_csv("pivot_table.csv")

# CarNumber is the index of df_fines, so it is turned back into a regular
# column first; writing with index=False directly would silently drop the car
# numbers from fines.csv.
df_fines.reset_index().to_csv("fines.csv", index=False)

# df_owners already stores CarNumber as a column, so its row labels carry no
# information and are omitted.
df_owners.to_csv("owners.csv", index=False)