In [21]:
import pandas as pd
import numpy as np

## read the JSON file from ex02

In [22]:
df = pd.read_json('../data/auto.json',
                 orient='records')
df

Unnamed: 0,CarNumber,Refund,Fines,Make,Model
0,Y163O8161RUS,2.00,3200.00,Ford,Focus
1,E432XX77RUS,1.00,6500.00,Toyota,Camry
2,7184TT36RUS,1.00,2100.00,Ford,Focus
3,X582HE161RUS,2.00,2000.00,Ford,Focus
4,92918M178RUS,1.00,5700.00,Ford,Focus
...,...,...,...,...,...
720,Y163O8161RUS,2.00,1600.00,Ford,Focus
721,M0309X197RUS,1.00,22300.00,Ford,Focus
722,O673E8197RUS,2.00,600.00,Ford,Focus
723,8610T8154RUS,1.00,2000.00,Ford,Focus


## one of the columns has the float type, so let us define the format of it in pandas using pd.options.display.float_format: floats should be displayed with two decimals

In [23]:
pd.options.display.float_format = '{:,.2f}'.format
df

Unnamed: 0,CarNumber,Refund,Fines,Make,Model
0,Y163O8161RUS,2.00,3200.00,Ford,Focus
1,E432XX77RUS,1.00,6500.00,Toyota,Camry
2,7184TT36RUS,1.00,2100.00,Ford,Focus
3,X582HE161RUS,2.00,2000.00,Ford,Focus
4,92918M178RUS,1.00,5700.00,Ford,Focus
...,...,...,...,...,...
720,Y163O8161RUS,2.00,1600.00,Ford,Focus
721,M0309X197RUS,1.00,22300.00,Ford,Focus
722,O673E8197RUS,2.00,600.00,Ford,Focus
723,8610T8154RUS,1.00,2000.00,Ford,Focus


## A sample with 200 new observations with random_state = 21

1. the sample should not have new combinations of the car number, make and model, so the whole dataset will be consistent in these terms
2. there are no restrictions on the refund and fines, you can take any value from these columns at random and use it towards any car number

In [24]:
sample = df.sample(n=200, random_state=21)
sample

Unnamed: 0,CarNumber,Refund,Fines,Make,Model
445,M0299X197RUS,2.00,19200.00,Ford,Focus
22,83298C154RUS,2.00,8594.60,Ford,Focus
93,H957HY161RUS,1.00,2000.00,Ford,Focus
173,T941CC96RUS,1.00,2000.00,Ford,Focus
697,H966HY161RUS,1.00,500.00,Ford,Focus
...,...,...,...,...,...
14,8182XX154RUS,1.00,200.00,Ford,Focus
623,X796TH96RUS,1.00,500.00,Ford,Focus
498,T011MY163RUS,2.00,4000.00,Ford,Focus
536,T341CC96RUS,2.00,1000.00,Volkswagen,Passat


## concatenate the sample with the initial dataframe to a new dataframe con- cat_rows

In [25]:
concat_rows = pd.concat(objs=[df, sample], ignore_index=True)
concat_rows

Unnamed: 0,CarNumber,Refund,Fines,Make,Model
0,Y163O8161RUS,2.00,3200.00,Ford,Focus
1,E432XX77RUS,1.00,6500.00,Toyota,Camry
2,7184TT36RUS,1.00,2100.00,Ford,Focus
3,X582HE161RUS,2.00,2000.00,Ford,Focus
4,92918M178RUS,1.00,5700.00,Ford,Focus
...,...,...,...,...,...
920,8182XX154RUS,1.00,200.00,Ford,Focus
921,X796TH96RUS,1.00,500.00,Ford,Focus
922,T011MY163RUS,2.00,4000.00,Ford,Focus
923,T341CC96RUS,2.00,1000.00,Volkswagen,Passat


## Enrich the dataframe concat_rows by a new column with the data generated

1. create a series with the name Year using random integers from 1980 to 2019
2. use np.random.seed(21) before generating the years
3. concatenate the series with the dataframe and name it fines

In [26]:
np.random.seed = 21
years = pd.Series(np.random.randint(1980, 2019) for i in range (len(concat_rows)))
concat_rows['Year'] = years
concat_rows

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
0,Y163O8161RUS,2.00,3200.00,Ford,Focus,1985
1,E432XX77RUS,1.00,6500.00,Toyota,Camry,1985
2,7184TT36RUS,1.00,2100.00,Ford,Focus,1987
3,X582HE161RUS,2.00,2000.00,Ford,Focus,1981
4,92918M178RUS,1.00,5700.00,Ford,Focus,2007
...,...,...,...,...,...,...
920,8182XX154RUS,1.00,200.00,Ford,Focus,2015
921,X796TH96RUS,1.00,500.00,Ford,Focus,1981
922,T011MY163RUS,2.00,4000.00,Ford,Focus,1997
923,T341CC96RUS,2.00,1000.00,Volkswagen,Passat,1982


## Enrich the dataframe with the data from another dataframe

In [27]:
surname = pd.read_json('../data/surname.json', orient='values')
surname.columns = surname.iloc[0]
surname.drop(0, inplace=True)
surname

Unnamed: 0,NAME,COUNT,RANK
1,ADAMS,427865,42
2,ALLEN,482607,33
3,ALVAREZ,233983,92
4,ANDERSON,784404,15
5,BAILEY,277845,72
...,...,...,...
96,WILLIAMS,1625252,3
97,WILSON,801882,14
98,WOOD,250715,84
99,WRIGHT,458980,35


create a new dataframe with the car numbers and their owners
get the most popular surnames (you can find the file surname.json in the attachments) in the US
create a new series with the surnames (they should not have special char- acters like commas, brackets, etc.) from the data you gathered, the count should be equal to the number of unique car numbers using the sample (use random_state = 21)
create the dataframe owners with 2 columns: CarNumber and SURNAME

In [28]:
car_numbers = concat_rows.drop_duplicates('CarNumber')['CarNumber']
car_numbers = car_numbers.to_frame(name='CarNumber').reset_index(drop=True)
car_numbers

Unnamed: 0,CarNumber
0,Y163O8161RUS
1,E432XX77RUS
2,7184TT36RUS
3,X582HE161RUS
4,92918M178RUS
...,...
526,O136HO197RUS
527,O22097197RUS
528,M0309X197RUS
529,O673E8197RUS


In [29]:
surnames = surname['NAME'].sample(n=len(car_numbers), random_state=21, replace=True)
surnames = surnames.to_frame(name='SURNAME').reset_index(drop=True)
surnames

Unnamed: 0,SURNAME
0,RICHARDSON
1,ROSS
2,MORGAN
3,BAILEY
4,LOPEZ
...,...
526,CAMPBELL
527,HALL
528,BAKER
529,DIAZ


In [30]:
owners = pd.concat([car_numbers, surnames], axis = 1, ignore_index=False)
owners

Unnamed: 0,CarNumber,SURNAME
0,Y163O8161RUS,RICHARDSON
1,E432XX77RUS,ROSS
2,7184TT36RUS,MORGAN
3,X582HE161RUS,BAILEY
4,92918M178RUS,LOPEZ
...,...,...
526,O136HO197RUS,CAMPBELL
527,O22097197RUS,HALL
528,M0309X197RUS,BAKER
529,O673E8197RUS,DIAZ


Добавьте еще 5 наблюдений к фрейму данных о штрафах (предложите свои собственные идеи CarNumber и т. д.)

In [31]:
new_entries = pd.DataFrame({
    "CarNumber": ["NEW1", "NEW2", "NEW3", "NEW4", "NEW5"],
    "Refund": [0, 0, 1, 1, 2],
    "Fines": [500, 1000, 750, 1200, 3000],
    "Make": ["Lada", "Toyota", "Chevrolet", "Audi", "Tesla"],
    "Model": ["Granta", "Camry", "Aveo", "A4", "Model S"],
    "Year": [2018, 2019, 2020, 2017, 2016]
})
new_entries


Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
0,NEW1,0,500,Lada,Granta,2018
1,NEW2,0,1000,Toyota,Camry,2019
2,NEW3,1,750,Chevrolet,Aveo,2020
3,NEW4,1,1200,Audi,A4,2017
4,NEW5,2,3000,Tesla,Model S,2016


In [32]:
fines = pd.concat([concat_rows, new_entries], ignore_index=True)
fines


Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
0,Y163O8161RUS,2.00,3200.00,Ford,Focus,1985
1,E432XX77RUS,1.00,6500.00,Toyota,Camry,1985
2,7184TT36RUS,1.00,2100.00,Ford,Focus,1987
3,X582HE161RUS,2.00,2000.00,Ford,Focus,1981
4,92918M178RUS,1.00,5700.00,Ford,Focus,2007
...,...,...,...,...,...,...
925,NEW1,0.00,500.00,Lada,Granta,2018
926,NEW2,0.00,1000.00,Toyota,Camry,2019
927,NEW3,1.00,750.00,Chevrolet,Aveo,2020
928,NEW4,1.00,1200.00,Audi,A4,2017


delete the dataframe last 20 observations from the owners and add 3 new observations (they are not the same as those you add to the fines dataframe)


In [33]:
owners.drop(owners.tail(20).index, inplace=True)
owners

Unnamed: 0,CarNumber,SURNAME
0,Y163O8161RUS,RICHARDSON
1,E432XX77RUS,ROSS
2,7184TT36RUS,MORGAN
3,X582HE161RUS,BAILEY
4,92918M178RUS,LOPEZ
...,...,...
506,T914CT197RUS,HERNANDEZ
507,E41977152RUS,BAKER
508,9464EX178RUS,MARTIN
509,O50197197RUS,WRIGHT


In [34]:
new_owners = pd.DataFrame({
    "CarNumber" : ["NEW6", "NEW7", "NEW8"],
    "SURNAME" : ["Vika", "Tania", "Sasha"]
})
owners = pd.concat([owners, new_owners], ignore_index=True)
owners

Unnamed: 0,CarNumber,SURNAME
0,Y163O8161RUS,RICHARDSON
1,E432XX77RUS,ROSS
2,7184TT36RUS,MORGAN
3,X582HE161RUS,BAILEY
4,92918M178RUS,LOPEZ
...,...,...
509,O50197197RUS,WRIGHT
510,7608EE777RUS,HILL
511,NEW6,Vika
512,NEW7,Tania


## join both dataframes:


the new dataframe should have only the car numbers that exist in both dataframes

новый фрейм данных должен содержать только те номера автомобилей, которые существуют в обоих фреймах данных

In [35]:
fines_innerNumbers = pd.merge(left=fines, right=owners, on='CarNumber', how='inner')
fines_innerNumbers

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,SURNAME
0,Y163O8161RUS,2.00,3200.00,Ford,Focus,1985,RICHARDSON
1,E432XX77RUS,1.00,6500.00,Toyota,Camry,1985,ROSS
2,7184TT36RUS,1.00,2100.00,Ford,Focus,1987,MORGAN
3,X582HE161RUS,2.00,2000.00,Ford,Focus,1981,BAILEY
4,92918M178RUS,1.00,5700.00,Ford,Focus,2007,LOPEZ
...,...,...,...,...,...,...,...
894,8182XX154RUS,1.00,200.00,Ford,Focus,2015,SMITH
895,X796TH96RUS,1.00,500.00,Ford,Focus,1981,WATSON
896,T011MY163RUS,2.00,4000.00,Ford,Focus,1997,SANDERS
897,T341CC96RUS,2.00,1000.00,Volkswagen,Passat,1982,PEREZ


the new dataframe should have all the car numbers that exist in both dataframes

новый фрейм данных должен содержать все номера автомобилей, которые существуют в обоих фреймах данных

In [36]:
fines_allNumbers = pd.merge(left=fines, right=owners, on='CarNumber', how='outer')
fines_allNumbers

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,SURNAME
0,704687163RUS,2.00,1400.00,Ford,Focus,2013.00,ADAMS
1,704787163RUS,2.00,2800.00,Ford,Focus,2008.00,MORGAN
2,704987163RUS,2.00,8594.60,Ford,Focus,2004.00,MITCHELL
3,705287163RUS,2.00,2000.00,Ford,Focus,2000.00,GOMEZ
4,705387163RUS,2.00,700.00,Ford,Focus,1993.00,STEWART
...,...,...,...,...,...,...,...
928,Y973O8197RUS,2.00,8594.60,Ford,Focus,2007.00,YOUNG
929,Y973O8197RUS,1.00,34800.00,Ford,Focus,1988.00,YOUNG
930,Y973O8197RUS,1.00,69600.00,Ford,Focus,2009.00,YOUNG
931,Y973O8197RUS,1.00,34800.00,Ford,Focus,2008.00,YOUNG


the new dataframe should have only the car numbers from the fines dataframe

новый фрейм данных должен содержать только номера автомобилей из фрейма данных о штрафах

In [37]:
fines_finesNumbers = pd.merge(left=fines, right=owners, on='CarNumber', how='left')
fines_finesNumbers

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,SURNAME
0,Y163O8161RUS,2.00,3200.00,Ford,Focus,1985,RICHARDSON
1,E432XX77RUS,1.00,6500.00,Toyota,Camry,1985,ROSS
2,7184TT36RUS,1.00,2100.00,Ford,Focus,1987,MORGAN
3,X582HE161RUS,2.00,2000.00,Ford,Focus,1981,BAILEY
4,92918M178RUS,1.00,5700.00,Ford,Focus,2007,LOPEZ
...,...,...,...,...,...,...,...
925,NEW1,0.00,500.00,Lada,Granta,2018,
926,NEW2,0.00,1000.00,Toyota,Camry,2019,
927,NEW3,1.00,750.00,Chevrolet,Aveo,2020,
928,NEW4,1.00,1200.00,Audi,A4,2017,


the new dataframe should have only the car numbers from the owners dataframe

новый фрейм данных должен содержать только номера автомобилей из фрейма данных владельцев

In [38]:
fines_ownersNumber = pd.merge(left=fines, right=owners, on='CarNumber', how='right')
fines_ownersNumber

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,SURNAME
0,Y163O8161RUS,2.00,3200.00,Ford,Focus,1985.00,RICHARDSON
1,Y163O8161RUS,2.00,1600.00,Ford,Focus,2000.00,RICHARDSON
2,E432XX77RUS,1.00,6500.00,Toyota,Camry,1985.00,ROSS
3,E432XX77RUS,2.00,13000.00,Toyota,Camry,2007.00,ROSS
4,7184TT36RUS,1.00,2100.00,Ford,Focus,1987.00,MORGAN
...,...,...,...,...,...,...,...
897,7608EE777RUS,1.00,4000.00,Skoda,Octavia,2006.00,HILL
898,7608EE777RUS,1.00,4000.00,Skoda,Octavia,1998.00,HILL
899,NEW6,,,,,,Vika
900,NEW7,,,,,,Tania


In [39]:
pivot_table = fines.pivot_table(
    values="Fines", index="Make", columns="Year", aggfunc="sum", fill_value=0
)
pivot_table

Year,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
Make,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Audi,0.0,0.0,0.0,0.0,0.0,0.0,4200.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1200.0,0.0,0.0,0.0
BMW,3000.0,0.0,0.0,0.0,0.0,0.0,0.0,6500.0,0.0,0.0,...,0.0,8594.6,0.0,0.0,0.0,3000.0,0.0,0.0,0.0,0.0
Chevrolet,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,750.0
Ford,268162.2,163189.2,156300.0,91289.2,213778.4,66694.6,182494.6,246883.8,137194.6,163183.8,...,62300.0,118300.0,124894.6,336989.2,186100.0,129383.8,74078.4,217589.2,0.0,0.0
Lada,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,500.0,0.0,0.0
Skoda,0.0,0.0,500.0,5000.0,163094.6,0.0,0.0,8594.6,162594.6,12394.6,...,0.0,12800.0,400.0,17294.6,22800.0,3900.0,0.0,37800.0,0.0,0.0
Tesla,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3000.0,0.0,0.0,0.0,0.0
Toyota,54200.0,7700.0,0.0,30300.0,0.0,7400.0,6600.0,16194.6,4400.0,23600.0,...,7800.0,0.0,7500.0,32394.6,3400.0,12000.0,12700.0,0.0,1000.0,0.0
Volkswagen,1000.0,9900.0,3900.0,6800.0,0.0,10600.0,77000.0,11594.6,0.0,4200.0,...,700.0,24194.6,2500.0,17894.6,9900.0,8794.6,20800.0,35900.0,0.0,0.0
Volvo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
fines.to_csv("../data/fines.csv", index=False)
owners.to_csv("../data/owners.csv", index=False)