# Exercise 04 : Enrichment and transformations

## Imports

In [2]:
import pandas as pd
import numpy as np
import requests

## Read the JSON file that you saved in ex02

In [12]:
df = pd.read_json('../data/auto.json', orient='records')
df.head()

Unnamed: 0,CarNumber,Refund,Fines,Make,Model
0,Y163O8161RUS,2,3200.0,Ford,Focus
1,E432XX77RUS,1,6500.0,Toyota,Camry
2,7184TT36RUS,1,2100.0,Ford,Focus
3,X582HE161RUS,2,2000.0,Ford,Focus
4,92918M178RUS,1,5700.0,Ford,Focus


## One of the columns has the float type, so let us define the format of it in pandas using pd.options.display.float_format: floats should be displayed with two decimals

In [10]:
pd.options.display.float_format = '{:.2f}'.format

In [28]:
df

Unnamed: 0,CarNumber,Refund,Fines,Make,Model
0,Y163O8161RUS,2,3200.00,Ford,Focus
1,E432XX77RUS,1,6500.00,Toyota,Camry
2,7184TT36RUS,1,2100.00,Ford,Focus
3,X582HE161RUS,2,2000.00,Ford,Focus
4,92918M178RUS,1,5700.00,Ford,Focus
...,...,...,...,...,...
720,Y163O8161RUS,2,1600.00,Ford,Focus
721,M0309X197RUS,1,22300.00,Ford,Focus
722,O673E8197RUS,2,600.00,Ford,Focus
723,8610T8154RUS,1,2000.00,Ford,Focus


## Enrich the dataframe using a sample from that dataframe
- create a sample with 200 new observations with random_state = 21

In [15]:
sample = df.sample(200, random_state=21)
sample.head()

Unnamed: 0,CarNumber,Refund,Fines,Make,Model
445,M0299X197RUS,2,19200.0,Ford,Focus
22,83298C154RUS,2,8594.59,Ford,Focus
93,H957HY161RUS,1,2000.0,Ford,Focus
173,T941CC96RUS,1,2000.0,Ford,Focus
697,H966HY161RUS,1,500.0,Ford,Focus


- the sample should not have new combinations of the car number, make and model, so the whole dataset will be consistent in these terms
- there are no restrictions on the refund and fines, you can take any value from these columns at random and use it towards any car number

In [20]:
sample['Refund'] = np.random.choice(df['Refund'], size=len(sample))
sample['Fines'] = np.random.choice(df['Fines'], size=len(sample))

In [21]:
sample

Unnamed: 0,CarNumber,Refund,Fines,Make,Model
445,M0299X197RUS,1,1500.00,Ford,Focus
22,83298C154RUS,2,4000.00,Ford,Focus
93,H957HY161RUS,1,2200.00,Ford,Focus
173,T941CC96RUS,1,21000.00,Ford,Focus
697,H966HY161RUS,2,6800.00,Ford,Focus
...,...,...,...,...,...
14,8182XX154RUS,1,500.00,Ford,Focus
623,X796TH96RUS,2,500.00,Ford,Focus
498,T011MY163RUS,1,103600.00,Ford,Focus
536,T341CC96RUS,2,46200.00,Volkswagen,Passat


## Concatenate the sample with the initial dataframe to a new dataframe concat_rows

In [26]:
concat_rows = pd.concat([df, sample], ignore_index=True)

In [27]:
concat_rows

Unnamed: 0,CarNumber,Refund,Fines,Make,Model
0,Y163O8161RUS,2,3200.00,Ford,Focus
1,E432XX77RUS,1,6500.00,Toyota,Camry
2,7184TT36RUS,1,2100.00,Ford,Focus
3,X582HE161RUS,2,2000.00,Ford,Focus
4,92918M178RUS,1,5700.00,Ford,Focus
...,...,...,...,...,...
920,8182XX154RUS,1,500.00,Ford,Focus
921,X796TH96RUS,2,500.00,Ford,Focus
922,T011MY163RUS,1,103600.00,Ford,Focus
923,T341CC96RUS,2,46200.00,Volkswagen,Passat


## Enrich the dataframe concat_rows by a new column with the data generated
- create a series with the name Year using random integers from 1980 to 2019
- use np.random.seed(21) before generating the years

In [37]:
np.random.seed(21)
year = pd.Series(np.random.randint(1980, 2019, size=len(concat_rows)), name='Year')
year

0      1989
1      1995
2      1984
3      2015
4      2014
       ... 
920    1996
921    2002
922    1996
923    2012
924    1984
Name: Year, Length: 925, dtype: int64

## Concatenate the series with the dataframe and name it fines

In [40]:
fines = pd.concat([concat_rows, year], axis='columns')
fines

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
0,Y163O8161RUS,2,3200.00,Ford,Focus,1989
1,E432XX77RUS,1,6500.00,Toyota,Camry,1995
2,7184TT36RUS,1,2100.00,Ford,Focus,1984
3,X582HE161RUS,2,2000.00,Ford,Focus,2015
4,92918M178RUS,1,5700.00,Ford,Focus,2014
...,...,...,...,...,...,...
920,8182XX154RUS,1,500.00,Ford,Focus,1996
921,X796TH96RUS,2,500.00,Ford,Focus,2002
922,T011MY163RUS,1,103600.00,Ford,Focus,1996
923,T341CC96RUS,2,46200.00,Volkswagen,Passat,2012
