# Format the Dataset

We would like to add a column called 'Revenue' that measures net revenue, which is useful for assessing the company's performance.  
Revenue = `Quantity × UnitPrice`

In [1]:
import pandas as pd

In [2]:
# Load the raw retail export. `InvoiceDate` is intentionally left as text here;
# it is converted explicitly in the "Convert `InvoiceDate` to Date format" step.
# The former `parse_dates=True` only asks pandas to parse the *index*, and with
# no `index_col` that parsed nothing, so the misleading argument was removed.
# NOTE(review): the 'Unnamed: 0' column looks like a row index written by an
# earlier export - consider `index_col=0` if nothing downstream relies on it.
data = pd.read_csv("../data/raw/online_retail.csv")
data.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 8:26,3.39,17850.0,United Kingdom


In [3]:
# Build `filtered_df` as a NEW frame instead of aliasing `data`: a plain
# `filtered_df = data` shares the same object, so every later column edit would
# also rewrite `data` and make the earlier `data.head()` output stale.
# `assign` returns a copy with the extra column appended, so re-running this
# cell always yields the same result.
filtered_df = data.assign(Revenue=data['Quantity'] * data['UnitPrice'])
filtered_df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Revenue
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 8:26,2.55,17850.0,United Kingdom,15.3
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 8:26,3.39,17850.0,United Kingdom,20.34
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 8:26,2.75,17850.0,United Kingdom,22.0
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 8:26,3.39,17850.0,United Kingdom,20.34
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 8:26,3.39,17850.0,United Kingdom,20.34


## Replace Country name 'EIRE' with 'Ireland' for readability

In [4]:
# Swap the 'EIRE' label for 'Ireland' via an explicit old -> new mapping,
# then show the five most frequent countries as a quick sanity check.
filtered_df['Country'] = filtered_df['Country'].replace({'EIRE': 'Ireland'})
filtered_df['Country'].value_counts().iloc[:5]

Country
United Kingdom    495478
Germany             9495
France              8557
Ireland             8196
Spain               2533
Name: count, dtype: int64

## Convert `InvoiceDate` to Date format

In [5]:
# Parse the invoice timestamps first, then keep only the calendar-date part
# (`.dt.date` drops the time of day).
invoice_timestamps = pd.to_datetime(filtered_df['InvoiceDate'])
filtered_df['InvoiceDate'] = invoice_timestamps.dt.date

In [6]:
# Derive a 'MonthYear' label (abbreviated month name + four-digit year) from
# the invoice date; the column is re-parsed so the `.dt` accessor is available.
filtered_df['MonthYear'] = (
    filtered_df['InvoiceDate']
    .pipe(pd.to_datetime)
    .dt.strftime('%b-%Y')
)

In [7]:
# NOTE(review): disabled experiment, never executed. If re-enabled it would add
# a 'ProductName' column built from the first three words of 'Description',
# title-cased, with "..." appended. Delete this cell or re-enable it on purpose.
# filtered_df['ProductName'] = filtered_df['Description'].str.split().str[:3].str.join(sep=" ").str.title() + "..."
# filtered_df.head()

## Save the preprocessed data to CSV and Parquet files

In [8]:
# Write the processed frame to CSV; `index=False` keeps the row index out of the file.
filtered_df.to_csv(
    path_or_buf='../data/processed/processed_data.csv',
    index=False,
)

In [9]:
# Write the same processed frame to Parquet alongside the CSV copy.
filtered_df.to_parquet(
    path='../data/processed/processed_data.parquet',
)