## Data ETL

In [1]:
import pandas as pd

In [2]:
# Read the raw retail pricing dataset into a DataFrame.
retail_df = pd.read_csv(filepath_or_buffer="Resources/retail_price.csv")

# Preview the first five rows as a quick check of the load.
retail_df.head(n=5)

Unnamed: 0,product_id,product_category_name,month_year,qty,total_price,freight_price,unit_price,product_name_lenght,product_description_lenght,product_photos_qty,...,comp_1,ps1,fp1,comp_2,ps2,fp2,comp_3,ps3,fp3,lag_price
0,bed1,bed_bath_table,01-05-2017,1,45.95,15.1,45.95,39,161,2,...,89.9,3.9,15.011897,215.0,4.4,8.76,45.95,4.0,15.1,45.9
1,bed1,bed_bath_table,01-06-2017,3,137.85,12.933333,45.95,39,161,2,...,89.9,3.9,14.769216,209.0,4.4,21.322,45.95,4.0,12.933333,45.95
2,bed1,bed_bath_table,01-07-2017,6,275.7,14.84,45.95,39,161,2,...,89.9,3.9,13.993833,205.0,4.4,22.195932,45.95,4.0,14.84,45.95
3,bed1,bed_bath_table,01-08-2017,4,183.8,14.2875,45.95,39,161,2,...,89.9,3.9,14.656757,199.509804,4.4,19.412885,45.95,4.0,14.2875,45.95
4,bed1,bed_bath_table,01-09-2017,2,91.9,15.1,45.95,39,161,2,...,89.9,3.9,18.776522,163.39871,4.4,24.324687,45.95,4.0,15.1,45.95


In [3]:
# Print a summary of the frame: index range, column names, non-null counts and dtypes.
retail_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 676 entries, 0 to 675
Data columns (total 30 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   product_id                  676 non-null    object 
 1   product_category_name       676 non-null    object 
 2   month_year                  676 non-null    object 
 3   qty                         676 non-null    int64  
 4   total_price                 676 non-null    float64
 5   freight_price               676 non-null    float64
 6   unit_price                  676 non-null    float64
 7   product_name_lenght         676 non-null    int64  
 8   product_description_lenght  676 non-null    int64  
 9   product_photos_qty          676 non-null    int64  
 10  product_weight_g            676 non-null    int64  
 11  product_score               676 non-null    float64
 12  customers                   676 non-null    int64  
 13  weekday                     676 non

In [4]:
# Count missing values per column (isna is the pandas alias of isnull).
retail_df.isna().sum()

product_id                    0
product_category_name         0
month_year                    0
qty                           0
total_price                   0
freight_price                 0
unit_price                    0
product_name_lenght           0
product_description_lenght    0
product_photos_qty            0
product_weight_g              0
product_score                 0
customers                     0
weekday                       0
weekend                       0
holiday                       0
month                         0
year                          0
s                             0
volume                        0
comp_1                        0
ps1                           0
fp1                           0
comp_2                        0
ps2                           0
fp2                           0
comp_3                        0
ps3                           0
fp3                           0
lag_price                     0
dtype: int64

In [5]:
# Remove unnecessary columns: product listing metadata and calendar flags.
# The "lenght" spelling matches the column names in the source CSV and must
# not be corrected here.
# errors="ignore" makes this cell safe to re-run: because the result is
# assigned back to retail_df, a second run would otherwise raise KeyError
# for columns that have already been removed.
retail_df = retail_df.drop(
    columns=[
        "product_name_lenght",
        "product_description_lenght",
        "product_photos_qty",
        "product_weight_g",
        "weekday",
        "weekend",
        "holiday",
    ],
    errors="ignore",
)

# Show the remaining column labels.
retail_df.columns

Index(['product_id', 'product_category_name', 'month_year', 'qty',
       'total_price', 'freight_price', 'unit_price', 'product_score',
       'customers', 'month', 'year', 's', 'volume', 'comp_1', 'ps1', 'fp1',
       'comp_2', 'ps2', 'fp2', 'comp_3', 'ps3', 'fp3', 'lag_price'],
      dtype='object')

In [6]:
# Give the terse source column names descriptive labels.
# Mapping is old_name -> new_name; columns not listed keep their names.
retail_df = retail_df.rename(
    columns={
        # Own-product fields
        "qty": "qty_sold",
        "product_score": "product_rating",
        "customers": "no_customers",
        "s": "seasonality",
        # Competitor 1
        "comp_1": "comp1_price",
        "ps1": "comp1_prod_rating",
        "fp1": "comp1_freight_price",
        # Competitor 2
        "comp_2": "comp2_price",
        "ps2": "comp2_prod_rating",
        "fp2": "comp2_freight_price",
        # Competitor 3
        "comp_3": "comp3_price",
        "ps3": "comp3_prod_rating",
        "fp3": "comp3_freight_price",
    }
)

retail_df.head()

Unnamed: 0,product_id,product_category_name,month_year,qty_sold,total_price,freight_price,unit_price,product_rating,no_customers,month,...,comp1_price,comp1_prod_rating,comp1_freight_price,comp2_price,comp2_prod_rating,comp2_freight_price,comp3_price,comp3_prod_rating,comp3_freight_price,lag_price
0,bed1,bed_bath_table,01-05-2017,1,45.95,15.1,45.95,4.0,57,5,...,89.9,3.9,15.011897,215.0,4.4,8.76,45.95,4.0,15.1,45.9
1,bed1,bed_bath_table,01-06-2017,3,137.85,12.933333,45.95,4.0,61,6,...,89.9,3.9,14.769216,209.0,4.4,21.322,45.95,4.0,12.933333,45.95
2,bed1,bed_bath_table,01-07-2017,6,275.7,14.84,45.95,4.0,123,7,...,89.9,3.9,13.993833,205.0,4.4,22.195932,45.95,4.0,14.84,45.95
3,bed1,bed_bath_table,01-08-2017,4,183.8,14.2875,45.95,4.0,90,8,...,89.9,3.9,14.656757,199.509804,4.4,19.412885,45.95,4.0,14.2875,45.95
4,bed1,bed_bath_table,01-09-2017,2,91.9,15.1,45.95,4.0,54,9,...,89.9,3.9,18.776522,163.39871,4.4,24.324687,45.95,4.0,15.1,45.95


### Column name explanations
- "seasonality": fluctuations in sales revenue that are caused by external factors and occur on a predictable schedule around the same time(s) every year
- "comp1_price": product price of competitor 1
- "comp1_prod_rating": product rating of competitor 1
- "comp1_freight_price": product freight price of competitor 1
- "lag_price": previous-month price of the product

The "comp2_*" and "comp3_*" columns follow the same pattern for competitors 2 and 3.

In [7]:
# Display the current column labels to confirm the renamed schema.
retail_df.columns

Index(['product_id', 'product_category_name', 'month_year', 'qty_sold',
       'total_price', 'freight_price', 'unit_price', 'product_rating',
       'no_customers', 'month', 'year', 'seasonality', 'volume', 'comp1_price',
       'comp1_prod_rating', 'comp1_freight_price', 'comp2_price',
       'comp2_prod_rating', 'comp2_freight_price', 'comp3_price',
       'comp3_prod_rating', 'comp3_freight_price', 'lag_price'],
      dtype='object')

In [8]:
# Display the frame's dimensions as (rows, columns).
retail_df.shape

(676, 23)

In [9]:
# Write the cleaned frame to disk; index=False leaves the row index out of the file.
retail_df.to_csv(
    path_or_buf="Resources/retail_price_cleaned.csv",
    index=False,
)