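## Data import and cleaning

Loads the raw e-commerce CSV, parses the purchase dates, renames the columns to snake_case and saves the result as a pickle file.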

In [35]:
import os
import logging
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Config

#### Working directory

In [48]:
# Step up to the parent directory exactly once per kernel session: the
# `chdir_done` flag stays in the notebook's globals, so re-running this cell
# does not keep climbing the directory tree.
try:
    chdir_done
except NameError:
    os.chdir("..")
    chdir_done = True

# Remember the resulting directory so it can be reported in the log.
work_directory = os.getcwd()

#### Logging

In [49]:
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("./notebook.log"), 
        logging.StreamHandler(sys.stdout)
    ]
)

In [50]:

# Mark the start of a run in the log and record the directory the notebook runs from.
logging.info('New start notebook')
logging.info(f'Work directory {work_directory}')

2025-01-03 20:44:15,700 - INFO - New start notebook
2025-01-03 20:44:15,702 - INFO - Work directory /home/aleksey/dev/data_analysis/01. Ecommerce


### Settings

### Constants

In [51]:
# Raw dataset CSV; the path is relative, so it resolves against the current working directory.
RAW_FILE = 'data/ecommerce_dataset_updated.csv'

## Import data

In [52]:
# Load the raw CSV with pandas defaults; no dtypes or date parsing are requested here.
df_ecommerce = pd.read_csv(RAW_FILE)

In [53]:
# Quick schema check: column names, non-null counts and inferred dtypes.
df_ecommerce.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3660 entries, 0 to 3659
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   User_ID           3660 non-null   object 
 1   Product_ID        3660 non-null   object 
 2   Category          3660 non-null   object 
 3   Price (Rs.)       3660 non-null   float64
 4   Discount (%)      3660 non-null   int64  
 5   Final_Price(Rs.)  3660 non-null   float64
 6   Payment_Method    3660 non-null   object 
 7   Purchase_Date     3660 non-null   object 
dtypes: float64(2), int64(1), object(5)
memory usage: 228.9+ KB


In [54]:
# Preview the first rows to see the raw value formats.
df_ecommerce.head()

Unnamed: 0,User_ID,Product_ID,Category,Price (Rs.),Discount (%),Final_Price(Rs.),Payment_Method,Purchase_Date
0,337c166f,f414122f-e,Sports,36.53,15,31.05,Net Banking,12-11-2024
1,d38a19bf,fde50f9c-5,Clothing,232.79,20,186.23,Net Banking,09-02-2024
2,d7f5f0b0,0d96fc90-3,Sports,317.02,25,237.76,Credit Card,01-09-2024
3,395d4994,964fc44b-d,Toys,173.19,25,129.89,UPI,01-04-2024
4,a83c145c,d70e2fc6-e,Beauty,244.8,20,195.84,Net Banking,27-09-2024


In [55]:
# Purchase dates are stored day-first (e.g. 27-09-2024), so the format is
# spelled out explicitly rather than left to inference.
df_ecommerce = df_ecommerce.assign(
    Purchase_Date=lambda frame: pd.to_datetime(
        frame["Purchase_Date"], format="%d-%m-%Y"
    )
)

In [56]:
# List the current column labels before renaming them.
df_ecommerce.columns

Index(['User_ID', 'Product_ID', 'Category', 'Price (Rs.)', 'Discount (%)',
       'Final_Price(Rs.)', 'Payment_Method', 'Purchase_Date'],
      dtype='object')

In [57]:
# Map each original header to a snake_case name, dropping the unit suffixes
# "(Rs.)" and "(%)".
snake_case_names = {
    'User_ID': 'user_id',
    'Product_ID': 'product_id',
    'Category': 'category',
    'Price (Rs.)': 'price',
    'Discount (%)': 'discount',
    'Final_Price(Rs.)': 'final_price',
    'Payment_Method': 'payment_method',
    'Purchase_Date': 'purchase_date',
}
df_ecommerce = df_ecommerce.rename(snake_case_names, axis='columns')

# Record the resulting header set in the log.
logging.info(f'New columns: {df_ecommerce.columns}')

2025-01-03 20:44:18,325 - INFO - New columns: Index(['user_id', 'product_id', 'category', 'price', 'discount', 'final_price',
       'payment_method', 'purchase_date'],
      dtype='object')


In [58]:
# Persist the frame (dates parsed, columns renamed) as a pickle file; the path
# is relative, so it resolves against the current working directory.
df_ecommerce.to_pickle('data/clear_data.pkl')

In [59]:
# Log a closing marker for this notebook run.
logging.info('Finish data cleaning')

2025-01-03 20:44:18,572 - INFO - Finish data cleaning
