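# For this project, we use e-commerce behavior data from a multi-category store, along with the pandas and NumPy libraries. Note: this description originally referred to November 2019, but the monthly files listed below cover October 2019 and December 2019 through April 2020.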

In [1]:
import pandas as pd
import numpy as np
import os
# Directory that holds the monthly CSV exports (the current working directory).
data_folder = './'

# Monthly event files to process, in the order they will be read.
file_names = [
    '2019-Oct.csv',
    '2020-Apr.csv',
    '2020-Feb.csv',
    '2020-Jan.csv',
    '2020-Mar.csv',
    '2019-Dec.csv',
]

# Accumulates one aggregated DataFrame per successfully processed file.
all_original_dfs = []


# Here, we make a copy of the data and clean it. We then standardize the "event_type", "category_code", "brand", and "user_session" fields by converting them to lowercase and removing any leading or trailing spaces (ETL).

In [2]:
# Load each monthly CSV, derive calendar features from event_time, normalize the
# text columns, and reduce the file to event counts per
# (Category, event_type, MonthName, DayOfMonth, DayOfWeek, HourOfDay).
# A file that is missing or fails during processing is reported and skipped so
# that the remaining files are still aggregated.
for file_name in file_names:
    file_path = os.path.join(data_folder, file_name)
    try:
        df = pd.read_csv(file_path)
        # Work on a copy so the raw frame in `df` is left untouched.
        data_copy = df.copy()

        # Unparseable timestamps become NaT instead of aborting the whole file.
        data_copy['event_time'] = pd.to_datetime(data_copy['event_time'], errors='coerce')
        data_copy['DayOfWeek'] = data_copy['event_time'].dt.day_name()
        data_copy['DayOfMonth'] = data_copy['event_time'].dt.day
        data_copy['MonthName'] = data_copy['event_time'].dt.month_name()
        data_copy['HourOfDay'] = data_copy['event_time'].dt.hour

        # Lowercase and trim the text columns so equal values group together.
        for col in ["event_type", "category_code", "brand", "user_session"]:
            data_copy[col] = data_copy[col].str.lower().str.strip()

        # category_code is dot-delimited; n=3 caps the split at four parts.
        # expand=True only yields as many columns as the deepest code in this
        # file, so pad to exactly four columns. Without the padding, a file
        # whose codes never reach four levels raises a length-mismatch error
        # on assignment and the whole file would be skipped by the handler below.
        category_parts = (
            data_copy["category_code"]
            .str.split(".", n=3, expand=True)
            .reindex(columns=range(4))
        )
        data_copy[['Category', 'Product', 'class', 'grade']] = category_parts

        # NOTE(review): groupby drops rows whose key is NaN by default, so events
        # without a category_code or with an unparseable event_time are not
        # counted - confirm this is intended.
        per_hour_and_week = (
            data_copy
            .groupby(['Category', 'event_type', 'MonthName', 'DayOfMonth', 'DayOfWeek', 'HourOfDay'])
            .size()
            .reset_index(name='event_count')
        )
        all_original_dfs.append(per_hour_and_week)
    except FileNotFoundError:
        print(f"  ERROR: Archivo no encontrado en {file_path}. Saltando este archivo.")
    except Exception as e:
        # Best-effort processing: report the failure and continue with the next file.
        print(f"  ERROR al cargar {file_path}: {e}")

# We split the category_code into four columns because its values are delimited by dots. We do this to standardize the categories for further analysis.

In [3]:
# Stack the per-file summaries into a single table and write it out as CSV.
output_csv_path = 'per_hour_and_week.csv'

# The loading loop skips files that fail, so the list can be empty. pd.concat
# would then fail with an opaque "No objects to concatenate"; raise the same
# exception type with a message that says what actually went wrong.
if not all_original_dfs:
    raise ValueError(
        f"No input files were loaded successfully; nothing to write to {output_csv_path}."
    )

final_per_hour_and_week = pd.concat(all_original_dfs, ignore_index=True)
final_per_hour_and_week.to_csv(output_csv_path, index=False)
