# Data Preprocessing and Exploration

This notebook initiates the model selection process by exploring the data and preprocessing it for model training and further processing. The data used is two years of sales data from an online retailer in the UK, covering December 2009 to December 2011.

In [1]:
#Import the necessary libraries
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose

In [5]:
# Load both sheets of the workbook (one sheet per year) with a single read.
# Passing a list to `sheet_name` makes read_excel return a
# {sheet name: DataFrame} dict, so the .xlsx file is opened once instead of
# once per sheet.
sheets = pd.read_excel(
    "online_retail_II.xlsx",
    sheet_name=["Year 2009-2010", "Year 2010-2011"],
)
df1 = sheets["Year 2009-2010"]
df2 = sheets["Year 2010-2011"]
df1.head()

In [6]:
# Stack the two yearly frames row-wise into one frame with a fresh 0..n-1 index
merged_df = pd.concat(objs=[df1, df2], axis=0, ignore_index=True)
merged_df.head()

In [7]:
# Report the time span covered by the combined frame: latest invoice first, then earliest
invoice_dates = merged_df["InvoiceDate"]
print("Merged max :", invoice_dates.max())
print("Merged min :", invoice_dates.min())

In [10]:
# Remove cancelled transactions by keeping only rows whose Quantity is positive
has_positive_quantity = merged_df["Quantity"].gt(0)
merged_df = merged_df.loc[has_positive_quantity]

In [11]:
# Write the filtered transactions to disk as CSV
merged_df.to_csv(path_or_buf="uk_dataset_full.csv")

In [12]:
# Make sure InvoiceDate is a datetime, then derive a calendar-day "date" column
# so that sales can be aggregated per day.
# assign() builds a new frame instead of setting columns on the frame produced
# by the earlier boolean filter, which avoids pandas' SettingWithCopyWarning.
# The keyword arguments are evaluated in order, so the "date" lambda already
# sees the converted InvoiceDate column.
merged_df = merged_df.assign(
    InvoiceDate=lambda frame: pd.to_datetime(frame["InvoiceDate"]),
    date=lambda frame: frame["InvoiceDate"].dt.date,
)

In [13]:
# Total quantity sold per calendar day, flattened back into a two-column frame
daily_totals = merged_df.groupby('date')['Quantity'].sum()
merged_daily = daily_totals.reset_index()
merged_daily.head()

In [14]:
# Write the daily sales volume to disk as CSV
merged_daily.to_csv(path_or_buf="uk_daily_sales_full.csv")

### Exploration and Cleaning

In [15]:
# Line plot of total quantity per day to eyeball the overall shape of the series
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(merged_daily["date"], merged_daily["Quantity"]);

In [16]:
# Overview of the daily frame: column names, dtypes and non-null counts
merged_daily.info()

In [18]:
# Parse the "date" column into pandas datetimes
merged_daily["date"] = merged_daily["date"].pipe(pd.to_datetime)

In [17]:
# Descriptive statistics (count, mean, spread, min/max, quartiles) of the daily frame
merged_daily.describe()

#possibility of missing dates and outliers

In [20]:
# Switch the quantity column to a lowercase name
merged_daily = merged_daily.rename(columns={"Quantity": "quantity"})

In [21]:
# Line plot of the daily quantity to visualise outliers (spikes).
# The series is drawn against the "date" column rather than the integer row
# index, so the x-axis of the "Time Series Plot" actually shows time.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(merged_daily['date'], merged_daily['quantity'])
ax.set_title('Time Series Plot')
ax.set_xlabel('Date')
ax.set_ylabel('Quantity')
plt.show()

# Horizontal box plot of the same values; points drawn beyond the whiskers
# are the outlier candidates.
fig, ax = plt.subplots(figsize=(12, 6))
ax.boxplot(merged_daily['quantity'], vert=False)
ax.set_title('Box Plot of Quantity')
ax.set_xlabel('Quantity')
plt.show()

In [22]:
# Continue on an independent deep copy so merged_daily itself is left untouched
df = merged_daily.copy(deep=True)


In [23]:
# Winsorization threshold: any daily quantity above 60000 is replaced by 60000
threshold = 60000

# Winsorize the 'quantity' column.
# clip(upper=...) caps values above the threshold and leaves missing values as
# NaN, whereas where(quantity <= threshold, threshold) would turn a NaN into
# the threshold, because NaN <= threshold evaluates to False.
df['quantity_winsorized'] = df['quantity'].clip(upper=threshold)
df.head(5)

In [28]:
# Histogram of the raw (un-capped) daily quantities, with the x-axis cut at 50,000
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(df["quantity"], ax=ax)
ax.set_xlim(0, 50000)
ax.set_title('Distribution of Quantities (Up to 50,000)')
ax.set_xlabel('Quantity')
ax.set_ylabel('Frequency')
plt.show();


In [30]:
# Two years is roughly 730 days, so a smaller row count means some dates are
# missing; check the shape before finding and imputing them
df.shape

In [31]:
def find_date_range(df, date_col="date", freq="D"):
    """Return ``df`` expanded to one row per period between its first and last date.

    The frame is re-indexed on a complete ``pd.date_range`` spanning the minimum
    and maximum of ``date_col``. Dates that were absent from ``df`` appear as
    new rows whose other columns are NaN. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a date column with unique values (``reindex`` raises on
        duplicate labels).
    date_col : str, default "date"
        Name of the date column.
    freq : str, default "D"
        Frequency of the generated range; "D" matches ``pd.date_range``'s
        default of one entry per calendar day.

    Returns
    -------
    pandas.DataFrame
        Frame with ``date_col`` as the first column followed by the remaining
        columns of ``df``, on a fresh 0..n-1 index.
    """
    # Coerce to datetimes first: if the column holds strings or datetime.date
    # objects, its labels would not match the DatetimeIndex built below and
    # every row would come back as NaN.
    dates = pd.to_datetime(df[date_col])
    normalized = df.copy()
    normalized[date_col] = dates

    if dates.empty:
        # min()/max() of an empty column are NaT, which pd.date_range rejects;
        # there is nothing to fill in, so return the (empty) frame as-is.
        return normalized.reset_index(drop=True)

    full_range = pd.date_range(start=dates.min(), end=dates.max(), freq=freq)
    return (
        normalized.set_index(date_col)
        .reindex(full_range)
        .rename_axis(date_col)
        .reset_index()
    )

In [32]:
# Expand the daily frame so every date in its span has a row, then check the new shape
daily_sales_volume = find_date_range(df=df)
daily_sales_volume.shape

In [33]:
# Preview the first five rows of the re-indexed frame
daily_sales_volume.head(n=5)

In [35]:
# Count the missing values in each column after re-indexing
daily_sales_volume.isnull().sum()

##There are now 135 dates with null values

In [38]:
# Backfill the null dates to preserve the direction of the trend: each missing
# row takes the values of the next row that has data.
# DataFrame.bfill() replaces fillna(method='bfill'), whose `method` keyword is
# deprecated in recent pandas releases.
daily_sales_volume = daily_sales_volume.bfill()
# Backfilling cannot fill gaps at the very end of the frame, so confirm that
# no nulls remain.
daily_sales_volume.isna().sum()

In [40]:
# Split the winsorized daily series into its decomposition components using a
# multiplicative model with a 365-observation period, then plot the result
winsorized_series = daily_sales_volume['quantity_winsorized']
decomposition = seasonal_decompose(winsorized_series, model='multiplicative', period=365)
fig = decomposition.plot()
plt.show();

#downward trend

In [42]:
# Keep only the date and the winsorized quantity, then show the first 15 rows
daily_sales_volume = daily_sales_volume.loc[:, ["date", "quantity_winsorized"]]
daily_sales_volume.head(15)

In [44]:
# Expose the winsorized series under the plain "quantity" name, write the
# processed frame to disk, and preview it
daily_sales_volume = daily_sales_volume.rename(columns={"quantity_winsorized": "quantity"})
daily_sales_volume.to_csv("processed.csv")
daily_sales_volume.head()