In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from mlxtend.frequent_patterns import apriori,association_rules
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the retail transactions from the CSV file (decoded as latin1)
# and preview the first rows.
csv_path = '/content/OnlineRetail.csv'
data = pd.read_csv(csv_path, encoding='latin1')
data.head()

In [None]:
# Show the column labels of the loaded frame.
data.columns

In [None]:
# Distinct values present in the 'Country' column.
data['Country'].unique()

In [None]:
# Clean the frame with a non-mutating chain: nothing is modified in place,
# so the cell can be re-run safely and the result does not alias its input.
data = (
    data
    # Stripping extra spaces in the description
    .assign(Description=lambda df: df['Description'].str.strip())
    # Dropping the rows without any invoice number
    .dropna(axis=0, subset=['InvoiceNo'])
    # Invoice numbers are handled as strings from here on
    .assign(InvoiceNo=lambda df: df['InvoiceNo'].astype('str'))
)

# Dropping every row whose invoice number contains 'C' (described by the
# notebook as transactions done on credit). The explicit copy makes `data`
# an independent frame, so adding columns to it later is not a chained
# assignment on a filtered slice.
data = data[~data['InvoiceNo'].str.contains('C')].copy()

In [None]:
def build_basket(transactions, country):
    """Pivot one country's rows into an invoice-by-product quantity table.

    Parameters
    ----------
    transactions : pandas.DataFrame
        Frame with at least the 'Country', 'InvoiceNo', 'Description'
        and 'Quantity' columns.
    country : str
        Value of the 'Country' column whose rows are kept.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'InvoiceNo', with one column per 'Description'. Each cell
        holds the summed 'Quantity' for that invoice/product pair, and 0
        where the pair does not occur.
    """
    return (transactions[transactions['Country'] == country]
            .groupby(['InvoiceNo', 'Description'])['Quantity']
            .sum().unstack().reset_index().fillna(0)
            .set_index('InvoiceNo'))


# Transactions done in France
basket_France = build_basket(data, "France")

# Transactions done in the United Kingdom
basket_UK = build_basket(data, "United Kingdom")

# Transactions done in Portugal
basket_Por = build_basket(data, "Portugal")

# Transactions done in Sweden
basket_Sweden = build_basket(data, "Sweden")

In [None]:
# Preview the first rows of the France basket.
# `head` must be called: the bare attribute only displays the bound method.
basket_France.head()

In [None]:
# Encoding rule applied to every basket cell
def one_hot_encode(x):
    """Return 1 when ``x`` is greater than or equal to 1, otherwise 0."""
    return 1 if x >= 1 else 0

In [None]:
# Binarise each basket: True when the summed quantity of a product in an
# invoice is at least 1, False otherwise. The comparison is vectorised, so it
# avoids one Python call per cell (and the deprecated `applymap`), and it
# produces the boolean frame that mlxtend's `apriori` accepts as input.
# Re-running the cell leaves the frames unchanged.
basket_France = basket_France >= 1
basket_UK = basket_UK >= 1
basket_Por = basket_Por >= 1
basket_Sweden = basket_Sweden >= 1

In [None]:
# Frequent itemsets of the France basket, keeping a minimum support of 0.05
frq_items = apriori(basket_France, min_support=0.05, use_colnames=True)

# Build the association rules using lift (threshold 1) as the metric,
# then order them by confidence and lift, both descending
rules = (
    association_rules(frq_items, metric="lift", min_threshold=1)
    .sort_values(['confidence', 'lift'], ascending=[False, False])
)
print(rules.head())

In [None]:
# Inferred rules for Portugal (mined from basket_Por).
# NOTE: `frq_items` and `rules` are reassigned here, replacing the values
# computed from the France basket.
frq_items = apriori(basket_Por, min_support = 0.05, use_colnames = True)
rules = association_rules(frq_items, metric ="lift", min_threshold = 1)
# Rank the rules by confidence, then lift, both descending
rules = rules.sort_values(['confidence', 'lift'], ascending =[False, False])
print(rules.head())

In [None]:
# Calculate the total sales of every row
data['TotalSales'] = data['Quantity'] * data['UnitPrice']

# Total sales per country without the UK, in ascending order.
# The United Kingdom is removed by label rather than by position, so the
# right country is excluded even if it is not the largest seller
# (errors='ignore' keeps the cell working when the label is absent).
top_countries = (
    data.groupby('Country')['TotalSales'].sum()
    .drop('United Kingdom', errors='ignore')
    .sort_values(ascending=True)
)
plt.figure(figsize=(14, 6))
sns.barplot(x=top_countries.index, y=top_countries.values)
plt.title('Top Countries by Total Sales Excluding UK')
plt.xlabel('Country')
plt.ylabel('Total Sales')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Ten countries with the highest total sales, UK included
sales_by_country = data.groupby('Country')['TotalSales'].sum()
top_countries = sales_by_country.sort_values(ascending=False).head(10)

# Bar chart of those totals, one bar per country
plt.figure(figsize=(14, 6))
sns.barplot(x=top_countries.index, y=top_countries.values)
plt.xticks(rotation=45)
plt.xlabel('Country')
plt.ylabel('Total Sales')
plt.title('Top Countries by Total Sales Including UK')
plt.show()

In [None]:
# Make sure 'InvoiceDate' holds datetimes
data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'])

# Tag each row with the monthly period of its invoice date
data['MonthYear'] = data['InvoiceDate'].dt.to_period('M')

# Sum the sales of every month
monthly_revenue = data.groupby('MonthYear')['TotalSales'].sum()

# Line chart of revenue per month; the periods are turned into strings
# to serve as x-axis labels
month_labels = monthly_revenue.index.astype(str)
plt.figure(figsize=(7.5,7.5))
sns.lineplot(x=month_labels, y=monthly_revenue.values)
plt.xticks(rotation=45)
plt.xlabel('Month-Year')
plt.ylabel('Revenue')
plt.title('Monthly Revenue Trend')
plt.show()

In [None]:
# Ten most frequent values of 'Description' (the notebook treats the
# description as the product category)
description_counts = data['Description'].value_counts()
top_categories = description_counts.head(10)

# Bar chart of how often each of them occurs
plt.figure(figsize=(7.5,7.5))
sns.barplot(x=top_categories.index, y=top_categories.values)
plt.xticks(rotation=45)
plt.xlabel('Product Category')
plt.ylabel('Count')
plt.title('Top Product Categories')
plt.show()

In [None]:
# Ten products with the largest total quantity sold
top_products = data.groupby('Description')['Quantity'].sum().nlargest(10)

# Horizontal bar chart, labelled like the other figures of the notebook;
# plt.show() keeps the Axes repr out of the cell output.
plt.figure(figsize=(7.5, 7.5))
sns.barplot(x=top_products.values, y=top_products.index)
plt.title('Top Selling Products')
plt.xlabel('Quantity Sold')
plt.ylabel('Product')
plt.show()

In [None]:
# Visualize the number of transactions for each hour of the day.
# 'InvoiceDate' must already be a datetime column for the `.dt` accessor.
data['Hour'] = data['InvoiceDate'].dt.hour

# A transaction is one invoice, and an invoice spans several rows (one per
# product), so distinct invoice numbers are counted instead of rows.
transactions_by_hour = data.groupby('Hour')['InvoiceNo'].nunique()
plt.plot(transactions_by_hour.index, transactions_by_hour.values)

plt.title('Number of Transactions by Hour')
plt.xlabel('Hour of the Day')
plt.ylabel('Number of Transactions')
plt.show()