In [1]:
# Import the necessary library
# The requests module lets you send HTTP requests from Python.
# Each request returns a Response object carrying the response data (content, encoding, status, etc.).

import io
import zipfile

import requests

In [2]:
# Direct link to the Online Retail II workbook on the UCI Machine Learning Repository.
# The dataset's landing page (https://archive.ics.uci.edu/dataset/502/online+retail+ii)
# returns an HTML page rather than the Excel file, so it cannot serve as the download URL.
# NOTE(review): confirm this link still resolves. If it does not, copy the address behind
# the landing page's "Download" button instead; that address may serve a .zip archive,
# which has to be unpacked before pandas can read the workbook.

dataset_url_2010 = "https://archive.ics.uci.edu/ml/machine-learning-databases/00502/online_retail_II.xlsx"

In [5]:
# Send a GET request to download the file.
# requests has no default timeout, so a server that stops responding would hang the
# notebook forever. The timeout bounds the connection attempt and each wait for data
# (not the total download time); 60 seconds is an arbitrary but generous limit.
response = requests.get(dataset_url_2010, timeout=60)
response

<Response [200]>

In [7]:
# A 200 status only says the server answered; the body can still be an HTML page
# instead of the workbook. An .xlsx file is a zip container, so inspect the payload
# before saving it under an .xlsx name.
if response.status_code != 200:
    print(f"Failed to download file. Status code: {response.status_code}")
elif not zipfile.is_zipfile(io.BytesIO(response.content)):
    content_type = response.headers.get("Content-Type", "unknown")
    print(f"Failed to download file. Response is not an Excel workbook (Content-Type: {content_type})")
else:
    with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
        member_names = archive.namelist()
        if "[Content_Types].xml" in member_names:
            # Every Office Open XML package carries this part, so the payload is
            # the workbook itself: keep the bytes unchanged.
            workbook_bytes = response.content
        else:
            # Otherwise the payload is a plain zip bundle: take the first .xlsx inside it.
            xlsx_members = [name for name in member_names if name.lower().endswith(".xlsx")]
            workbook_bytes = archive.read(xlsx_members[0]) if xlsx_members else None

    if workbook_bytes is None:
        print("Failed to download file. Archive does not contain an .xlsx workbook")
    else:
        # Open online_retail_II.xlsx for writing in binary mode.
        with open('online_retail_II.xlsx', 'wb') as file:
            file.write(workbook_bytes)
        print("File downloaded successfully")

File downloaded successfully


# Initial Data Loading and Inspection

In [1]:
import pandas as pd

In [2]:
# Show the current working directory: the relative path 'online_retail_II.xlsx'
# passed to pd.read_excel is resolved against it.
import os
os.getcwd()

'C:\\Users\\ssaja\\OneDrive\\Desktop\\Courses & Project\\Projects\\E-Commerce Customer Segmentation project'

In [3]:
# pandas reads .xlsx files through openpyxl; install it once if it is missing:
#   %pip install openpyxl

# Read the workbook into a DataFrame. No sheet is named in the original call, so
# pandas falls back to its default, the first sheet (sheet_name=0), spelled out here.
df = pd.read_excel(
    "online_retail_II.xlsx",
    sheet_name=0,
)


In [4]:
# Preview the loaded frame; for a frame this large the notebook renders only the first and last rows.
df

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.10,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom
...,...,...,...,...,...,...,...,...
525456,538171,22271,FELTCRAFT DOLL ROSIE,2,2010-12-09 20:01:00,2.95,17530.0,United Kingdom
525457,538171,22750,FELTCRAFT PRINCESS LOLA DOLL,1,2010-12-09 20:01:00,3.75,17530.0,United Kingdom
525458,538171,22751,FELTCRAFT PRINCESS OLIVIA DOLL,1,2010-12-09 20:01:00,3.75,17530.0,United Kingdom
525459,538171,20970,PINK FLORAL FELTCRAFT SHOULDER BAG,2,2010-12-09 20:01:00,3.75,17530.0,United Kingdom


In [5]:
# Show the first five rows (five is the default for head, written out explicitly).
df.head(n=5)

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom


In [11]:
# Re-read the workbook, this time selecting the 'Year 2010-2011' sheet by name.
# This rebinds `df`, so the inspection cells that follow describe this sheet,
# not the default sheet loaded earlier.
df = pd.read_excel(
    "online_retail_II.xlsx",
    sheet_name="Year 2010-2011",
)
print("\nFirst 5 rows:")
df.head(n=5)


First 5 rows:


Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [10]:
# Quick first look: report the frame's dimensions as (rows, columns).
print(f"Dataset Shape: {df.shape}")

Dataset Shape: (541910, 8)


In [None]:
# Print a heading, then let df.info() write its own report below it:
# one line per column with its dtype and non-null count, which exposes missing values.
print("\nColumn Data Types and Non-Null Counts:")
df.info() # Crucial: shows data types and missing values!

In [None]:
# Print a heading, then display summary statistics as the cell's output.
# NOTE(review): with default arguments describe() summarises the numeric columns;
# newer pandas versions may also include datetime columns such as InvoiceDate - confirm.
print("\nBasic Descriptive Statistics:")
df.describe() # For numeric columns