In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    input_paths = [os.path.join(folder, name) for name in names]
    for input_path in input_paths:
        print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the "Year 2010-2011" sheet of the Online Retail II workbook.
data = pd.read_excel(
    "../input/online-retail-ii-data-set-from-ml-repository/online_retail_II.xlsx",
    sheet_name="Year 2010-2011",
)
# Work on a copy so the freshly loaded frame in `data` stays untouched.
df = data.copy(deep=True)

In [None]:
# Render floats with two decimal places wherever pandas displays them.
pd.set_option('display.float_format', '{:.2f}'.format)

# **DATA PREPARATION**

In [None]:
# First look at the dataset: show the first five rows.
df.head(n=5)

In [None]:
# Column dtypes, non-null counts and the total number of rows.
# Author's reading of the output: four categorical, three numerical and one
# datetime variable, and 541910 observations (figures come from the original
# run, not from this code - re-check them if the input file changes).
df.info()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Summary statistics of the numerical columns, with extra percentiles so
# the tails are visible. Negative "Quantity" and "Price" values seen here
# are removed in the next step.
df.describe(percentiles=[0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99])

In [None]:
# Drop every row whose Quantity or Price is negative.
# The explicit .copy() makes `df` an independent frame, so later column
# assignments on it do not write to a slice of the original
# (which would trigger pandas' SettingWithCopyWarning).
negative_rows = (df["Quantity"] < 0) | (df["Price"] < 0)
df = df[~negative_rows].copy()
df.describe()

In [None]:
# Check whether any cancelled invoices remain (their invoice code contains "C").
# "Invoice" is read from Excel as an object column that can mix integers and
# strings; casting to str first keeps the check working even when only
# integer invoice numbers are left (the .str accessor rejects an
# all-integer object column).
df[df["Invoice"].astype(str).str.contains("C", na=False)]

In [None]:
# Number of distinct product descriptions (missing values are not counted).
df["Description"].nunique(dropna=True)

In [None]:
# Top five products by total ordered quantity.
(
    df.groupby("Description")[["Quantity"]]
    .sum()
    .sort_values(by="Quantity", ascending=False)
    .head(n=5)
)

In [None]:
# Total number of distinct invoices.
df["Invoice"].nunique(dropna=True)

In [None]:
# How much money has been earned per invoice?
# TotalPrice is the value of a single invoice line (quantity * unit price).
# .assign returns a new frame, so no column is written into a filtered slice.
df = df.assign(TotalPrice=df["Quantity"] * df["Price"])

# An invoice's revenue is the SUM of its line totals (the mean would only
# give the average line value). Show the ten largest invoices.
(
    df.groupby("Invoice")[["TotalPrice"]]
    .sum()
    .sort_values(by="TotalPrice", ascending=False)
    .head(10)
)

In [None]:
# The five invoice lines with the highest unit price.
df.sort_values(by="Price", ascending=False).head(n=5)

In [None]:
# How many orders per country?
# One order is one invoice, and an invoice spans many rows (one per product),
# so count distinct invoices per country rather than rows.
df.groupby("Country")["Invoice"].nunique().sort_values(ascending=False)

In [None]:
# Total revenue per country, top five.
(
    df.groupby("Country")[["TotalPrice"]]
    .sum()
    .sort_values(by="TotalPrice", ascending=False)
    .head(n=5)
)

# **CUSTOMER SEGMENTATION**

In [None]:
# Preview the prepared frame before building the RFM table.
df.head(n=5)

In [None]:
# Most recent invoice timestamp in the data.
df.loc[:, "InvoiceDate"].max()

In [None]:
import datetime as dt

# Reference "today" used to measure recency. The value is hard-coded;
# presumably it was picked to fall just after the last invoice date
# inspected in the previous cell - revisit it if the input data changes.
today_date = dt.datetime(year=2011, month=12, day=11)

In [None]:
# Build the raw RFM table, one row per customer:
#   InvoiceDate -> days between today_date and the customer's latest invoice
#   Invoice     -> number of distinct invoices
#   TotalPrice  -> total amount spent
rfm = df.groupby("Customer ID").agg(
    {
        "InvoiceDate": lambda dates: (today_date - dates.max()).days,
        "Invoice": "nunique",
        "TotalPrice": "sum",
    }
)

In [None]:
# Preview the aggregate; columns still carry the source column names here.
rfm.head(n=5)

In [None]:
# Rename the aggregated columns positionally to their RFM meaning.
rfm.columns = [
    "Recency",    # was InvoiceDate
    "Frequency",  # was Invoice
    "Monetary",   # was TotalPrice
]

In [None]:
# Preview the table with the Recency / Frequency / Monetary column names.
rfm.head(n=5)

In [None]:
# Turn each metric into a quintile score from 1 to 5.
# Recency is the time since the last purchase, so a LOWER value is better:
# its labels run 5 -> 1. For Frequency and Monetary a HIGHER value is better:
# their labels run 1 -> 5.
# Frequency is ranked with method="first" before binning so that tied values
# get distinct ranks and the quantile edges stay unique.
rfm["Recency_Score"] = pd.qcut(rfm["Recency"], 5, labels=list(range(5, 0, -1)))
rfm["Frequency_Score"] = pd.qcut(
    rfm["Frequency"].rank(method="first"), 5, labels=list(range(1, 6))
)
rfm["Monetary_Score"] = pd.qcut(rfm["Monetary"], 5, labels=list(range(1, 6)))

In [None]:
# Concatenate the three scores, as text, into one code: recency digit,
# then frequency digit, then monetary digit.
rfm["RFM_Score"] = rfm["Recency_Score"].astype("str").str.cat(
    [rfm["Frequency_Score"].astype("str"), rfm["Monetary_Score"].astype("str")]
)

In [None]:
# Preview the table with the score columns and the combined RFM_Score.
rfm.head(n=5)

In [None]:
# Map the two-character recency+frequency code to a named segment.
# Each key is a regex over that code: first character = Recency score,
# second character = Frequency score. Together the patterns cover all
# 25 score combinations.
seg_map = {
    # low recency score (1-2)
    r'[1-2][1-2]': 'Hibernating',
    r'[1-2][3-4]': 'At_Risk',
    r'[1-2]5': 'Cant_Loose',
    # middle recency score (3)
    r'3[1-2]': 'About_to_Sleep',
    r'33': 'Need_Attention',
    r'[3-4][4-5]': 'Loyal_Customers',
    # high recency score (4-5)
    r'41': 'Promising',
    r'51': 'New_Customers',
    r'[4-5][2-3]': 'Potential_Loyalists',
    r'5[4-5]': 'Champions'
}

# Keep only the first two characters of RFM_Score (the monetary digit is
# not used for segmentation), then translate them through the regex map.
rfm["Segment"] = rfm["RFM_Score"].str[:2].replace(seg_map, regex=True)

In [None]:
# Preview the table with the Segment label attached to each customer.
rfm.head(n=5)

In [None]:
# Per-segment mean and customer count of the three numeric RFM metrics.
# The metric columns are selected explicitly: rfm also holds categorical
# score columns and the string RFM_Score, and taking "mean" of those raises
# a TypeError in recent pandas (older versions silently dropped them).
seg = rfm.groupby("Segment")[["Recency", "Frequency", "Monetary"]].agg(["mean", "count"])

In [None]:
# Flatten the two-level (metric, statistic) column labels into single
# "metric_statistic" strings; a label with an empty second level keeps
# just its first level.
seg.columns = [
    top if stat == "" else top + "_" + stat
    for top, stat in ((col[0], col[1]) for col in seg.columns)
]

In [None]:
# Display the per-segment summary table built in the previous cells.
seg