In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<font color = '#FF7F50'>
    
Content:
    
1. [Customer Relationship Management (CRM)](#1)
    * [What is CRM?](#2)
    * [What is the purpose?](#3)
    * [Content of Dataset](#4)
    * [Attribute Information](#5)
2. [Load Dataset](#6)
3. [Data Preprocessing](#7)
    * [Outliers](#8)
    * [Thresholds](#9)
4. [RFM](#10)
    * [What is RFM?](#11)
    * [RFM Segments](#12)
5. [CLTV Calculate](#13)
    * [What is Customer Lifetime Value(CLV or CLTV)?](#14)
6. [CLTV Predict](#15)
    * [What is a cohort model?](#18)
    * [BG-NBD](#19)
    * [Gamma Gamma](#20)
7. [And Finally CRM](#16)
8. [Final](#17)

<a id = "1"></a><br>
<font color = '#F1C40F'>
# **Customer Relationship Management (CRM)**
<img style="float: margin:1000px 50px 50px 1px; max-width:500px" src="https://previews.123rf.com/images/wrightstudio/wrightstudio1706/wrightstudio170604163/80553736-crm-customer-relationship-management-concept-customer-service-and-relationship-.jpg">


<font color = '#34495E'>
<a id = "2"></a><br>

### **What is CRM?**

Customer relationship management (CRM) is the combination of practices, strategies and technologies that companies use to manage and analyze customer interactions and data throughout the customer lifecycle. CRM aims to win new customers in addition to retaining existing ones. With CRM software, customers feel valued, which builds loyalty to the company or product. In this study, we will build a CRM analysis by computing RFM and then CLTV.

<font color = '#34495E'>
<a id = "3"></a><br>

### **What is the purpose?**

In this notebook, we will create CRM by making RFM and then CLTV.

<font color = '#34495E'>
<a id = "4"></a><br>

### **Content of Dataset**

This Online Retail II data set contains all the transactions occurring for a UK-based and registered, non-store online retail between 01/12/2009 and 09/12/2011. The company mainly sells unique all-occasion gift-ware. Many customers of the company are wholesalers. Only the "Year 2009-2010" sheet of the workbook is used in this notebook.

<font color = '#34495E'>
<a id = "5"></a><br>

### **Attribute Information:**

**InvoiceNo**: Invoice number (column `Invoice` in the data). If this code starts with the letter 'C', it indicates a cancellation.

**StockCode**: Product code.

**Description**: Product name.

**Quantity**: The quantities of each product per transaction.

**InvoiceDate**: Invoice date and time. The day and time when a transaction was generated.

**UnitPrice**: Unit price (column `Price` in the data). Product price per unit in sterling.

**CustomerID**: Customer number (column `Customer ID` in the data). A 5-digit integral number uniquely assigned to each customer.

**Country**: Country name. The name of the country where a customer resides.

In [None]:
pip install openpyxl

In [None]:
pip install xlrd

In [None]:
pip install lifetimes

<a id = "6"></a><br>
<font color = '#F1C40F'>

# Load Dataset

In [None]:
# Loading the required libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
import seaborn as sns
import missingno as msno
from sklearn.preprocessing import MinMaxScaler
from lifetimes import BetaGeoFitter
from lifetimes import GammaGammaFitter
from lifetimes.plotting import plot_period_transactions
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Reading the online-retail-dataset
# Only the "Year 2009-2010" sheet of the workbook is loaded here.
data = pd.read_excel("/kaggle/input/online-retail-ii-data-set-from-ml-repository/online_retail_II.xlsx",
                    sheet_name = "Year 2009-2010")

In [None]:
# Copy dataset
# Work on a copy (`df`) so that later changes to `df` do not touch `data`.
df = data.copy()
df.head()

<a id = "7"></a><br>
<font color = '#F1C40F'>
# Data Preprocessing

In [None]:
# Function for a quick structural check of a dataset
def check_df(dataframe, head=3, quantiles=(0, 0.05, 0.50, 0.95, 0.99, 1)):
    """Print a quick structural overview of ``dataframe``.

    The sections are printed in this order: shape, dtypes, first ``head`` rows,
    last ``head`` rows, number of missing values per column, and the requested
    quantiles of the numeric columns (transposed, one row per column).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to inspect. It is not modified.
    head : int, default 3
        Number of rows shown in the "Head" and "Tail" sections.
    quantiles : sequence of float, default (0, 0.05, 0.50, 0.95, 0.99, 1)
        Quantiles (between 0 and 1) shown in the "Quantiles" section.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    def section(title):
        print(f"##################### {title} #####################")

    section("Shape")
    print(dataframe.shape)
    section("Types")
    print(dataframe.dtypes)
    section("Head")
    print(dataframe.head(head))
    section("Tail")
    print(dataframe.tail(head))
    section("NA")
    print(dataframe.isnull().sum())
    section("Quantiles")
    # numeric_only=True: string/category columns have no quantiles and make
    # DataFrame.quantile raise on pandas >= 2.0, where the default became False.
    print(dataframe.quantile(list(quantiles), numeric_only=True).T)

In [None]:
check_df(df)

<a id = "8"></a><br>
<font color = '#D35400'>
### Outliers

**Check out descriptive statistics of numerical variables. See the difference between 75% and 99% values and then See the difference between 99% and max values. We can think that there are some outliers.**

In [None]:
# Missing value visualization
msno.bar(df);

In [None]:
msno.matrix(df);

In [None]:
# There is no specific correlation between missing values
msno.heatmap(df);

In [None]:
# Outlier value visualization
# Pass the Series as `x=` explicitly: a positional Series is read as `x` by
# seaborn 0.11 but as `data` by seaborn >= 0.12, which flips the orientation.
sns.boxplot(x=df["Quantity"]);

In [None]:
# Explicit `x=` keeps the box horizontal on every seaborn version.
sns.boxplot(x=df["Price"]);

<a id = "9"></a><br>
<font color = '#D35400'>
### Thresholds

**Outlier values are capped (only slightly) without damaging the data. We compute both a lower and an upper limit, but only the upper limit is applied: values above it are replaced with the upper limit. We do this for Quantity and Price.**

In [None]:
def outlier_thresholds(dataframe, variable, lower_quantile=0.01, upper_quantile=0.99, whisker=1.5):
    """Compute lower/upper outlier limits for one column.

    The limits follow the usual box-plot rule, but are built on the
    ``lower_quantile`` / ``upper_quantile`` of the column (1% and 99% by
    default) instead of the 25% / 75% quartiles, so only very extreme values
    fall outside them.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the column. It is not modified.
    variable : str
        Name of the numeric column.
    lower_quantile, upper_quantile : float, default 0.01 and 0.99
        Quantiles (between 0 and 1) that anchor the range.
    whisker : float, default 1.5
        Multiple of the quantile range added above / subtracted below.

    Returns
    -------
    tuple
        ``(low_limit, up_limit)``.

    Raises
    ------
    ValueError
        If ``lower_quantile`` is greater than ``upper_quantile``.
    """
    if lower_quantile > upper_quantile:
        raise ValueError("lower_quantile must not be greater than upper_quantile")

    lower_value = dataframe[variable].quantile(lower_quantile)
    upper_value = dataframe[variable].quantile(upper_quantile)
    quantile_range = upper_value - lower_value

    up_limit = upper_value + whisker * quantile_range
    low_limit = lower_value - whisker * quantile_range

    return low_limit, up_limit


def replace_with_thresholds(dataframe, variable):
    """Cap the values of ``variable`` at its upper outlier limit, in place.

    Only the upper limit returned by ``outlier_thresholds`` is applied; values
    below the lower limit are left untouched.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to modify. The column ``variable`` is replaced on this object.
    variable : str
        Name of the numeric column to cap.

    Returns
    -------
    None
    """
    _, up_limit = outlier_thresholds(dataframe, variable)
    # clip() builds a new Series and widens the dtype when needed, so a
    # fractional limit can be applied to an integer column. Writing the float
    # through .loc into an int column is deprecated silent upcasting in pandas.
    dataframe[variable] = dataframe[variable].clip(upper=up_limit)

In [None]:
# Descriptive statistics with extra percentiles (1% and 99%).
# Note: replace_with_thresholds has not been applied to df at this point,
# so the extreme values are still present in this summary.
df.describe([0.01,0.25,0.50,0.75,0.99]).T

<font color = '#D35400'>
    
**Rows with missing values are dropped, cancelled invoices are removed, and a new variable (TotalPrice) is created.**

In [None]:
def crm_data_prep(dataframe):
    """Clean the raw transactions and add the ``TotalPrice`` column.

    Steps, in order:
    1. drop every row that has a missing value in any column;
    2. drop cancelled invoices (``Invoice`` contains "C") and rows whose
       ``Quantity`` is not greater than zero;
    3. cap ``Quantity`` and ``Price`` with ``replace_with_thresholds``;
    4. compute ``TotalPrice = Quantity * Price``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Raw transactions with at least the columns ``Invoice``, ``Quantity``
        and ``Price``. The input frame is left unchanged.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame.
    """
    # No inplace=True: the caller's frame must not lose rows as a side effect.
    cleaned = dataframe.dropna(axis=0)

    # Invoice can hold both numbers and strings; cast to str so that the
    # .str accessor also works when the column contains no text at all.
    is_cancelled = cleaned["Invoice"].astype(str).str.contains("C", na=False)
    has_positive_quantity = cleaned["Quantity"] > 0

    # .copy() makes the result an independent frame, so the assignments below
    # do not operate on a view of the input.
    cleaned = cleaned.loc[~is_cancelled & has_positive_quantity].copy()

    # Threshold for Quantity and Price
    replace_with_thresholds(cleaned, "Quantity")
    replace_with_thresholds(cleaned, "Price")

    # Multiplying quantity and price for the new column total price
    cleaned["TotalPrice"] = cleaned["Quantity"] * cleaned["Price"]
    return cleaned

df = crm_data_prep(df)

In [None]:
# Explicit `x=` keeps the box horizontal on every seaborn version.
sns.boxplot(x=df["Quantity"]);

In [None]:
# Explicit `x=` keeps the box horizontal on every seaborn version.
sns.boxplot(x=df["Price"]);

In [None]:
check_df(df)

<a id = "10"></a><br>
<font color = '#F1C40F'>

# RFM 

<font color = '#34495E'>

**Now, we are creating RFM.**
    
<a id = "11"></a><br>

## **What is RFM?**
<p><img style="float: right;margin:-10px 20px 20px 5px; max-width:380px" src="https://d35fo82fjcw0y8.cloudfront.net/2018/03/01013508/Incontent_image.png"></p>

**RFM (Recency, Frequency, Monetary) is a method used for measuring customer value. An RFM analysis can show you who the most valuable customers for your business are: the ones who bought most recently, buy most often, and spend the most. First, these three metrics are calculated.**


In [None]:
df["InvoiceDate"].max()

In [None]:
def create_rfm(dataframe, today_date=None):
    """Build the RFM table and assign every customer to a named segment.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Cleaned transactions with the columns ``Customer ID``, ``InvoiceDate``
        (datetime), ``Invoice`` and ``TotalPrice``. It is not modified.
    today_date : datetime-like, optional
        Analysis date used to measure recency. Defaults to 2010-12-11.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``Customer ID`` with the columns ``recency`` (days between
        the customer's last invoice and ``today_date``), ``frequency`` (number
        of distinct invoices), ``monetary`` (sum of ``TotalPrice``) and
        ``rfm_segment``. Customers whose ``monetary`` is not positive are
        excluded.
    """
    if today_date is None:
        today_date = dt.datetime(2010, 12, 11)

    # Built-in aggregations instead of one Python lambda call per customer.
    per_customer = dataframe.groupby("Customer ID").agg(
        last_purchase=("InvoiceDate", "max"),
        frequency=("Invoice", "nunique"),
        monetary=("TotalPrice", "sum"),
    )
    per_customer["recency"] = (pd.Timestamp(today_date) - per_customer["last_purchase"]).dt.days

    # .copy() so that the score columns are added to an independent frame.
    rfm = per_customer.loc[per_customer["monetary"] > 0,
                           ["recency", "frequency", "monetary"]].copy()

    # Recency: a smaller value is better, hence the reversed labels.
    rfm["recency_score"] = pd.qcut(rfm["recency"], 5, labels=[5, 4, 3, 2, 1])
    # Frequency has many ties; ranking first keeps the quantile edges unique.
    rfm["frequency_score"] = pd.qcut(rfm["frequency"].rank(method="first"), 5, labels=[1, 2, 3, 4, 5])

    # Two-character code: recency score followed by frequency score.
    score_code = rfm["recency_score"].astype(str) + rfm["frequency_score"].astype(str)

    # Regex patterns over the two-character code -> segment name.
    seg_map = {
        r'[1-2][1-2]': 'hibernating',
        r'[1-2][3-4]': 'at_risk',
        r'[1-2]5': 'cant_loose',
        r'3[1-2]': 'about_to_sleep',
        r'33': 'need_attention',
        r'[3-4][4-5]': 'loyal_customers',
        r'41': 'promising',
        r'51': 'new_customers',
        r'[4-5][2-3]': 'potential_loyalists',
        r'5[4-5]': 'champions'
    }
    rfm["rfm_segment"] = score_code.replace(seg_map, regex=True)

    return rfm[["recency", "frequency", "monetary", "rfm_segment"]]

In [None]:
rfm = create_rfm(df)
rfm.head()

<a id = "12"></a><br>
<font color = '#F1C40F'>

## RFM Segments

<font color = '#34495E'>

**Then segments are created.**

**So why are we doing this?**

**We look for answers to these questions;**

**- Who is our most profitable customer?**

**- What do they appreciate in our products or services?**

**- Who are our new customers?**

**- How do we attract new customers to the company?**

**The answers to the questions are hidden in the segmentation.**

**With better RFM segmentation, we’ll be able to address certain segments in a personalized manner, based on their needs and preferences.**

**Browse the scheme to more easily understand segmentation.**


![](https://miro.medium.com/max/1234/0*JJBP4ToZiaw0HVPN.png)

In [None]:
# Rfm segments visualization: one bar chart per metric (first three columns), grouped by segment
for col in rfm.columns[:3]:
    fig, ax = plt.subplots(figsize=(15, 7))
    sns.barplot(data=rfm, x="rfm_segment", y=col, ax=ax)
    plt.show()

In [None]:
check_df(rfm)

<a id = "13"></a><br>
<font color = '#F1C40F'>

# CLTV Calculate

<a id = "14"></a><br>
<font color = '#34495E'>

<p><img style="float: right;margin:-10px 20px 20px 5px; max-width:380px" src="https://www.surveysensum.com/wp-content/uploads/2019/11/customer-lifetime-Value-SurveySensum.png"></p>

## What is Customer Lifetime Value(CLV or CLTV)?

**Customer lifetime value is how much money a customer will bring your brand throughout their entire time as a paying customer. It is the monetary value that a customer will give to a company during their relationship with that company.**

**In fact, it is to be able to extract the future situation from the current situation of the customer.**

**For this we will first make a simple calculation and then we will add the time factor.**

<font color = '#34495E'>
    

**This time we divided people into A, B, C segments. (Not to be confused with rfm segmentation.) Let's remember we do life-time value calculations.**

**The calculations in the table have been made.**

**Standardization process was done for better understanding**

<!-- TODO: the table image referenced above is missing (the original image link was empty). -->

In [None]:
# CLTV calculate functions

def create_cltv_calculate(dataframe, profit_margin_rate=0.05):
    """Compute a simple (non-predictive) CLTV score and an A/B/C segment.

    Formula, per customer::

        avg_order_value    = monetary / frequency
        purchase_frequency = frequency / number_of_customers
        cv                 = avg_order_value * purchase_frequency
        profit_margin      = monetary * profit_margin_rate
        cltv               = cv / churn_rate * profit_margin

    where ``churn_rate = 1 - share of customers with frequency > 1``.
    ``cltv`` is then rescaled to the range 1-100 (``cltv_calculated``) and
    split into three equal-sized groups labelled C (lowest), B and A (highest).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        RFM table with the columns ``recency``, ``frequency``, ``monetary``
        and ``rfm_segment``. The input frame is left unchanged.
    profit_margin_rate : float, default 0.05
        Share of the monetary value counted as profit.

    Returns
    -------
    pandas.DataFrame
        New frame with the columns ``recency``, ``frequency``, ``monetary``,
        ``rfm_segment``, ``cltv_calculated`` and ``cltv_calculated_segment``.

    Raises
    ------
    ValueError
        If every customer has frequency > 1, i.e. the churn rate is zero and
        the CLTV formula would divide by zero.
    """
    # Work on a copy: the helper columns must not leak into the caller's frame.
    frame = dataframe.copy()
    customer_count = frame.shape[0]

    # avg_order_value
    frame["avg_order_value"] = frame["monetary"] / frame["frequency"]

    # purchase_frequency
    frame["purchase_frequency"] = frame["frequency"] / customer_count

    # repeat rate & churn rate
    repeat_rate = int((frame["frequency"] > 1).sum()) / customer_count
    churn_rate = 1 - repeat_rate
    if churn_rate == 0:
        raise ValueError("churn rate is zero (every customer has frequency > 1); CLTV is undefined")

    # profit_margin
    frame["profit_margin"] = frame["monetary"] * profit_margin_rate

    # Customer Value
    frame["cv"] = frame["avg_order_value"] * frame["purchase_frequency"]

    # Customer Lifetime Value
    frame["cltv"] = (frame["cv"] / churn_rate) * frame["profit_margin"]

    # Rescale to 1-100; ravel() turns the (n, 1) result into a plain column.
    scaler = MinMaxScaler(feature_range=(1, 100))
    frame["cltv_calculated"] = scaler.fit_transform(frame[["cltv"]]).ravel()

    frame["cltv_calculated_segment"] = pd.qcut(frame["cltv_calculated"], 3, labels=["C", "B", "A"])

    return frame[["recency", "frequency", "monetary", "rfm_segment",
                  "cltv_calculated", "cltv_calculated_segment"]]

In [None]:
rfm_cltv = create_cltv_calculate(rfm)
check_df(rfm_cltv)

In [None]:
plt.figure(figsize=(15,7))
sns.barplot(x="rfm_segment",y="cltv_calculated",data=rfm_cltv);

In [None]:
plt.figure(figsize=(15,7));
sns.barplot(x="cltv_calculated",y="cltv_calculated_segment",data=rfm_cltv);

In [None]:
rfm_cltv.head()

<a id = "15"></a><br>
<font color = '#F1C40F'>

## CLTV Predict

<a id = "18"></a><br>
<font color = '#34495E'>
 

### **What is a cohort model?**
    
**Instead of simply assuming all the customers to be one group, we can try to split them into multiple groups and calculate the CLTV for each group.**

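**Note: recency is defined differently here: it is the time between a customer's first and last purchase, not the time since the last purchase. (One of the key differences between RFM and CLTV)**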

<a id = "19"></a><br>
<font color = '#34495E'>
    
### BG-NBD
**-Beta Geometric Negative Binomial Distribution-**

**In short, expected sales value. Used to estimate how many purchases customers can make over a period of time**

**This method computes the probability that a customer with history (frequency, recency_weekly, T_weekly) is currently alive. (relationship between recency & frequency)**

<a id = "20"></a><br>
<font color = '#34495E'>
    

### Gamma Gamma
**-conditional expected average profit per transaction-**

**Note1: There should be no correlation between the frequency of transactions and their monetary value.**

**Note2: We consider only customers who made repeat purchases with the business. In this notebook `frequency` counts all of a customer's invoices, so the filter applied in the code is frequency > 1; customers with a single invoice are one-time customers, are considered already dead, and are excluded.**

In [None]:
# The correlation seems very weak.
# (No plt.legend(): the scatter has no labelled artist, so a legend would only
# emit a warning and draw nothing.)

plt.scatter(rfm_cltv.monetary,rfm_cltv.frequency,s=75)

plt.xlabel("monetary")
plt.ylabel("frequency")
plt.show()

In [None]:
# CLTV predict functions

def create_cltv_predict(dataframe, today_date=None, month=6):
    """Predict CLTV per customer with the BG-NBD and Gamma-Gamma models.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Cleaned transactions with the columns ``Customer ID``, ``InvoiceDate``
        (datetime), ``Invoice`` and ``TotalPrice``. It is not modified.
    today_date : datetime-like, optional
        Analysis date used to compute the customer age ``T``.
        Defaults to 2010-12-11.
    month : int, default 6
        Prediction horizon passed as ``time`` to
        ``GammaGammaFitter.customer_lifetime_value``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``Customer ID`` with the columns
        ``recency_cltv_predicted`` (days between first and last invoice),
        ``T`` (days between first invoice and ``today_date``),
        ``monetary_avg`` (total spend / number of invoices),
        ``recency_weekly_cltv_predicted``, ``T_weekly`` (the same in weeks),
        ``expected_average_profit``, ``cltv_predicted`` (rescaled to 1-100)
        and ``cltv_predicted_segment`` (C lowest, A highest).
        Only customers with more than one invoice and a positive
        ``monetary_avg`` are kept.

    Notes
    -----
    NOTE(review): ``frequency`` here is the number of distinct invoices, and
    that raw count is what is passed to the fitters. The lifetimes
    documentation describes frequency as the number of *repeat* purchases;
    confirm that passing the raw invoice count is intended.
    """
    if today_date is None:
        today_date = dt.datetime(2010, 12, 11)
    analysis_date = pd.Timestamp(today_date)

    # Built-in aggregations instead of one Python lambda call per customer.
    per_customer = dataframe.groupby("Customer ID").agg(
        first_purchase=("InvoiceDate", "min"),
        last_purchase=("InvoiceDate", "max"),
        frequency=("Invoice", "nunique"),
        monetary=("TotalPrice", "sum"),
    )

    rfm = pd.DataFrame(index=per_customer.index)
    # Customer-specific recency: time between first and last purchase.
    rfm["recency_cltv_predicted"] = (per_customer["last_purchase"] - per_customer["first_purchase"]).dt.days
    # Customer age: time between first purchase and the analysis date.
    rfm["T"] = (analysis_date - per_customer["first_purchase"]).dt.days
    rfm["frequency"] = per_customer["frequency"]
    # Average spend per invoice.
    rfm["monetary_avg"] = per_customer["monetary"] / per_customer["frequency"]

    # The models below are fitted on weekly units (freq="W").
    rfm["recency_weekly_cltv_predicted"] = rfm["recency_cltv_predicted"] / 7
    rfm["T_weekly"] = rfm["T"] / 7

    # Keep customers with a positive average spend and more than one invoice.
    # .copy() so that the columns added below go to an independent frame.
    rfm = rfm[(rfm["monetary_avg"] > 0) & (rfm["frequency"] > 1)].copy()
    rfm["frequency"] = rfm["frequency"].astype(int)

    # BG-NBD: purchase-count model
    bgf = BetaGeoFitter(penalizer_coef=0.01)
    bgf.fit(rfm["frequency"],
            rfm["recency_weekly_cltv_predicted"],
            rfm["T_weekly"])

    # Gamma-Gamma: expected average profit per transaction
    ggf = GammaGammaFitter(penalizer_coef=0.01)
    ggf.fit(rfm["frequency"],
            rfm["monetary_avg"])
    rfm["expected_average_profit"] = ggf.conditional_expected_average_profit(rfm["frequency"],
                                                                             rfm["monetary_avg"])

    # CLTV over the requested horizon (6 months by default)
    rfm["cltv_predicted"] = ggf.customer_lifetime_value(bgf,
                                                        rfm["frequency"],
                                                        rfm["recency_weekly_cltv_predicted"],
                                                        rfm["T_weekly"],
                                                        rfm["monetary_avg"],
                                                        time=month,
                                                        freq="W",
                                                        discount_rate=0.01)

    # Rescale to 1-100; ravel() turns the (n, 1) result into a plain column.
    scaler = MinMaxScaler(feature_range=(1, 100))
    rfm["cltv_predicted"] = scaler.fit_transform(rfm[["cltv_predicted"]]).ravel()

    # Three equal-sized segments, C (lowest) to A (highest)
    rfm["cltv_predicted_segment"] = pd.qcut(rfm["cltv_predicted"], 3, labels=["C", "B", "A"])

    return rfm[["recency_cltv_predicted", "T", "monetary_avg",
                "recency_weekly_cltv_predicted", "T_weekly",
                "expected_average_profit",
                "cltv_predicted", "cltv_predicted_segment"]]

In [None]:
rfm_cltv_predicted = create_cltv_predict(df)
check_df(rfm_cltv_predicted)

In [None]:
rfm_cltv_predicted.head()

In [None]:
# Mean expected average profit per predicted-CLTV segment.
# Select the column with [] (agg('<column name>') only worked by accident) and
# state `observed` explicitly because the grouping key is categorical.
(rfm_cltv_predicted
 .groupby('cltv_predicted_segment', observed=False)['expected_average_profit']
 .mean()
 .plot(kind='bar', colormap='copper_r'));
plt.ylabel("profit");

<a id = "16"></a><br>
<font color = '#F1C40F'>

### And finally CRM!

In [None]:
# Left join keeps every customer of rfm_cltv. Customers that create_cltv_predict
# filtered out (frequency <= 1 or non-positive monetary_avg) get NaN in the
# prediction columns.
crm_final = rfm_cltv.merge(rfm_cltv_predicted, on="Customer ID", how="left")
check_df(crm_final)

In [None]:
# will be effective in campaign decisions

crm_final.sort_values(by="monetary_avg", ascending=False).head()

In [None]:
crm_final.sort_values(by="cltv_predicted", ascending=False).head()

<a id = "17"></a><br>
<font color = '#34495E'>

## Final

**So we can analyze each metric comparatively.**

**We can make different decisions for customers in different segments.**

**We can make different campaigns according to the decisions taken.**

**After that, the productivity of the campaign can be measured by making different offers to the masses.**

**A/B test can be used for this.**