## Importing necessary dependencies and data

In [136]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import plotly.express as px
import plotly.graph_objs as go 

import warnings
warnings.filterwarnings("ignore")

from sklearn.svm import SVC
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score, train_test_split, GridSearchCV, cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.cluster import KMeans
import xgboost as xgb
import time
import datetime as dt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from scipy.stats import uniform, randint
from sklearn.model_selection import train_test_split, RandomizedSearchCV

In [3]:
#Import cleaned data

df_data = pd.read_csv('data_cleaned.csv')

df_data

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Year,Month,Total_cost
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,2010,12,15.30
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,2010,12,20.34
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,2010,12,22.00
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,2010,12,20.34
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,2010,12,20.34
...,...,...,...,...,...,...,...,...,...,...,...
392727,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,2011-12-09 12:50:00,0.85,12680.0,France,2011,12,10.20
392728,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.10,12680.0,France,2011,12,12.60
392729,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France,2011,12,16.60
392730,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France,2011,12,16.60


### Useful functions for the process

Here are some of the functions that will be used in the analysis and prediction. 

In [71]:
### For K-means clustering ###

def order_cluster(df, target_field_name, cluster_field_name, ascending):
    """
    Sorts clusters based on the mean of a target field and reorders the original DataFrame accordingly.

    Parameters:
    - df: pandas DataFrame
      The input DataFrame containing the data.
    - target_field_name: str
      The name of the column for which the mean value will be computed for each cluster.
    - cluster_field_name: str
      The name of the column used for clustering. This column will be used to group the data.
    - ascending: bool
      Determines the sorting order of the target field's mean values. If True, sorts in ascending order; otherwise, sorts in descending order.

    Returns:
    - df_final: pandas DataFrame
      A DataFrame where the original clusters have been reordered based on the mean values of the target field. 
      The DataFrame will contain the original data with the cluster column renamed and sorted according to the mean values.
    """
    # Add the string "new_" to cluster_field_name
    new_cluster_field_name = "new_" + cluster_field_name
    
    # Create a new DataFrame by grouping the input DataFrame by cluster_field_name and calculating the mean of target_field_name
    df_new = df.groupby(cluster_field_name)[target_field_name].mean().reset_index()
    
    # Sort the new DataFrame by the mean of target_field_name
    df_new = df_new.sort_values(by=target_field_name, ascending=ascending).reset_index(drop=True)
    
    # Add an 'index' column to df_new with values representing the order of clusters
    df_new["index"] = df_new.index
    
    # Merge the original DataFrame with the sorted DataFrame based on cluster_field_name
    df_final = pd.merge(df, df_new[[cluster_field_name, "index"]], on=cluster_field_name)
    
    # Drop the original cluster_field_name column
    df_final = df_final.drop([cluster_field_name], axis=1)
    
    # Rename the 'index' column to cluster_field_name
    df_final = df_final.rename(columns={"index": cluster_field_name})
    
    return df_final


### For RFM feature comparison ###

def plot_scatter(df, x_feature, y_feature, segment_col='Segment'):
    """
    Creates a scatter plot comparing two features with different segments.

    Parameters:
    - df: DataFrame containing the data.
    - x_feature: String name of the feature for the x-axis.
    - y_feature: String name of the feature for the y-axis.
    - segment_col: String name of the column used for segmentation (default is 'Segment').

    Returns:
    - Plotly Figure object of the scatter plot.
    """
    
    # Filter out extreme values if needed
    filtered_df = df.query("Monetary < 50000 and Frequency < 2000")
    
    # Create plot data for each segment
    plot_data = [
        go.Scatter(
            x=filtered_df.query(f"{segment_col} == 'Low-Value'")[x_feature],
            y=filtered_df.query(f"{segment_col} == 'Low-Value'")[y_feature],
            mode='markers',
            name='Low-Value',
            marker=dict(
                size=7,
                line=dict(width=1),
                color='blue',
                opacity=0.8
            )
        ),
        
        go.Scatter(
            x=filtered_df.query(f"{segment_col} == 'Mid-Value'")[x_feature],
            y=filtered_df.query(f"{segment_col} == 'Mid-Value'")[y_feature],
            mode='markers',
            name='Mid-Value',
            marker=dict(
                size=9,
                line=dict(width=1),
                color='green',
                opacity=0.5
            )
        ),
        
        go.Scatter(
            x=filtered_df.query(f"{segment_col} == 'High-Value'")[x_feature],
            y=filtered_df.query(f"{segment_col} == 'High-Value'")[y_feature],
            mode='markers',
            name='High-Value',
            marker=dict(
                size=11,
                line=dict(width=1),
                color='red',
                opacity=0.9
            )
        ),
    ]

    # Define layout
    plot_layout = go.Layout(
        yaxis={'title': y_feature},
        xaxis={'title': x_feature},
        title='Segments', 
        title_x=0.5, 
        template="plotly_dark"
    )

    # Create and return the figure
    fig = go.Figure(
        data=plot_data, 
        layout=plot_layout
    )
    
    return fig


## Prediction of Customer Purchase 

The goal is to estimate if a given customer will buy something again from the online shop in the next 30 days. 

## RFM segmentation and K-means clustering

To proceed with machine learning models it's necessary to look at the behavior of Recency - Frequency - Monetary Value segmentation features. 
* Recency: Customers purchase behaviour based on their most recent purchase date and how many days they have been inactive since their last purchase.

* Frequency: Customers purchase behaviour based on the number of times they buy from the online retail shop.

* Monetary Value/Revenue: Customers purchase behaviour based the revenue they generate.

After that, I will apply K-means clustering to assign customers a score to each of the features.

In [24]:
df_data['InvoiceDate'] = pd.to_datetime(df_data['InvoiceDate']) #making sure the format is datetime

### Recency

Most recent purchase date of each customer and how many days they have been inactive.

In [97]:
rfm_train = df_data

current_date = dt.date(2011,12,9)
rfm_train['Purchase_Date'] = rfm_train.InvoiceDate.dt.date

recency = rfm_train.groupby('CustomerID')['Purchase_Date'].max().reset_index()
recency = recency.assign(Current_Date = current_date)

# Compute the number of days since last purchase
recency['Recency'] = recency.Purchase_Date.apply(lambda x: (current_date - x).days)

recency

Unnamed: 0,CustomerID,Purchase_Date,Current_Date,Recency
0,12346.0,2011-01-18,2011-12-09,325
1,12347.0,2011-12-07,2011-12-09,2
2,12348.0,2011-09-25,2011-12-09,75
3,12349.0,2011-11-21,2011-12-09,18
4,12350.0,2011-02-02,2011-12-09,310
...,...,...,...,...
4334,18280.0,2011-03-07,2011-12-09,277
4335,18281.0,2011-06-12,2011-12-09,180
4336,18282.0,2011-12-02,2011-12-09,7
4337,18283.0,2011-12-06,2011-12-09,3


In [98]:
#Using all the customers (non split dataset)
ctm_dt = pd.DataFrame(df_data['CustomerID'].unique())
ctm_dt.columns = ['CustomerID']


ctm_dt = pd.merge(ctm_dt, recency[['CustomerID', 'Recency']], on='CustomerID')
ctm_dt

Unnamed: 0,CustomerID,Recency
0,17850.0,372
1,13047.0,31
2,12583.0,2
3,13748.0,95
4,15100.0,333
...,...,...
4334,13436.0,1
4335,15520.0,1
4336,13298.0,1
4337,14569.0,1


In [99]:
 pd.DataFrame(ctm_dt.Recency.describe())

Unnamed: 0,Recency
count,4339.0
mean,92.041484
std,100.007757
min,0.0
25%,17.0
50%,50.0
75%,141.5
max,373.0


In [100]:
hist_fig = px.histogram(ctm_dt, 
                        x="Recency",
                        nbins=30,
                        title="Customers Recency in Days", 
                        template= "plotly_dark" 
                       )

hist_fig.update_layout(title_x=0.5, 
                       xaxis_title="Recency in groups of 30 days", 
                       yaxis_title="Number of Customers"
                      )

hist_fig.show(config={'displaylogo': False})

Next I will apply K-means clustering to assign a recency score. For that it's necessary to know how many clusters in order to use the K-means algorithm. I'll use Elbow Method to determine them.

In [101]:
my_dict={}
ctm_recency = ctm_dt[['Recency']]
for idx in range(1, 10):
    kmeans = KMeans(n_clusters=idx, max_iter=1000).fit(ctm_recency)
    ctm_recency["clusters"] = kmeans.labels_
    my_dict[idx] = kmeans.inertia_ 

line_fig = px.line(x=list(my_dict.keys()), 
                   y=list(my_dict.values()), 
                   template="plotly_dark"
                  )

line_fig.update_layout(title_x=0, 
                       xaxis_title="Number of cluster", 
                       yaxis_title=""
                      )

line_fig.show(config={'displaylogo': False})

From the Figure above, 4 seem to be the optimal one.

In [102]:
number_of_clusters = 4
kmeans = KMeans(n_clusters=number_of_clusters)
kmeans.fit(ctm_dt[['Recency']])
ctm_dt['RecencyCluster'] = kmeans.predict(ctm_dt[['Recency']])
ctm_dt.head()


Unnamed: 0,CustomerID,Recency,RecencyCluster
0,17850.0,372,1
1,13047.0,31,0
2,12583.0,2,0
3,13748.0,95,3
4,15100.0,333,1


In [103]:
#Using the order_cluster function to sort by cluster number
ctm_dt = order_cluster(ctm_dt, 'Recency', 'RecencyCluster', False)
ctm_dt

Unnamed: 0,CustomerID,Recency,RecencyCluster
0,17850.0,372,0
1,15100.0,333,0
2,18074.0,373,0
3,16250.0,261,0
4,13747.0,373,0
...,...,...,...
4334,14259.0,141,1
4335,17694.0,141,1
4336,17660.0,141,1
4337,15623.0,141,1


In [104]:
#print cluster characteristics
ctm_dt.groupby('RecencyCluster')['Recency'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
RecencyCluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,497.0,308.486922,39.020935,252.0,274.0,305.0,336.0,373.0
1,598.0,193.018395,31.587023,140.0,166.0,191.0,218.0,250.0
2,1012.0,84.609684,24.124447,53.0,64.0,78.0,103.0,138.0
3,2232.0,20.16129,14.640132,0.0,8.0,18.0,31.0,52.0


It can be seen from the above that cluster **3** covers the most recent customers whereas cluster **0** has the most inactive customers.


### Frequency

Customers purchase behaviour based on the number of times they buy from the online retail shop. 


In [105]:
#get order counts for each user and create a dataframe with it
frequency = df_data.groupby('CustomerID').InvoiceDate.count().reset_index()
frequency.columns = ['CustomerID','Frequency']

#add this data to main ctm_dt
ctm_dt = pd.merge(ctm_dt, frequency, on='CustomerID')

ctm_dt.head()

Unnamed: 0,CustomerID,Recency,RecencyCluster,Frequency
0,17850.0,372,0,297
1,15100.0,333,0,3
2,18074.0,373,0,13
3,16250.0,261,0,24
4,13747.0,373,0,1


In [106]:
pd.DataFrame(ctm_dt.Frequency.describe())

Unnamed: 0,Frequency
count,4339.0
mean,90.5121
std,225.515328
min,1.0
25%,17.0
50%,41.0
75%,98.0
max,7676.0


In [107]:
#Plot histogram for Frequency
hist_fig = px.histogram(ctm_dt.query('Frequency < 1200'), 
                        x="Frequency",
                        nbins=30,
                        title="Customers with Purchase Frequency less than 1200", 
                        template="plotly_dark")

hist_fig.update_layout(title_x=0.5, 
                       xaxis_title="Customer Frequency of Purchasesin groups of 30 days", 
                       yaxis_title="Number of Customers",
                       xaxis=dict(
                           tickmode='linear',  
                           dtick=100            
                       ))

hist_fig.show(config={'displaylogo': False})

In [108]:
kmeans = KMeans(n_clusters=number_of_clusters)
kmeans.fit(ctm_dt[['Frequency']])
ctm_dt['FrequencyCluster'] = kmeans.predict(ctm_dt[['Frequency']])

ctm_dt = order_cluster(ctm_dt, 'Frequency', 'FrequencyCluster', False)
ctm_dt.head()

Unnamed: 0,CustomerID,Recency,RecencyCluster,Frequency,FrequencyCluster
0,17850.0,372,0,297,2
1,15808.0,306,0,195,2
2,12583.0,2,3,247,2
3,14688.0,7,3,324,2
4,16029.0,38,3,241,2


In [109]:
#details of each cluster
ctm_dt.groupby('FrequencyCluster')['Frequency'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
FrequencyCluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,4.0,5718.0,1403.25265,4413.0,4936.5,5391.5,6173.0,7676.0
1,24.0,1309.958333,496.289929,828.0,959.0,1120.0,1517.0,2677.0
2,479.0,318.206681,128.638481,184.0,218.0,276.0,381.5,785.0
3,3832.0,48.538622,43.315107,1.0,15.0,33.0,71.0,183.0


As it was for the case of the Recency, higher frequency number means better customers. But as one can observe, the most representative cluster (**3**) has the lowest mean frequency value. 

### Monetary

In [110]:
monetary = df_data.groupby('CustomerID').Total_cost.sum().reset_index().rename(columns={'Total_cost':'Monetary'})

ctm_dt = pd.merge(ctm_dt, monetary, on='CustomerID')
ctm_dt.head()

Unnamed: 0,CustomerID,Recency,RecencyCluster,Frequency,FrequencyCluster,Monetary
0,17850.0,372,0,297,2,5391.21
1,15808.0,306,0,195,2,3651.27
2,12583.0,2,3,247,2,7281.38
3,14688.0,7,3,324,2,5579.1
4,16029.0,38,3,241,2,80850.84


In [111]:
#plot histogram
hist_fig = px.histogram(x=ctm_dt.query('Monetary < 10000')['Monetary'],
                        title="Customers with Monetary Value below 10000", 
                        template= "plotly_dark" 
                       )

hist_fig.update_layout(title_x=0.5, 
                       xaxis_title="Customers Revenue", 
                       yaxis_title="Number of Customers"
                      )

hist_fig.show(config={'displaylogo': False})


In [112]:
pd.DataFrame(ctm_dt.Monetary.describe())

Unnamed: 0,Monetary
count,4339.0
mean,2048.215924
std,8984.248352
min,0.0
25%,306.455
50%,668.56
75%,1660.315
max,280206.02


In [113]:
#apply clustering
kmeans = KMeans(n_clusters=number_of_clusters)
kmeans.fit(ctm_dt[['Monetary']])
ctm_dt['MonetaryCluster'] = kmeans.predict(ctm_dt[['Monetary']])

ctm_dt = order_cluster(ctm_dt, 'Monetary', 'MonetaryCluster', True)
ctm_dt.head()


Unnamed: 0,CustomerID,Recency,RecencyCluster,Frequency,FrequencyCluster,Monetary,MonetaryCluster
0,17850.0,372,0,297,2,5391.21,0
1,15808.0,306,0,195,2,3651.27,0
2,12583.0,2,3,247,2,7281.38,0
3,14688.0,7,3,324,2,5579.1,0
4,12431.0,35,3,235,2,6419.95,0


In [114]:
#details by cluster
ctm_dt.groupby('MonetaryCluster')['Monetary'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
MonetaryCluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,4301.0,1432.684084,2223.22179,0.0,304.56,658.64,1614.31,21429.39
1,31.0,46344.266452,17875.360382,25977.16,31870.25,40519.84,58636.28,91062.38
2,5.0,149739.814,31841.192074,117210.08,124914.53,143711.17,168472.5,194390.79
3,2.0,269931.66,14530.139257,259657.3,264794.48,269931.66,275068.84,280206.02


As highlighted before, the most representative cluster in this case (**0**) has the lowest mean monetary value.

### Overall Score



In [115]:
#calculate overall score and use mean() to see details
ctm_dt['OverallScore'] = ctm_dt['RecencyCluster'] + ctm_dt['FrequencyCluster'] + ctm_dt['MonetaryCluster']
ctm_dt.groupby('OverallScore')['Recency','Frequency','Monetary'].mean()

Unnamed: 0_level_0,Recency,Frequency,Monetary
OverallScore,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,339.0,246.0,4521.24
3,306.576,25.608,412.60364
4,181.01849,100.676425,1206.150495
5,63.546819,136.319514,2506.255999
6,21.908835,67.041643,1729.927384
7,8.7,381.9,88061.002
8,0.0,217.0,214064.9


The scoring above clearly shows us that customers with score **8** are our best customers whereas those who score **3** are the worst. With that the customers can be divided in three segments: 'Low-Value', 'Mid-Value' and 'High-Value'.


In [116]:
ctm_dt['Segment'] = 'Low-Value'
ctm_dt.loc[ctm_dt['OverallScore'] > 4, 'Segment'] = 'Mid-Value'
ctm_dt.loc[ctm_dt['OverallScore'] > 6, 'Segment'] = 'High-Value'

ctm_dt.head()

Unnamed: 0,CustomerID,Recency,RecencyCluster,Frequency,FrequencyCluster,Monetary,MonetaryCluster,OverallScore,Segment
0,17850.0,372,0,297,2,5391.21,0,2,Low-Value
1,15808.0,306,0,195,2,3651.27,0,2,Low-Value
2,12583.0,2,3,247,2,7281.38,0,5,Mid-Value
3,14688.0,7,3,324,2,5579.1,0,5,Mid-Value
4,12431.0,35,3,235,2,6419.95,0,5,Mid-Value


In [117]:
#A scatter plot of Monetary verses Frequency

plot_scatter(ctm_dt, 'Monetary', 'Frequency', segment_col='Segment')

In [118]:
#A scatter plot of Monetary versus Recency

plot_scatter(ctm_dt, 'Monetary', 'Recency', segment_col='Segment')

In [119]:
#A scatter plot of Recency verses Frequency

plot_scatter(ctm_dt, 'Recency', 'Frequency', segment_col='Segment')

Now, I'll create a copy of the dataframe ctm_dt and apply the method get_dummies to it so as to convert all categorical column Segment to indicator variables.

In [120]:
#create ctm_class as a copy of ctm_dt before applying get_dummies
ctm_class = ctm_dt.copy()
ctm_class = pd.get_dummies(ctm_class)
ctm_class.head()

Unnamed: 0,CustomerID,Recency,RecencyCluster,Frequency,FrequencyCluster,Monetary,MonetaryCluster,OverallScore,Segment_High-Value,Segment_Low-Value,Segment_Mid-Value
0,17850.0,372,0,297,2,5391.21,0,2,0,1,0
1,15808.0,306,0,195,2,3651.27,0,2,0,1,0
2,12583.0,2,3,247,2,7281.38,0,5,0,0,1
3,14688.0,7,3,324,2,5579.1,0,5,0,0,1
4,12431.0,35,3,235,2,6419.95,0,5,0,0,1


Since the goal is to estimate whether a customer will make a purchase in the next 30 days, I'll create a new column NextPurchaseDayRange with values as either 1 or 0 defined as follows:

* If the value is **1**, then it indicates that the customer will buy something in the next 30 days from his or her last purchase.
* The value **0** indicates that the customer will buy something in more than 30 days from his or her last purchase.

Before any further it is necessary to know what are the Next purchase days for the Customers in the dataset.

For that, I'll split the data into 2 sub dataframes:

* **df_main_period**: a DataFrame that excludes the last 30 days.
* **df_last_30_days**: a DataFrame with only last 30 days.


In [121]:
latest_date = df_data['InvoiceDate'].max()
cutoff_date = latest_date - pd.Timedelta(days=30)

# DataFrame que excluye los últimos 30 días
df_main_period = df_data[df_data['InvoiceDate'] < cutoff_date].reset_index(drop=True)

# DataFrame con solo los últimos 30 días
df_last_30_days = df_data[df_data['InvoiceDate'] >= cutoff_date].reset_index(drop=True)

In [122]:
#DataFrame with CustomerID and first purchase in last 30 days
ctm_1st_purchase_30_days = df_last_30_days.groupby('CustomerID').InvoiceDate.min().reset_index()
ctm_1st_purchase_30_days.columns = ['CustomerID', 'MinPurchaseDate']

#DataFrame with CustomerID and last purchase in main period
ctm_last_purchase_main_period = df_main_period.groupby('CustomerID').InvoiceDate.max().reset_index()
ctm_last_purchase_main_period.columns = ['CustomerID', 'MaxPurchaseDate']

ctm_purchase_dates = pd.merge(ctm_last_purchase_main_period, ctm_1st_purchase_30_days, on='CustomerID', how='left')

#Calculate days until next purchase
ctm_purchase_dates['NextPurchaseDay'] = (ctm_purchase_dates['MinPurchaseDate'] - ctm_purchase_dates['MaxPurchaseDate']).dt.days

#Fill NaN values
ctm_purchase_dates['NextPurchaseDay'] = ctm_purchase_dates['NextPurchaseDay'].fillna(-1)

ctm_class = pd.merge(ctm_class, ctm_purchase_dates[['CustomerID', 'NextPurchaseDay']], on='CustomerID', how='left')

ctm_class

Unnamed: 0,CustomerID,Recency,RecencyCluster,Frequency,FrequencyCluster,Monetary,MonetaryCluster,OverallScore,Segment_High-Value,Segment_Low-Value,Segment_Mid-Value,NextPurchaseDay
0,17850.0,372,0,297,2,5391.21,0,2,0,1,0,-1.0
1,15808.0,306,0,195,2,3651.27,0,2,0,1,0,-1.0
2,12583.0,2,3,247,2,7281.38,0,5,0,0,1,12.0
3,14688.0,7,3,324,2,5579.10,0,5,0,0,1,32.0
4,12431.0,35,3,235,2,6419.95,0,5,0,0,1,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
4334,16446.0,0,3,3,3,168472.50,2,8,1,0,0,204.0
4335,14156.0,9,3,1395,1,117210.08,2,6,0,0,1,3.0
4336,14911.0,1,3,5672,0,143711.17,2,5,0,0,1,1.0
4337,18102.0,0,3,431,2,259657.30,3,8,1,0,0,24.0


In [123]:
ctm_class['NextPurchaseDayRange'] = 1  ## less than 30 days
ctm_class.loc[ctm_class.NextPurchaseDay>30,'NextPurchaseDayRange'] = 0 # more than 30 days
ctm_class.head()

Unnamed: 0,CustomerID,Recency,RecencyCluster,Frequency,FrequencyCluster,Monetary,MonetaryCluster,OverallScore,Segment_High-Value,Segment_Low-Value,Segment_Mid-Value,NextPurchaseDay,NextPurchaseDayRange
0,17850.0,372,0,297,2,5391.21,0,2,0,1,0,-1.0,1
1,15808.0,306,0,195,2,3651.27,0,2,0,1,0,-1.0,1
2,12583.0,2,3,247,2,7281.38,0,5,0,0,1,12.0,1
3,14688.0,7,3,324,2,5579.1,0,5,0,0,1,32.0,0
4,12431.0,35,3,235,2,6419.95,0,5,0,0,1,-1.0,1


## Machine Learning Models

I'll use the ctm_class dataframe to evaluate different ML models and see their metrics.

In [124]:
ctm_class = ctm_class.drop('NextPurchaseDay', axis=1)

# Splitting into train and test 
X, y = ctm_class.drop('NextPurchaseDayRange', axis=1), ctm_class.NextPurchaseDayRange
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=None, shuffle=True)

In [125]:
# Create an array of models
models = []
models.append(("LogisticRegression", LogisticRegression()))
models.append(("GaussianNB", GaussianNB()))
models.append(("RandomForestClassifier", RandomForestClassifier()))
models.append(("SVC", SVC()))
models.append(("DecisionTreeClassifier", DecisionTreeClassifier()))
models.append(("xgb.XGBClassifier", xgb.XGBClassifier(eval_metric='mlogloss')))
models.append(("KNeighborsClassifier", KNeighborsClassifier()))

In [126]:
# Define scoring metrics
scoring = {
    'accuracy': 'accuracy',
    'f1_score': 'f1_macro',
    'recall': 'recall_macro',
    'precision': 'precision_macro'
}

# Dictionary for model scores
model_scores_dict = {
    'model_name': [], 
    'accuracy': [], 
    'f1_score': [], 
    'recall': [], 
    'precision': [], 
    'time': []
}

# Iterate over each model
for model_name, model in models:
    model_scores_dict['model_name'].append(model_name)
    
    # Initialize KFold
    kfold = KFold(n_splits=2, random_state=24, shuffle=True)
    
    start = time.time()
    
    # Evaluate each model using cross-validation
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=None)  # scoring=None here to get raw results
    
    # Collect and calculate metrics
    accuracy_scores = cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy')
    f1_scores = cross_val_score(model, X_train, y_train, cv=kfold, scoring='f1_macro')
    recall_scores = cross_val_score(model, X_train, y_train, cv=kfold, scoring='recall_macro')
    precision_scores = cross_val_score(model, X_train, y_train, cv=kfold, scoring='precision_macro')
    
    # Update dictionary with metrics
    model_scores_dict['accuracy'].append(np.mean(accuracy_scores))
    model_scores_dict['f1_score'].append(np.mean(f1_scores))
    model_scores_dict['recall'].append(np.mean(recall_scores))
    model_scores_dict['precision'].append(np.mean(precision_scores))
    
    model_scores_dict['time'].append(time.time() - start)

# Create DataFrame from the dictionary and sort
model_score_df = pd.DataFrame(model_scores_dict).set_index('model_name')
model_score_df.sort_values(by=['accuracy', 'f1_score', 'time'], ascending=False)

Unnamed: 0_level_0,accuracy,f1_score,recall,precision,time
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
RandomForestClassifier,0.858832,0.780284,0.786454,0.773106,3.585695
xgb.XGBClassifier,0.850187,0.764371,0.762159,0.766854,1.066761
DecisionTreeClassifier,0.83434,0.730786,0.736163,0.737086,0.122735
LogisticRegression,0.814465,0.697471,0.689271,0.709366,0.296246
SVC,0.799194,0.444194,0.5,0.399597,2.919875
KNeighborsClassifier,0.772688,0.576325,0.569537,0.606585,0.804352
GaussianNB,0.734371,0.699536,0.826737,0.711243,0.087628


Given that the best metrics, specially in f1_score and precision, are for the Random Forest Classifier, thi will be the model implemented. 

In [132]:
#features = ['Recency', 'Frequency', 'Monetary', 'RecencyCluster', 'FrequencyCluster', 'MonetaryCluster']
#X = ctm_class[features]

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and train the logistic regression model
model = LogisticRegression()
model.fit(X_train_scaled, y_train)

# Predict on the test set
y_pred = model.predict(X_test_scaled)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.8133640552995391
Classification Report:
              precision    recall  f1-score   support

           0       0.55      0.49      0.52       179
           1       0.87      0.90      0.88       689

    accuracy                           0.81       868
   macro avg       0.71      0.69      0.70       868
weighted avg       0.81      0.81      0.81       868



In [137]:
# Initialize an XGBoost classifier
xgb_clf = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss')

# Define the parameter grid for hyperparameter tuning
param_dist = {
    'n_estimators': randint(50, 300),           # Number of trees
    'learning_rate': uniform(0.01, 0.2),        # Learning rate
    'max_depth': randint(3, 10),                # Maximum depth of a tree
    'subsample': uniform(0.5, 0.5),             # Subsample ratio of the training instance
    'colsample_bytree': uniform(0.5, 0.5),      # Subsample ratio of columns when constructing each tree
    'gamma': uniform(0, 0.5),                   # Minimum loss reduction required to make a further partition
}

# Set up RandomizedSearchCV for hyperparameter tuning
random_search = RandomizedSearchCV(
    xgb_clf, 
    param_distributions=param_dist, 
    n_iter=50, 
    scoring='accuracy', 
    cv=3, 
    verbose=1, 
    n_jobs=-1,
    random_state=42
)

# Fit the model
random_search.fit(X_train, y_train)

# Best parameters found
print(f"Best parameters: {random_search.best_params_}")

# Predict on the test set
y_pred = random_search.best_estimator_.predict(X_test)

# Print classification report and accuracy
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))

Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best parameters: {'colsample_bytree': 0.8049983288913105, 'gamma': 0.41659745586808217, 'learning_rate': 0.04467293070155442, 'max_depth': 3, 'n_estimators': 213, 'subsample': 0.5911180438940311}
Accuracy: 0.8502304147465438
              precision    recall  f1-score   support

           0       0.61      0.77      0.68       179
           1       0.94      0.87      0.90       689

    accuracy                           0.85       868
   macro avg       0.77      0.82      0.79       868
weighted avg       0.87      0.85      0.86       868

