In [1]:
!pip install feast[aws]

Collecting feast[aws]
  Downloading feast-0.19.3-py3-none-any.whl (289 kB)
[K     |████████████████████████████████| 289 kB 3.9 MB/s 
[?25hCollecting uvicorn[standard]>=0.14.0
  Downloading uvicorn-0.17.6-py3-none-any.whl (53 kB)
[K     |████████████████████████████████| 53 kB 2.0 MB/s 
[?25hCollecting pydantic>=1.0.0
  Downloading pydantic-1.9.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.9 MB)
[K     |████████████████████████████████| 10.9 MB 64.7 MB/s 
[?25hCollecting PyYAML>=5.4.*
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 53.9 MB/s 
Collecting toml==0.10.*
  Downloading toml-0.10.2-py2.py3-none-any.whl (16 kB)
Collecting googleapis-common-protos==1.52.*
  Downloading googleapis_common_protos-1.52.0-py2.py3-none-any.whl (100 kB)
[K     |████████████████████████████████| 100 kB 9.2 MB/s 
[?25hCollecting grpcio-reflection>

In [2]:
import pandas as pd
from datetime import datetime, timedelta, date
from sklearn.cluster import KMeans

##Read the data and filter out data that belongs to country other than UK
retail_data = pd.read_csv('/content/OnlineRetail.csv', encoding= 'unicode_escape')
retail_data['InvoiceDate'] = pd.to_datetime(retail_data['InvoiceDate'], errors = 'coerce')
uk_data = retail_data.query("Country=='United Kingdom'").reset_index(drop=True)


In [3]:
## Create 3months and 6 months data frames
t1 = pd.Timestamp("2011-06-01 00:00:00.054000")
t2 = pd.Timestamp("2011-03-01 00:00:00.054000")
t3 = pd.Timestamp("2011-12-01 00:00:00.054000")
uk_data_3m = uk_data[(uk_data.InvoiceDate < t1) & (uk_data.InvoiceDate >= t2)].reset_index(drop=True)
uk_data_6m = uk_data[(uk_data.InvoiceDate >= t1) & (uk_data.InvoiceDate < t3)].reset_index(drop=True)


In [4]:
## Calculate RFM values.
uk_data_3m['revenue'] = uk_data_3m['UnitPrice'] * uk_data_3m['Quantity']
max_date = uk_data_3m['InvoiceDate'].max() + timedelta(days=1)
rfm_data = uk_data_3m.groupby(['CustomerID']).agg({
        'InvoiceDate': lambda x: (max_date - x.max()).days,
        'InvoiceNo': 'count',
        'revenue': 'sum'})
rfm_data.rename(columns={'InvoiceDate': 'Recency',
                         'InvoiceNo': 'Frequency',
                         'revenue': 'MonetaryValue'}, inplace=True)


In [5]:
## Calculate RFM groups of customers 
r_grp = pd.qcut(rfm_data['Recency'], q=4, labels=range(3,-1,-1))
f_grp = pd.qcut(rfm_data['Frequency'], q=4, labels=range(0,4))
m_grp = pd.qcut(rfm_data['MonetaryValue'], q=4, labels=range(0,4))
rfm_data = rfm_data.assign(R=r_grp.values).assign(F=f_grp.values).assign(M=m_grp.values)
rfm_data['R'] = rfm_data['R'].astype(int)
rfm_data['F'] = rfm_data['F'].astype(int)
rfm_data['M'] = rfm_data['M'].astype(int)
rfm_data['RFMScore'] = rfm_data['R'] + rfm_data['F'] + rfm_data['M']

In [6]:
# segment customers.
rfm_data['Segment'] = 'Low-Value'
rfm_data.loc[rfm_data['RFMScore']>4,'Segment'] = 'Mid-Value' 
rfm_data.loc[rfm_data['RFMScore']>6,'Segment'] = 'High-Value' 
rfm_data = rfm_data.reset_index()

In [7]:
# Calculate revenue using the six month dataframe.
uk_data_6m['revenue'] = uk_data_6m['UnitPrice'] * uk_data_6m['Quantity']
revenue_6m = uk_data_6m.groupby(['CustomerID']).agg({
        'revenue': 'sum'})
revenue_6m.rename(columns={'revenue': 'Revenue_6m'}, inplace=True)
revenue_6m = revenue_6m.reset_index()

In [9]:
# Merge the 6m revenue data frame with RFM data.
merged_data = pd.merge(rfm_data, revenue_6m, how="left")
merged_data.fillna(0)

Unnamed: 0,CustomerID,Recency,Frequency,MonetaryValue,R,F,M,RFMScore,Segment,Revenue_6m
0,12747.0,7,35,1082.09,3,2,3,8,High-Value,1666.11
1,12748.0,1,582,4336.73,3,3,3,9,High-Value,18679.01
2,12749.0,8,54,782.10,3,3,3,9,High-Value,2323.04
3,12821.0,23,6,92.72,2,0,0,2,Low-Value,0.00
4,12823.0,63,1,459.00,0,0,2,2,Low-Value,765.00
...,...,...,...,...,...,...,...,...,...,...
1835,18272.0,21,59,966.74,2,3,3,8,High-Value,1730.16
1836,18273.0,66,1,51.00,0,0,0,0,Low-Value,102.00
1837,18280.0,86,10,180.60,0,0,0,0,Low-Value,0.00
1838,18283.0,9,100,217.15,3,3,1,7,High-Value,1351.83


In [10]:
# Create LTV cluster groups
merged_data = merged_data[merged_data['Revenue_6m']<merged_data['Revenue_6m'].quantile(0.99)]
kmeans = KMeans(n_clusters=3)
kmeans.fit(merged_data[['Revenue_6m']])
merged_data['LTVCluster'] = kmeans.predict(merged_data[['Revenue_6m']])

In [11]:
feature_data = pd.get_dummies(merged_data)
feature_data.head(5)

Unnamed: 0,CustomerID,Recency,Frequency,MonetaryValue,R,F,M,RFMScore,Revenue_6m,LTVCluster,Segment_High-Value,Segment_Low-Value,Segment_Mid-Value
0,12747.0,7,35,1082.09,3,2,3,8,1666.11,0,1,0,0
1,12748.0,1,582,4336.73,3,3,3,9,18679.01,1,1,0,0
2,12749.0,8,54,782.1,3,3,3,9,2323.04,0,1,0,0
4,12823.0,63,1,459.0,0,0,2,2,765.0,0,0,1,0
7,12836.0,28,62,814.71,1,3,3,7,951.46,0,1,0,0


In [13]:
feature_data.columns = ['CustomerID', 'Recency', 'Frequency', 'MonetaryValue', 'R', 'F', 'M',
       'RFMScore', 'Revenue6m', 'LTVCluster', 'SegmentHighValue',
       'SegmentLowValue', 'SegmentMidValue']

In [22]:
feature_data.head(5)

Unnamed: 0,CustomerID,Recency,Frequency,MonetaryValue,R,F,M,RFMScore,Revenue6m,LTVCluster,SegmentHighValue,SegmentLowValue,SegmentMidValue,event_timestamp,created_timestamp
0,12747.0,7,35,1082.09,3,2,3,8,1666.11,0,1,0,0,2022-03-14 03:44:04.961602,2022-03-14 03:44:04.963311
1,12748.0,1,582,4336.73,3,3,3,9,18679.01,1,1,0,0,2022-03-14 03:44:04.961602,2022-03-14 03:44:04.963311
2,12749.0,8,54,782.1,3,3,3,9,2323.04,0,1,0,0,2022-03-14 03:44:04.961602,2022-03-14 03:44:04.963311
4,12823.0,63,1,459.0,0,0,2,2,765.0,0,0,1,0,2022-03-14 03:44:04.961602,2022-03-14 03:44:04.963311
7,12836.0,28,62,814.71,1,3,3,7,951.46,0,1,0,0,2022-03-14 03:44:04.961602,2022-03-14 03:44:04.963311


In [1]:
import os
os.environ["AWS_ACCESS_KEY_ID"] = "<aws_key>"
os.environ["AWS_SECRET_ACCESS_KEY"] = "<aws_secret>"
os.environ["AWS_DEFAULT_REGION"] = "us-east-1"

In [21]:
from datetime import datetime
file_name = f"rfm_features-{datetime.now()}.parquet" 
feature_data["event_timestamp"] = datetime.now()
feature_data["created_timestamp"] = datetime.now()
feature_data['CustomerID'] = feature_data['CustomerID'].astype(str)

In [24]:
!pip install s3fs

Collecting s3fs
  Downloading s3fs-2022.2.0-py3-none-any.whl (26 kB)
Collecting aiobotocore~=2.1.0
  Downloading aiobotocore-2.1.2.tar.gz (58 kB)
[K     |████████████████████████████████| 58 kB 2.0 MB/s 
Collecting aiohttp<=4
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 6.5 MB/s 
[?25hCollecting botocore<1.23.25,>=1.23.24
  Downloading botocore-1.23.24-py3-none-any.whl (8.4 MB)
[K     |████████████████████████████████| 8.4 MB 35.2 MB/s 
Collecting aioitertools>=0.5.1
  Downloading aioitertools-0.10.0-py3-none-any.whl (23 kB)
Collecting multidict<7.0,>=4.5
  Downloading multidict-6.0.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (94 kB)
[K     |████████████████████████████████| 94 kB 2.9 MB/s 
[?25hCollecting async-timeout<5.0,>=4.0.0a3
  Downloading async_timeout-4.0.2-py3-none-any.whl (5.8 kB)
Collecting frozenlist>=1.1.1
  Downl

In [30]:

s3_url = f's3://feast-demo-mar-2022/customer-rfm-features/{file_name}'
feature_data.to_parquet(s3_url)