In [None]:
!pip install feast[aws] pandas xgboost

Collecting feast[aws]
  Downloading feast-0.19.3-py3-none-any.whl (289 kB)
[K     |████████████████████████████████| 289 kB 5.3 MB/s 
Collecting fastapi>=0.68.0
  Downloading fastapi-0.75.0-py3-none-any.whl (54 kB)
[K     |████████████████████████████████| 54 kB 3.0 MB/s 
Collecting googleapis-common-protos==1.52.*
  Downloading googleapis_common_protos-1.52.0-py2.py3-none-any.whl (100 kB)
[K     |████████████████████████████████| 100 kB 10.2 MB/s 
[?25hCollecting toml==0.10.*
  Downloading toml-0.10.2-py2.py3-none-any.whl (16 kB)
Collecting dask<2022.02.0,>=2021.*
  Downloading dask-2022.1.1-py3-none-any.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 47.8 MB/s 
Collecting PyYAML>=5.4.*
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 46.6 MB/s 
[?25hCollecting pandavro==1.5.*
  Downloading pandavro-1.5.2.tar.gz (3.8 kB)
Collecting fas

In [None]:
import pandas as pd

# Load the raw Online Retail export.
retail_data = pd.read_csv('/content/OnlineRetail.csv', encoding='unicode_escape')

# Parse the invoice timestamps; values that cannot be parsed become NaT instead of raising.
retail_data['InvoiceDate'] = pd.to_datetime(retail_data['InvoiceDate'], errors='coerce')

# Keep only the rows whose country is the United Kingdom and renumber the index from zero.
is_uk = retail_data['Country'] == 'United Kingdom'
uk_data = retail_data.loc[is_uk].reset_index(drop=True)

In [None]:
# Number of distinct CustomerID values across all UK rows.
# NOTE(review): unique() keeps NaN as a value of its own, so rows without a
# CustomerID (if there are any) add one to this count — confirm that is intended.
entity_df = uk_data.loc[:, ["CustomerID"]]
entity_df["CustomerID"].unique().size

3951

In [None]:
# Window bounds: t2 is the inclusive start, t1 the exclusive end.
t2 = pd.Timestamp("2011-03-01 00:00:00.054000")
t1 = pd.Timestamp("2011-06-01 00:00:00.054000")

# Keep the UK invoices dated inside [t2, t1) and renumber the index from zero.
in_window = (uk_data["InvoiceDate"] >= t2) & (uk_data["InvoiceDate"] < t1)
uk_data_3m = uk_data.loc[in_window].reset_index(drop=True)

# Number of distinct CustomerID values seen in that window
# (unique() counts NaN, if present, as one value).
entity_df = uk_data_3m.loc[:, ["CustomerID"]]
entity_df["CustomerID"].unique().size

1841

In [None]:
from datetime import datetime

# Build the entity dataframe passed to the feature store: one row per customer
# seen in the three-month window, every row carrying the same timestamp.
#
# Missing CustomerIDs are dropped first: unique() would otherwise keep NaN and
# str() would turn it into a spurious "nan" entity row.
# IDs are stringified exactly as pandas holds them (e.g. "14620.0"); the format
# is deliberately left untouched here.
customer_ids = uk_data_3m["CustomerID"].dropna().unique()
entity_df = pd.DataFrame(data={
    "CustomerID": [str(customer_id) for customer_id in customer_ids.tolist()],
    # A single scalar timestamp is broadcast to every row.
    "event_timestamp": datetime.now(),
})
entity_df.head()

Unnamed: 0,CustomerID,event_timestamp
0,14620.0,2022-03-27 01:14:32.527633
1,14740.0,2022-03-27 01:14:32.527633
2,13880.0,2022-03-27 01:14:32.527633
3,16462.0,2022-03-27 01:14:32.527633
4,17068.0,2022-03-27 01:14:32.527633


In [None]:
%cd customer_segmentation

#import feast and load feature store object with the path to the directory which contains feature_story.yaml.
from feast import FeatureStore
store = FeatureStore(repo_path=".")
for entity in store.list_entities():
  print(f"entity: {entity}")

/content/customer_segmentation
Feast is an open source project that collects anonymized error reporting and usage statistics. To opt out or learn more see https://docs.feast.dev/reference/usage


  from numpy.dual import register_func
  supported_dtypes = [np.typeDict[x] for x in supported_dtypes]


entity: {
  "spec": {
    "name": "customer",
    "valueType": "STRING",
    "description": "Id of the customer",
    "joinKey": "CustomerID"
  },
  "meta": {
    "createdTimestamp": "2022-03-14T23:41:33.070471Z",
    "lastUpdatedTimestamp": "2022-03-14T23:41:33.070471Z"
  }
}


In [None]:
# Look up the RFM feature view by name and print its protobuf form
# (feature list, TTL and batch source).
feature_view = store.get_feature_view("customer_rfm_features")
feature_view_proto = feature_view.to_proto()
print(feature_view_proto)

spec {
  name: "customer_rfm_features"
  entities: "customer"
  features {
    name: "Recency"
    value_type: INT32
  }
  features {
    name: "Frequency"
    value_type: INT32
  }
  features {
    name: "MonetaryValue"
    value_type: DOUBLE
  }
  features {
    name: "R"
    value_type: INT32
  }
  features {
    name: "F"
    value_type: INT32
  }
  features {
    name: "M"
    value_type: INT32
  }
  features {
    name: "RFMScore"
    value_type: INT32
  }
  features {
    name: "Revenue6m"
    value_type: DOUBLE
  }
  features {
    name: "LTVCluster"
    value_type: INT32
  }
  features {
    name: "SegmentHighValue"
    value_type: INT32
  }
  features {
    name: "SegmentLowValue"
    value_type: INT32
  }
  features {
    name: "SegmentMidValue"
    value_type: INT32
  }
  ttl {
    seconds: 315360000
  }
  batch_source {
    type: BATCH_REDSHIFT
    event_timestamp_column: "event_timestamp"
    created_timestamp_column: "created_timestamp"
    redshift_options {
      query



In [None]:
import os
from datetime import datetime
from getpass import getpass

# AWS credentials: reuse what the environment already provides and prompt only for
# what is missing, so secrets are never typed into the notebook source and
# credentials that are already configured are not overwritten by placeholders.
for credential_var in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"):
    if not os.environ.get(credential_var):
        os.environ[credential_var] = getpass(f"{credential_var}: ")
os.environ["AWS_DEFAULT_REGION"] = "us-east-1"

# Retrieve historical feature values for every row of entity_df.
# Feature references use the names exactly as registered in the
# customer_rfm_features view ("Recency", not "recency").
job = store.get_historical_features(
    entity_df=entity_df,
    features=[
        "customer_rfm_features:Recency",
        "customer_rfm_features:MonetaryValue",
        "customer_rfm_features:R",
        "customer_rfm_features:M",
        "customer_rfm_features:RFMScore",
        "customer_rfm_features:SegmentHighValue",
        "customer_rfm_features:SegmentLowValue",
        "customer_rfm_features:SegmentMidValue",
        "customer_rfm_features:LTVCluster",
    ],
)

# Materialise the result as a dataframe and drop rows with any missing value.
feature_data = job.to_df().dropna()
feature_data.head()



Unnamed: 0,customerid,event_timestamp,recency,monetaryvalue,r,m,rfmscore,segmenthighvalue,segmentlowvalue,segmentmidvalue,ltvcluster
0,12747.0,2022-03-27 01:14:32.527633,7.0,1082.09,3.0,3.0,8.0,1.0,0.0,0.0,0.0
2,12841.0,2022-03-27 01:14:32.527633,31.0,548.34,1.0,2.0,6.0,0.0,0.0,1.0,0.0
3,12849.0,2022-03-27 01:14:32.527633,77.0,704.5,0.0,2.0,4.0,0.0,1.0,0.0,0.0
4,12854.0,2022-03-27 01:14:32.527633,50.0,401.82,1.0,2.0,5.0,0.0,0.0,1.0,0.0
5,12863.0,2022-03-27 01:14:32.527633,7.0,-17.0,3.0,0.0,3.0,0.0,1.0,0.0,0.0


In [None]:
from sklearn.metrics import classification_report,confusion_matrix
import xgboost as xgb
from sklearn.model_selection import KFold, cross_val_score, train_test_split

# Fixed seed so the train/test split, and therefore the metrics reported below,
# are the same on every run.
SPLIT_SEED = 42

# Predictors: every retrieved column except the label and the two entity columns.
X = feature_data.drop(columns=['ltvcluster', 'customerid', 'event_timestamp'])
# Target: the LTV cluster label.
y = feature_data['ltvcluster']

# Hold out 10% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=SPLIT_SEED
)

In [None]:
# Classifier settings: trees limited to depth 5, multi-class probability objective.
xgb_params = {
    "max_depth": 5,
    "objective": 'multi:softprob',
}
xgb_classifier = xgb.XGBClassifier(**xgb_params)

# Train on the training split, then score the result on the held-out rows.
xgb_model = xgb_classifier.fit(X_train, y_train)
acc = xgb_model.score(X_test, y_test)
print(f"Model accuracy: {acc}")

Model accuracy: 0.9202898550724637


In [None]:
# Predict the held-out rows and print the per-class precision / recall / f1 table.
y_pred = xgb_model.predict(X_test)
report_text = classification_report(y_test, y_pred)
print(report_text)

              precision    recall  f1-score   support

         0.0       0.94      0.99      0.96       117
         1.0       0.50      0.25      0.33         4
         2.0       0.83      0.59      0.69        17

    accuracy                           0.92       138
   macro avg       0.76      0.61      0.66       138
weighted avg       0.91      0.92      0.91       138



In [None]:
!pip install joblib



In [None]:
import joblib

# Serialise the trained model to disk. dump() is left as the last expression so
# the cell displays the file names it reports having written.
model_path = '/content/customer_segment-v0.0'
joblib.dump(xgb_model, model_path)

['/content/customer_segment-v0.0']

In [None]:
# Round-trip check: reload the saved model and predict the first rows of the
# held-out set (head() with its default row count).
loaded_model = joblib.load('/content/customer_segment-v0.0')
sample_rows = X_test.head()
prediction = loaded_model.predict(sample_rows)
prediction.tolist()

[0.0, 0.0, 0.0, 2.0, 0.0]

In [None]:
import boto3

# Upload the serialised model file to S3: local path -> bucket / object key.
local_model_path = '/content/customer_segment-v0.0'
bucket_name = "feast-demo-mar-2022"
object_key = "model-repo/customer_segment-v0.0"

s3_client = boto3.client('s3')
# NOTE(review): upload_file is documented as returning nothing, so `response`
# is expected to be None; the name is kept only because it already existed.
response = s3_client.upload_file(local_model_path, bucket_name, object_key)

None
