In [2]:
!pip install scikit-learn==1.0.2 statsmodels yellowbrick python-slugify sagemaker==2.88.0 s3fs 

Collecting sagemaker==2.88.0
  Downloading sagemaker-2.88.0.tar.gz (527 kB)
[K     |████████████████████████████████| 527 kB 4.1 MB/s 
[?25hCollecting s3fs
  Downloading s3fs-2022.3.0-py3-none-any.whl (26 kB)
Collecting attrs==20.3.0
  Downloading attrs-20.3.0-py2.py3-none-any.whl (49 kB)
[K     |████████████████████████████████| 49 kB 6.0 MB/s 
[?25hCollecting boto3>=1.20.21
  Downloading boto3-1.22.9-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 67.7 MB/s 
Collecting protobuf3-to-dict>=0.1.5
  Downloading protobuf3-to-dict-0.1.5.tar.gz (3.5 kB)
Collecting smdebug_rulesconfig==1.0.1
  Downloading smdebug_rulesconfig-1.0.1-py2.py3-none-any.whl (20 kB)
Collecting pathos
  Downloading pathos-0.2.8-py2.py3-none-any.whl (81 kB)
[K     |████████████████████████████████| 81 kB 10.7 MB/s 
[?25hCollecting botocore<1.26.0,>=1.25.9
  Downloading botocore-1.25.9-py3-none-any.whl (8.7 MB)
[K     |████████████████████████████████| 8.7 MB 60.7 MB/s 
[?25hCollect

# Data cleaning and Feature engineering

In [32]:
import os
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import plotly.offline as py
import plotly.graph_objs as go
import plotly.tools as tls
from slugify import slugify
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
import datetime as dt

In [33]:
# Load the raw Telco customer-churn export; every later cell works on this frame.
churn_csv_path = "/content/telco-customer-churn.csv"
churn_data = pd.read_csv(churn_csv_path)

In [34]:
# TotalCharges is read as text; a single blank string marks a missing value.
blank_as_missing = churn_data["TotalCharges"].replace(" ", np.nan)
churn_data["TotalCharges"] = blank_as_missing

# Keep only the rows that have a TotalCharges value and renumber the index
# from zero without keeping the old index as a column.
has_total_charges = churn_data["TotalCharges"].notnull()
churn_data = churn_data[has_total_charges].reset_index(drop=True)

# With the blanks gone the column can be converted to a numeric dtype.
churn_data["TotalCharges"] = churn_data["TotalCharges"].astype(float)
    
def tenure_label(churn_data):
    """Bucket one customer's tenure into a coarse label.

    Args:
        churn_data: A single record (e.g. a DataFrame row) that supports
            ``churn_data["tenure"]``.

    Returns:
        "0-24" when tenure <= 24, "24-48" when 24 < tenure <= 48,
        "48-end" when tenure > 48, and None when none of the comparisons
        hold (for example a NaN tenure).
    """
    months = churn_data["tenure"]
    if months <= 24:
        return "0-24"
    if 24 < months <= 48:
        return "24-48"
    if months > 48:
        return "48-end"
    return None
# Derive a categorical tenure bucket for every row.
churn_data["tenure_group"] = churn_data.apply(tenure_label, axis=1)

# For these add-on service columns, fold "No internet service" into "No".
replace_cols = [
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
]
for service_col in replace_cols:
    churn_data[service_col] = churn_data[service_col].replace('No internet service', 'No')


In [6]:
# Spot-check five random rows after cleaning. No random_state is passed, so
# the rows shown differ on every run.
churn_data.sample(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,tenure_group
4621,6112-KTHFQ,Female,0,No,No,13,Yes,No,No,No,...,No,No,No,Month-to-month,Yes,Mailed check,19.3,279.3,No,0-24
4401,5227-JSCFE,Male,1,Yes,No,71,No,No phone service,DSL,Yes,...,Yes,No,No,Two year,No,Credit card (automatic),46.35,3353.4,No,48-end
3742,6339-RZCBJ,Male,0,No,No,48,Yes,No,DSL,No,...,Yes,Yes,Yes,Two year,No,Credit card (automatic),78.9,3771.5,No,24-48
2088,5555-RNPGT,Male,0,No,Yes,10,Yes,Yes,No,No,...,No,No,No,Month-to-month,No,Credit card (automatic),24.8,223.9,No,0-24
1054,1624-WOIWJ,Female,0,No,No,10,Yes,No,Fiber optic,Yes,...,No,Yes,No,Month-to-month,Yes,Mailed check,84.7,832.05,Yes,0-24


In [35]:
# Number of distinct values per column. The following cells use these counts
# to pick the binary (== 2) and low-cardinality (<= 4) columns to encode.
churn_data.nunique()

customerID          7032
gender                 2
SeniorCitizen          2
Partner                2
Dependents             2
tenure                72
PhoneService           2
MultipleLines          3
InternetService        3
OnlineSecurity         2
OnlineBackup           2
DeviceProtection       2
TechSupport            2
StreamingTV            2
StreamingMovies        2
Contract               3
PaperlessBilling       2
PaymentMethod          4
MonthlyCharges      1584
TotalCharges        6530
Churn                  2
tenure_group           3
dtype: int64

In [36]:
# Columns holding exactly two distinct values are label-encoded in place.
distinct_counts = churn_data.nunique()
bin_cols = distinct_counts[distinct_counts == 2].keys().tolist()

# The same encoder object is re-fitted for every column, so after the loop it
# only remembers the classes of the last column processed.
le = LabelEncoder()
for binary_col in bin_cols:
    churn_data[binary_col] = le.fit_transform(churn_data[binary_col])

In [37]:
# Every column with at most four distinct values is treated as categorical.
distinct_counts = churn_data.nunique()
all_categorical_cols = distinct_counts[distinct_counts <= 4].keys().tolist()

# The binary columns were already label-encoded, so only the rest are expanded
# into one indicator column per category.
multi_value_cols = [
    name for name in all_categorical_cols
    if name not in bin_cols
]
churn_data = pd.get_dummies(data=churn_data, columns=multi_value_cols)

In [38]:
# Standardise the continuous features with StandardScaler, overwriting the
# original columns.
numerical_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
std = StandardScaler()
scaled_values = std.fit_transform(churn_data[numerical_cols])
churn_data[numerical_cols] = scaled_values

In [39]:
# Normalise every column name to a lowercase, underscore-separated slug
# (e.g. "MultipleLines_No phone service" -> "multiplelines_no_phone_service").
slugged_names = []
for original_name in churn_data.columns:
    slugged_names.append(slugify(original_name, lowercase=True, separator='_'))
churn_data.columns = slugged_names

# First rows of the processed frame, displayed in slices by the next cells.
sample = churn_data.head()

In [11]:
# Processed-frame preview, slice 1 of 4: identifier plus the first encoded
# binary/service columns and the scaled tenure.
sample[['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents',
       'tenure', 'phoneservice', 'onlinesecurity', 'onlinebackup',
       'deviceprotection', 'techsupport', 'streamingtv']]

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv
0,7590-VHVEG,0,0,1,0,-1.280248,0,0,1,0,0,0
1,5575-GNVDE,1,0,0,0,0.064303,1,1,0,1,0,0
2,3668-QPYBK,1,0,0,0,-1.239504,1,1,1,0,0,0
3,7795-CFOCW,1,0,0,0,0.512486,0,1,0,1,1,0
4,9237-HQITU,0,0,0,0,-1.239504,1,0,0,0,0,0


In [12]:
# Processed-frame preview, slice 2 of 4: scaled charges, the churn label and
# the first one-hot indicator columns.
sample[['streamingmovies', 'paperlessbilling', 'monthlycharges', 'totalcharges', 'churn',
       'multiplelines_no', 'multiplelines_no_phone_service',
       'multiplelines_yes', 'internetservice_dsl']]

Unnamed: 0,streamingmovies,paperlessbilling,monthlycharges,totalcharges,churn,multiplelines_no,multiplelines_no_phone_service,multiplelines_yes,internetservice_dsl
0,0,1,-1.161694,-0.994194,0,0,1,0,1
1,0,0,-0.260878,-0.17374,0,1,0,0,1
2,0,1,-0.363923,-0.959649,1,1,0,0,1
3,0,0,-0.74785,-0.195248,0,0,1,0,1
4,0,1,0.196178,-0.940457,1,1,0,0,0


In [13]:
# Processed-frame preview, slice 3 of 4: internet-service, contract and
# bank-transfer payment indicator columns.
sample[['internetservice_fiber_optic','internetservice_no',
       'contract_month_to_month', 'contract_one_year', 'contract_two_year',
       'paymentmethod_bank_transfer_automatic']]

Unnamed: 0,internetservice_fiber_optic,internetservice_no,contract_month_to_month,contract_one_year,contract_two_year,paymentmethod_bank_transfer_automatic
0,0,0,1,0,0,0
1,0,0,0,1,0,0
2,0,0,1,0,0,0
3,0,0,0,1,0,1
4,1,0,1,0,0,0


In [14]:
# Processed-frame preview, slice 4 of 4: remaining payment-method indicators
# and the one-hot encoded tenure buckets.
sample[['paymentmethod_credit_card_automatic','paymentmethod_electronic_check',
       'paymentmethod_mailed_check', 'tenure_group_0_24', 'tenure_group_24_48',
       'tenure_group_48_end']]

Unnamed: 0,paymentmethod_credit_card_automatic,paymentmethod_electronic_check,paymentmethod_mailed_check,tenure_group_0_24,tenure_group_24_48,tenure_group_48_end
0,0,1,0,1,0,0
1,0,0,1,0,1,0
2,0,0,1,1,0,0
3,0,0,0,0,1,0
4,0,1,0,1,0,0


# Feature group creation and ingestion

In [42]:
# import os
# os.environ["AWS_ACCESS_KEY_ID"] = "<aws_key_id>"
# os.environ["AWS_SECRET_ACCESS_KEY"] = "<aws_secret>"
# os.environ["AWS_DEFAULT_REGION"] = "us-east-1"

In [59]:
import boto3

FEATURE_GROUP_NAME = "telcom-customer-features"

# Ask SageMaker for feature groups whose name contains the target name.
# NameContains is a substring filter, so an exact comparison is still needed.
client = boto3.client('sagemaker')
response = client.list_feature_groups(
    NameContains=FEATURE_GROUP_NAME)

# "FeatureGroupSummaries" is a list of dicts, one per feature group. Testing
# `FEATURE_GROUP_NAME in <that list>` compared a string against dicts and was
# therefore always False; compare against each summary's name instead.
feature_group_exist = any(
    summary.get("FeatureGroupName") == FEATURE_GROUP_NAME
    for summary in response.get("FeatureGroupSummaries", [])
)

In [43]:
import sagemaker
from sagemaker.session import Session
import time

# Deployment-specific settings: replace <account_number> with a real account
# id; the bucket is used as the S3 location when the feature group is created.
s3_bucket_name = "feast-demo-mar-2022"
role = "arn:aws:iam::<account_number>:role/sagemaker-iam-role"

# Session built from the default AWS configuration; its region is recorded
# for later use.
sagemaker_session = Session()
region = sagemaker_session.boto_region_name

In [55]:
from sagemaker.feature_store.feature_group import FeatureGroup

# Handle for the customer feature group, bound to the session created above.
# The group itself is created by the later cell that calls `.create(...)`.
customers_feature_group = FeatureGroup(
    sagemaker_session=sagemaker_session,
    name=FEATURE_GROUP_NAME,
)

In [44]:
# Stamp every row with the same event time: the current Unix time rounded to
# whole seconds and stored as a float. This column is later passed to the
# feature group as its event-time feature.
current_epoch_seconds = round(time.time())
churn_data["event_timestamp"] = float(current_epoch_seconds)

In [57]:
import time

# FeatureDefinition / FeatureTypeEnum were used below without being imported
# anywhere in the notebook, which raised NameError on a fresh kernel.
from sagemaker.feature_store.feature_definition import (
    FeatureDefinition,
    FeatureTypeEnum,
)

if not feature_group_exist:
  # Infer feature definitions from the DataFrame dtypes for every column
  # except the record identifier, which is declared explicitly as a string.
  customers_feature_group.load_feature_definitions(
      churn_data[[col
                  for col in churn_data.columns
                  if col not in ["customerid"]]])
  customer_id_def = FeatureDefinition(feature_name='customerid',
                                      feature_type=FeatureTypeEnum.STRING)
  customers_feature_group.feature_definitions = [customer_id_def] + customers_feature_group.feature_definitions

  # Offline-only feature group stored under the configured S3 bucket.
  customers_feature_group.create(
    s3_uri=f"s3://{s3_bucket_name}/{FEATURE_GROUP_NAME}",
    record_identifier_name="customerid",
    event_time_feature_name="event_timestamp",
    role_arn=role,
    enable_online_store=False
    )

  # create() returns before the feature group is usable. Poll until it leaves
  # the "Creating" state so the ingestion cell does not run against a group
  # that is not ready yet.
  creation_status = customers_feature_group.describe().get("FeatureGroupStatus")
  while creation_status == "Creating":
    time.sleep(5)
    creation_status = customers_feature_group.describe().get("FeatureGroupStatus")
  if creation_status != "Created":
    raise RuntimeError(
        f"Feature group {FEATURE_GROUP_NAME} creation ended with status {creation_status}")


In [None]:
# Push every row of the processed frame into the feature group using a single
# worker, then display the rows that failed to ingest.
ingestion_results = customers_feature_group.ingest(
    data_frame=churn_data,
    max_workers=1,
)
ingestion_results.failed_rows