In [1]:
# META DATA - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

    # Developer details: 
        # Name: Harish S
        # Role: Architect
        # Code ownership rights: Harish S
    # Version:
        # Version: V 1.0 (August 29th )
            # Developer: Harish S
     
    # Description: The code enables to explore features of feature tools framework
    
# CODE - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

# Dependency: 
    # Environment:     
        #Python 3.12

# Introduction to Feature tools

Feature tools is an automated feature engineering framework applied on complex datasets to extract meaningful features to apply on machine learning models 

Feature tools applies a series of transformations and aggregations to generate features by Deep Feature Synthesis technique 

Feature tools link : https://www.featuretools.com/

Installation procedure : https://featuretools.alteryx.com/en/stable/install.html 

Documentation : https://featuretools.alteryx.com/en/stable/guides/guides_index.html 


In [2]:
# -------------------------- Code starts here ------------------------- 

1. Import Foundational Libraries 

In [3]:
import featuretools as ft #import feature tools for running featuring engineering on data 
import pandas as pd #import pandas for data manipulation 
import numpy as np #numerical computation

2. Import other libraries

In [4]:
import os # os library for system related operations
import warnings # warnings library to control warning messages
warnings.filterwarnings("ignore")

3. Read the dataframes

In [5]:
# Input the path where we have saved the dataframes 
datapath=input("enter tha path of data folder ")

    a) Customer data

In [7]:
cx_df=pd.read_csv(datapath+'olist_customers_dataset.csv')

    b) Payment data 

In [8]:
pay_df=pd.read_csv(datapath+'olist_order_payments_dataset.csv')

    c) Orders data

In [9]:
ord_df=pd.read_csv(datapath+'olist_orders_dataset.csv')

4. Create an entity set which is a feature from feature tools library , basically it defines the relationship between dataframes

In [10]:
es=ft.EntitySet(id='cx_ord_pay')

5. Add the first dataframe to the entity set

In [11]:
es.add_dataframe(dataframe_name='cxdata',dataframe=cx_df,index='customer_id')

Entityset: cx_ord_pay
  DataFrames:
    cxdata [Rows: 99441, Columns: 5]
  Relationships:
    No relationships

6. As order_id is not unique , we are supposed to create a dummy index column 


In [12]:
pay_df["pay_index"] = pay_df.index
es = es.add_dataframe(dataframe_name="paydata", dataframe=pay_df, index="pay_index")

7. Add third dataframe to entity set

In [13]:
es.add_dataframe(dataframe_name='orderdata',dataframe=ord_df,index='order_id')

Entityset: cx_ord_pay
  DataFrames:
    cxdata [Rows: 99441, Columns: 5]
    paydata [Rows: 103886, Columns: 6]
    orderdata [Rows: 99441, Columns: 8]
  Relationships:
    No relationships

8. Define Patent , child dataframes and its indices

we link two dataframes which has common columns for eg customer id / order id

In [14]:
cx_order = ft.Relationship(
    entityset=es,
    parent_dataframe_name="cxdata", # consider custumer df as parent
    parent_column_name="customer_id", 
    child_dataframe_name="orderdata",
    child_column_name="customer_id",#customer id is common between customer and order dataframe
)

In [15]:
cx_pay=ft.Relationship(
    entityset=es,
    parent_dataframe_name="orderdata", # consider order df as parent 
    parent_column_name="order_id",
    child_dataframe_name="paydata",
    child_column_name="order_id",# order id is common between order and payment dataframe ,there by a relationship is formed between 3 dataframes 
)

9. Add Relationship between dataframe

In [16]:
es.add_relationship(relationship=cx_order) #add relationship to entity

Entityset: cx_ord_pay
  DataFrames:
    cxdata [Rows: 99441, Columns: 5]
    paydata [Rows: 103886, Columns: 6]
    orderdata [Rows: 99441, Columns: 8]
  Relationships:
    orderdata.customer_id -> cxdata.customer_id

In [17]:
es.add_relationship(relationship=cx_pay) #add relationship to entity

Entityset: cx_ord_pay
  DataFrames:
    cxdata [Rows: 99441, Columns: 5]
    paydata [Rows: 103886, Columns: 6]
    orderdata [Rows: 99441, Columns: 8]
  Relationships:
    orderdata.customer_id -> cxdata.customer_id
    paydata.order_id -> orderdata.order_id

10. RUN Deep Feature Synthesis to produce new columns

- Once DFS is run, we get feature matrix  and feature definition
- Feature Matrix can be used to run ML models 

In [21]:
# Generate features by running DFS
feature_matrix, feature_defs = ft.dfs(entityset=es, target_dataframe_name="paydata",agg_primitives=["mean", "sum", "count"], 
    trans_primitives=["month", "day"])


In [23]:
feature_matrix.head() # visualise feature matrix generated by DFS

Unnamed: 0_level_0,payment_sequential,payment_type,payment_installments,payment_value,orderdata.order_status,orderdata.COUNT(paydata),orderdata.MEAN(paydata.payment_installments),orderdata.MEAN(paydata.payment_sequential),orderdata.MEAN(paydata.payment_value),orderdata.SUM(paydata.payment_installments),...,orderdata.DAY(order_estimated_delivery_date),orderdata.DAY(order_purchase_timestamp),orderdata.MONTH(order_approved_at),orderdata.MONTH(order_delivered_carrier_date),orderdata.MONTH(order_delivered_customer_date),orderdata.MONTH(order_estimated_delivery_date),orderdata.MONTH(order_purchase_timestamp),orderdata.cxdata.customer_zip_code_prefix,orderdata.cxdata.customer_city,orderdata.cxdata.customer_state
pay_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,credit_card,8,99.33,delivered,1,8.0,1.0,99.33,8.0,...,22,25,4,5,5,5,4,39801,teofilo otoni,MG
1,1,credit_card,1,24.39,delivered,1,1.0,1.0,24.39,1.0,...,16,26,6,6,6,7,6,2422,sao paulo,SP
2,1,credit_card,1,65.71,delivered,1,1.0,1.0,65.71,1.0,...,4,12,12,12,12,1,12,2652,sao paulo,SP
3,1,credit_card,8,107.78,delivered,1,8.0,1.0,107.78,8.0,...,4,6,12,12,12,1,12,36060,juiz de fora,MG
4,1,credit_card,2,128.45,delivered,1,2.0,1.0,128.45,2.0,...,13,21,5,5,6,6,5,18570,conchas,SP


In [24]:
feature_matrix.info() #check the feature matrix information 

<class 'pandas.core.frame.DataFrame'>
Index: 103886 entries, 0 to 103885
Data columns (total 25 columns):
 #   Column                                          Non-Null Count   Dtype   
---  ------                                          --------------   -----   
 0   payment_sequential                              103886 non-null  int64   
 1   payment_type                                    103886 non-null  category
 2   payment_installments                            103886 non-null  int64   
 3   payment_value                                   103886 non-null  float64 
 4   orderdata.order_status                          103886 non-null  category
 5   orderdata.COUNT(paydata)                        103886 non-null  Int64   
 6   orderdata.MEAN(paydata.payment_installments)    103886 non-null  float64 
 7   orderdata.MEAN(paydata.payment_sequential)      103886 non-null  float64 
 8   orderdata.MEAN(paydata.payment_value)           103886 non-null  float64 
 9   orderdata.SUM(paydat

Original dataframes have orginally around 17 features but using DFS we could produce  extra Features which can be used to run ML models, this is particularly useful when datasets are huge and manual feature engineering is not an option 

In [25]:
# Import Sklearn libraries to perform preprocessing and running as a pipeline

from sklearn.model_selection import train_test_split # to split the dataframe
from sklearn.preprocessing import StandardScaler, OneHotEncoder # for preprocessing the data
from sklearn.impute import SimpleImputer # to impute the missing values if any
from sklearn.compose import ColumnTransformer # preprocess subsets of data 
from sklearn.pipeline import Pipeline # pipeline allows to perform series of preprocessing steps on the data

In [26]:
# Separate features and target
X = feature_matrix.drop(columns=['payment_type'])
y = feature_matrix['payment_type']

In [27]:
# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['category', 'object']).columns
numerical_cols = X.select_dtypes(include=['number']).columns

In [28]:
# Preprocessing pipelines for both numerical and categorical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')) #impute by median values
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),#impute by most frequent values
    ('onehot', OneHotEncoder(handle_unknown='ignore'))#one hot encode the categorical data
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols), # initiates transformation on numerical data
        ('cat', categorical_transformer, categorical_cols) #initiates transformation on categorical data
    ])

# Apply preprocessing 
X_preprocessed = preprocessor.fit_transform(X)

In [29]:
#  use label encoding on target column
from sklearn.preprocessing import LabelEncoder

In [30]:
label_encoder = LabelEncoder() #initiate label encoding

In [31]:
y_encoded = label_encoder.fit_transform(y) #transform the labels to numerical

In [32]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y_encoded, test_size=0.25, random_state=42)

### ML models

LOGISTIC REGRESSION

In [33]:
from sklearn.linear_model import LogisticRegression
lrmodel = LogisticRegression(penalty='elasticnet',solver='saga',l1_ratio=0.3)
lrmodel.fit(X_train, y_train)

In [34]:
lry_pred = lrmodel.predict(X_test)#predict on test data 

XGBOOST 

In [35]:
import xgboost as xgb
from xgboost import XGBClassifier #import xgboost

In [36]:
xgb_model = XGBClassifier(max_depth=3,n_estimators=25,gamma=0.2,learning_rate=0.001,min_child_weight=5,reg_lambda=3)

In [37]:
# Train the model
xgb_model.fit(X_train, y_train)

In [38]:
# predictions on Xgboost
y_pred = xgb_model.predict(X_test) 

In [39]:
from sklearn.metrics import classification_report # import classification report to check the various metrics

LOGISTIC REGRESSION METRICS

In [40]:
print(classification_report(y_test, lry_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      5011
           1       0.74      1.00      0.85     19130
           2       0.00      0.00      0.00       356
           3       0.00      0.00      0.00         1
           4       1.00      0.02      0.04      1474

    accuracy                           0.74     25972
   macro avg       0.35      0.20      0.18     25972
weighted avg       0.60      0.74      0.63     25972



XGBOOST METRICS

In [41]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.54      0.45      0.49      5011
           1       0.84      0.91      0.88     19130
           2       0.82      0.03      0.05       356
           3       0.00      0.00      0.00         1
           4       0.98      0.73      0.84      1474

    accuracy                           0.80     25972
   macro avg       0.63      0.42      0.45     25972
weighted avg       0.79      0.80      0.79     25972



CONCLUSION

Hence by using feature tools , we could generate the transformations and aggregations of features and use these features for ML modelling

The ML model performance can be improved by right set of features and hyper parameter tuning