# Customers Feature Engineering

In [2]:
import random

import pandas as pd

In [3]:
import zipfile
from pathlib import Path

# Locate the data folder and the destination for the extracted files
data_path = Path('../data/')
unzipped_data_path = data_path / 'unzipped'

# Extract the raw customers archive into the unzipped folder
customers_zip_path = data_path / 'raw' / 'customers.csv.zip'
with zipfile.ZipFile(customers_zip_path, mode='r') as customers_archive:
    print('Unzipping customers dataset...')
    customers_archive.extractall(path=unzipped_data_path)

Unzipping customers dataset...


### Data Inspection

First, we inspect the customers data

In [4]:
# Load the extracted CSV into a DataFrame and show its (rows, columns) shape
customers_csv_path = unzipped_data_path / 'customers.csv'
customers_df = pd.read_csv(customers_csv_path)
customers_df.shape

(1371980, 7)

In [5]:
# Preview the first rows of the raw data (head() shows 5 rows by default)
customers_df.head()

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,,,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,,,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,,,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,,,ACTIVE,NONE,54.0,5d36574f52495e81f019b680c843c443bd343d5ca5b1c2...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,ACTIVE,Regularly,52.0,25fa5ddee9aac01b35208d01736e57942317d756b32ddd...


In [6]:
# Inspect one randomly chosen row. random.randrange excludes its upper bound,
# so the position is always a valid index; random.randint(0, len(df)) could
# return len(df) itself and make .iloc raise IndexError.
customers_df.iloc[random.randrange(len(customers_df))]

customer_id               eb216d3cd0e0a8d530e3321c06d5ecce49c003119be3a5...
FN                                                                      1.0
Active                                                                  1.0
club_member_status                                                   ACTIVE
fashion_news_frequency                                            Regularly
age                                                                    49.0
postal_code               4021c8ba1aa7b64d50b707c1dbcd4880d042d6c9434cc5...
Name: 1259864, dtype: object

In [7]:
# Count the missing values in each column
customers_df.isnull().sum(axis=0)

customer_id                    0
FN                        895050
Active                    907576
club_member_status          6062
fashion_news_frequency     16011
age                        15861
postal_code                    0
dtype: int64

### Feature Selection

Apparently, there are many rows with null values. We need to take care of those. There are many ways to do this, but we can start with choosing which features may be relevant in the development of our recommender system:
- customer_id
- club_member_status
- age
- postal_code

In [None]:
# Columns considered relevant for the recommender system
required_columns = ['customer_id', 'club_member_status', 'age', 'postal_code']

# Fail early with a clear message if any expected column is absent
missing_columns = [col for col in required_columns if col not in customers_df.columns]
if missing_columns:
    raise ValueError(f"Columns {', '.join(missing_columns)} not found in the DataFrame")

# Keep only the selected columns. The explicit copy makes the result an
# independent frame, so the later cells that fill, drop and add columns
# modify it directly rather than a selection of the raw data.
customers_df = customers_df[required_columns].copy()

### Handling Missing Values in `club_member_status`

Next, let's take care of the missing values in the `club_member_status` column. First, we check the possible values in this column.

In [18]:
# Tally each status once and reuse the result for both printouts
status_counts = customers_df['club_member_status'].value_counts()
print(status_counts)

# value_counts() skips nulls, so the shortfall against the row count is the null count
print('\nNull values in club_member_status:')
print(len(customers_df) - status_counts.sum())

club_member_status
ACTIVE        1272491
PRE-CREATE      92960
LEFT CLUB         467
Name: count, dtype: int64

Null values in club_member_status:
6062


There are multiple possible reasons why this column contains null values. Normally, we should consult the data engineers or back-end engineers to clarify what null means in the system. For now, let's assume a null value means the customer is not a member. We will replace the null values with `ABSENT`.

In [19]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: the in-place form acts on an intermediate object, which
# pandas warns about and which leaves the DataFrame unchanged under Copy-on-Write.
customers_df['club_member_status'] = customers_df['club_member_status'].fillna('ABSENT')
print(customers_df['club_member_status'].value_counts())

club_member_status
ACTIVE        1272491
PRE-CREATE      92960
ABSENT           6062
LEFT CLUB         467
Name: count, dtype: int64


### Handling Missing `Age` Values and Customer Segmentation

There are many ways to handle missing values for age (e.g. imputation). However, for now, let's keep it simple and drop the rows with null age values.

In [20]:
# Report how many ages are missing before the rows are removed
age_nulls_before = customers_df['age'].isnull().sum()
print('Null values in age before dropping:')
print(age_nulls_before)

# Remove every row whose age is null
customers_df.dropna(subset=['age'], inplace=True)

# Report again to confirm that no null ages remain
age_nulls_after = customers_df['age'].isnull().sum()
print('\nNull values in age after dropping:')
print(age_nulls_after)

Null values in age before dropping:
15861

Null values in age after dropping:
0


Then, instead of using the age column as is, let's create an additional feature that captures the age group of the customer. We will group the customers into the following age groups:
- 0-18
- 19-25
- 26-35
- 36-45
- 46-55
- 56-65
- 66+

In [28]:
def create_age_group(age):
    """Map an age to the label of the age group it belongs to.

    Groups are '0-18', '19-25', '26-35', '36-45', '46-55', '56-65' and '66+'.
    Each group is bounded by the lower edge of the next one, so fractional
    ages (e.g. 18.5 or 25.5) stay in the group of their whole-year age
    instead of falling through to '66+'.

    Args:
        age: The customer's age as an int or float.

    Returns:
        The age-group label as a string, or None when ``age`` is NaN.
    """
    # NaN is the only value that is not equal to itself. Without this guard
    # every comparison below would be False and a missing age would be
    # labelled '66+'.
    if age != age:
        return None

    # (exclusive upper bound, label) pairs in ascending order
    age_brackets = (
        (19, '0-18'),
        (26, '19-25'),
        (36, '26-35'),
        (46, '36-45'),
        (56, '46-55'),
        (66, '56-65'),
    )
    for upper_bound, label in age_brackets:
        if age < upper_bound:
            return label
    return '66+'


# Label every customer's age, then store the labels as a new column
age_group_labels = customers_df['age'].apply(create_age_group)
customers_df['age_group'] = age_group_labels

In [33]:
# Preview the engineered frame (head() shows 5 rows by default)
customers_df.head()

Unnamed: 0,customer_id,club_member_status,age,postal_code,age_group
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,ACTIVE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...,46-55
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,ACTIVE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...,19-25
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,ACTIVE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...,19-25
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,ACTIVE,54.0,5d36574f52495e81f019b680c843c443bd343d5ca5b1c2...,46-55
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,ACTIVE,52.0,25fa5ddee9aac01b35208d01736e57942317d756b32ddd...,46-55


In [34]:
# (rows, columns) of the frame after dropping null ages and adding age_group
customers_df.shape

(1356119, 5)

### Save the Engineered Features
Lastly, we pickle it to save the processed customers data

In [30]:
# Create the output folder first: to_pickle does not create missing parent
# directories, so the save would fail on a fresh checkout without it.
processed_data_path = data_path / 'processed'
processed_data_path.mkdir(parents=True, exist_ok=True)
customers_df.to_pickle(processed_data_path / 'customers.pkl')