In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('merged-df.csv')


In [2]:
df = df.drop(columns=['user_address','card_has_chip','card_cvv','card_year_pin_last_changed','card_card_on_dark_web'],axis=1)

df.columns
df.shape

(13305915, 31)

In [3]:
df_user = df[['user_id','user_gender','user_current_age','user_birth_year', 'user_yearly_income', 'user_per_capita_income',
               'user_credit_score', 'user_total_debt','user_retirement_age', 'user_birth_month',
               'card_num_cards_issued','user_latitude','user_longitude']]

df_user.describe(include='all')


df_user = (
    df_user.groupby('user_id')
    .agg({
        'user_gender': 'first',
        'user_current_age': 'first',
        'user_birth_year': 'first',
        'user_yearly_income': 'first',
        'user_per_capita_income': 'first',
        'user_credit_score': 'first',
        'user_total_debt': 'first',
        'user_retirement_age': 'first',
        'user_birth_month': 'first',
        'card_num_cards_issued': 'first',
        'user_latitude': 'first',
        'user_longitude': 'first',
        
    })
    .reset_index()
)

In [4]:
# Feature engineering
# year to retirement
df_user['year_to_retirement'] = np.where(
    df_user['user_current_age'] >= df_user['user_retirement_age'],
    0,df_user['user_retirement_age'] - df_user['user_current_age'])

# total_dept to yearly income
df_user['debt_to_yearly_income_ratio'] = (df_user['user_total_debt'] / 
                                          df_user['user_yearly_income']).replace([np.inf, -np.inf], 0)
# income per card
df_user['income_per_card'] = (df_user['user_yearly_income'] / 
                              df_user['card_num_cards_issued']).replace([np.inf, -np.inf], 0)
# multiple cards user
df_user['multi_card_user'] = (df_user['card_num_cards_issued'] > 1).astype(int)

In [5]:
# transaction data
txn_agg = (df.groupby('user_id').agg(
        total_txn_count=('transaction_id', 'count'),
        total_amount=('amount', 'sum'),
        avg_amount=('amount', 'mean'),
        std_amount=('amount', 'std'),
        median_amount=('amount', 'median'),
        first_txn=('date', 'min'),
        last_txn=('date', 'max')
    ).reset_index())

all_df = txn_agg.merge(df_user, on='user_id', how='inner')



In [6]:
all_df.shape

(1219, 24)

Anouksha's Feature

In [7]:
from datetime import datetime

df['date'] = pd.to_datetime(df['date'], errors='coerce')
df['card_acct_open_date'] = pd.to_datetime(df['card_acct_open_date'], errors='coerce')

monthly_txn = df.groupby(['user_id', df['date'].dt.to_period('M')])['amount'].sum().reset_index()
user_monthly_avg = monthly_txn.groupby('user_id')['amount'].mean().rename('avg_monthly_spend')

income_util = (
    user_monthly_avg.to_frame()
    .join(df[['user_id', 'user_yearly_income']].drop_duplicates().set_index('user_id'), how='left')
)
income_util['income_utilization_ratio'] = income_util.apply(
    lambda r: 0 if pd.isna(r['user_yearly_income']) or r['user_yearly_income'] == 0 
    else (r['avg_monthly_spend'] / (r['user_yearly_income'] / 12)), axis=1
)
income_util['income_utilization_ratio'] = income_util['income_utilization_ratio'].clip(0, 5) 

credit_util = (
    df.groupby('user_id')['amount'].mean().rename('avg_txn_amount')
    .to_frame()
    .join(df[['user_id', 'card_credit_limit']].drop_duplicates().set_index('user_id'), how='left')
)
credit_util['credit_utilization_ratio'] = credit_util.apply(
    lambda r: 0 if pd.isna(r['card_credit_limit']) or r['card_credit_limit'] == 0 
    else r['avg_txn_amount'] / r['card_credit_limit'], axis=1
)
credit_util['credit_utilization_ratio'] = credit_util['credit_utilization_ratio'].clip(0, 1)


credit_util_user = (
    credit_util
      .groupby(level=0)['credit_utilization_ratio']
      .agg(
          credit_util_mean='mean',    
          credit_util_max='max',      
          credit_util_median='median'
      )
)




today_year = datetime.now().year
card_age = df[['user_id', 'card_acct_open_date']].drop_duplicates().set_index('user_id')
card_age['card_age_years'] = (today_year - card_age['card_acct_open_date'].dt.year).clip(lower=0)

# Merge new features together
card_age_user = (
    card_age
      .groupby(level=0)['card_age_years']
      .agg(
          oldest_card_age_years='max',   # longest history
          avg_card_age_years='mean'      # overall maturity
      )
)
feat_income = income_util[['income_utilization_ratio']]

feat_credit = (
    credit_util_user[['credit_util_mean']]
      .rename(columns={'credit_util_mean': 'credit_utilization_ratio'})
)
feat_card_age = card_age_user[['oldest_card_age_years']]
user_index = pd.Index(df['user_id'].unique(), name='user_id')

final_features = (
    pd.DataFrame(index=user_index)
      .join(feat_income, how='left')
      .join(feat_credit, how='left')
      .join(feat_card_age, how='left')
)
final_features

Unnamed: 0_level_0,income_utilization_ratio,credit_utilization_ratio,oldest_card_age_years
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1556,0.481771,0.403053,17
561,0.871597,0.079334,20
1129,2.092357,0.004161,19
430,2.114307,0.084081,25
848,0.697810,0.004444,20
...,...,...,...
569,0.610228,0.006877,9
616,0.769703,0.007023,9
1862,0.926983,0.002574,9
1527,0.874030,0.007073,8


In [8]:
all_df = all_df.merge(final_features, on='user_id', how='inner')

In [9]:
all_df.shape

(1219, 27)

Vi's Feature

In [10]:
df = pd.read_csv('merged-df.csv')

In [11]:
def _map_binary(series):
    truthy = {'y', 'yes', 'true', '1', 't'}
    falsy = {'n', 'no', 'false', '0', 'f'}
    s = series.astype(str).str.strip().str.lower()
    mapped = np.select([s.isin(truthy), s.isin(falsy)], [1, 0], default=np.nan)
    return pd.Series(mapped, index=series.index, dtype='float64')

df['date'] = pd.to_datetime(df['date'], errors='coerce')
df['card_acct_open_date'] = pd.to_datetime(df['card_acct_open_date'], errors='coerce')
df['card_expires'] = pd.to_datetime(df['card_expires'], errors='coerce')

df['use_chip_flag'] = _map_binary(df['use_chip']).fillna(0.0)
df['card_has_chip_flag'] = _map_binary(df['card_has_chip']).fillna(0.0)
df['card_on_dark_web_flag'] = _map_binary(df['card_card_on_dark_web']).fillna(0.0)

df['user_gender_code'], _ = pd.factorize(df['user_gender'])
df['card_brand_code'], _ = pd.factorize(df['card_card_brand'])
df['card_type_code'], _ = pd.factorize(df['card_card_type'])
df['merchant_state_code'], _ = pd.factorize(df['merchant_state'])
df['category_code'], _ = pd.factorize(df['category'])
df['mcc_code'], _ = pd.factorize(df['mcc'])

numeric_columns = [
    'user_current_age', 'user_retirement_age', 'user_birth_year', 'user_birth_month',
    'user_per_capita_income', 'user_yearly_income', 'user_total_debt',
    'user_credit_score', 'user_num_credit_cards', 'card_num_cards_issued',
    'card_credit_limit', 'card_year_pin_last_changed', 'user_latitude', 'user_longitude',
    'zip'
 ]
for col in numeric_columns:
    df[col] = pd.to_numeric(df[col], errors='coerce')

df['amount'] = pd.to_numeric(df['amount'], errors='coerce')
df['card_cvv'] = pd.to_numeric(df['card_cvv'], errors='coerce')

# weekend and spend helpers
df['is_weekend'] = df['date'].dt.weekday >= 5
df['positive_amount'] = df['amount'].clip(lower=0)
df['weekend_positive_amount'] = np.where(df['is_weekend'], df['positive_amount'], 0)

print("Preprocessing complete. Sample of engineered columns:")
df[['use_chip_flag', 'card_has_chip_flag', 'card_on_dark_web_flag', 'card_brand_code', 'card_type_code']].head()

Preprocessing complete. Sample of engineered columns:


Unnamed: 0,use_chip_flag,card_has_chip_flag,card_on_dark_web_flag,card_brand_code,card_type_code
0,0.0,1.0,0.0,0,0
1,0.0,1.0,0.0,0,1
2,0.0,1.0,0.0,0,2
3,0.0,0.0,0.0,0,2
4,0.0,1.0,0.0,1,2


In [12]:
def most_frequent(series):
    if series.empty:
        return np.nan
    mode = series.mode(dropna=True)
    return mode.iloc[0] if not mode.empty else np.nan

customer_agg = df.groupby('user_id').agg(
    first_txn_date=('date', 'min'),
    last_txn_date=('date', 'max'),
    txn_count=('transaction_id', 'count'),
    total_spent=('amount', 'sum'),
    avg_spent=('amount', 'mean'),
    max_spent=('amount', 'max'),
    merchant_count=('merchant_id', 'nunique'),
    merchant_city_count=('merchant_city', 'nunique'),
    merchant_state_count=('merchant_state', 'nunique'),
    merchant_zip_count=('zip', 'nunique'),
    mcc_count=('mcc', 'nunique'),
    category_count=('category', 'nunique'),
    unique_cards=('card_id', 'nunique'),
    card_credit_limit=('card_credit_limit', 'first'),
    card_year_pin_last_changed=('card_year_pin_last_changed', 'first'),
    card_acct_open_date=('card_acct_open_date', 'first'),
    card_expires=('card_expires', 'first'),
    user_current_age=('user_current_age', 'first'),
    user_retirement_age=('user_retirement_age', 'first'),
    user_gender_code=('user_gender_code', 'first'),
    user_per_capita_income=('user_per_capita_income', 'first'),
    user_yearly_income=('user_yearly_income', 'first'),
    user_total_debt=('user_total_debt', 'first'),
    user_credit_score=('user_credit_score', 'first'),
    user_num_credit_cards=('user_num_credit_cards', 'first'),
    most_frequent_city=('merchant_city', most_frequent),
    most_frequent_category=('category', most_frequent),
    positive_spend=('positive_amount', 'sum'),
    weekend_positive_spend=('weekend_positive_amount', 'sum')
).reset_index()

customer_agg['first_txn_date'] = pd.to_datetime(customer_agg['first_txn_date'])
customer_agg['last_txn_date'] = pd.to_datetime(customer_agg['last_txn_date'])

reference_date = df['date'].max() + pd.Timedelta(days=1)

customer_agg['recency_days'] = (reference_date - customer_agg['last_txn_date']).dt.days
customer_agg['tenure_days'] = (customer_agg['last_txn_date'] - customer_agg['first_txn_date']).dt.days + 1
customer_agg.loc[customer_agg['tenure_days'] < 1, 'tenure_days'] = 1
customer_agg['txn_per_day'] = customer_agg['txn_count'] / customer_agg['tenure_days']

customer_agg['spend_per_txn'] = customer_agg['total_spent'] / customer_agg['txn_count'].replace(0, np.nan)
customer_agg['merchant_count_per_month'] = customer_agg['merchant_count'] / (customer_agg['tenure_days'] / 30.4375)
customer_agg['merchant_count_per_month'].replace([np.inf, -np.inf], np.nan, inplace=True)

customer_agg['card_account_age_years'] = ((reference_date - customer_agg['card_acct_open_date']).dt.days / 365.25)
customer_agg['card_account_age_years'] = customer_agg['card_account_age_years'].clip(lower=0)
customer_agg['card_time_to_expiry_years'] = ((customer_agg['card_expires'] - reference_date).dt.days / 365.25)
customer_agg['card_time_to_expiry_years'] = customer_agg['card_time_to_expiry_years'].fillna(0)

customer_agg['debt_to_income_ratio'] = customer_agg['user_total_debt'] / customer_agg['user_yearly_income'].replace(0, np.nan)
customer_agg['income_to_debt_ratio'] = customer_agg['user_yearly_income'] / customer_agg['user_total_debt'].replace(0, np.nan)
customer_agg['spend_to_limit_ratio'] = customer_agg['total_spent'] / customer_agg['card_credit_limit'].replace(0, np.nan)

customer_agg['weekend_spend_ratio'] = np.where(
    customer_agg['positive_spend'] > 0,
    customer_agg['weekend_positive_spend'] / customer_agg['positive_spend'],
    0
)

customer_agg.replace([np.inf, -np.inf], np.nan, inplace=True)

credit_score_min, credit_score_max = 300, 850
credit_score_range = credit_score_max - credit_score_min
credit_score_component = 1 - (
    (
        customer_agg['user_credit_score']
        .clip(lower=credit_score_min, upper=credit_score_max)
        - credit_score_min
    ) / credit_score_range
)
credit_score_component = credit_score_component.fillna(0).clip(0, 1)

dti_component = (
    customer_agg['debt_to_income_ratio']
    .fillna(0)
    .clip(lower=0)
    / 1.5
).clip(0, 1)

spend_component = customer_agg['spend_to_limit_ratio'].fillna(0).clip(0, 1)

customer_agg['risk_of_default_score'] = (
    0.5 * credit_score_component
    + 0.3 * dti_component
    + 0.2 * spend_component
).clip(0, 1)

risk_level_bins = [-0.001, 0.33, 0.66, 1.0]
risk_level_labels = ['Low', 'Medium', 'High']
customer_agg['risk_level'] = pd.cut(
    customer_agg['risk_of_default_score'],
    bins=risk_level_bins,
    labels=risk_level_labels,
    include_lowest=True
)
customer_agg['risk_level'] = customer_agg['risk_level'].cat.add_categories(['Unscored']).fillna('Unscored')
customer_agg['risk_level'] = customer_agg['risk_level'].astype(pd.api.types.CategoricalDtype(
    categories=['Low', 'Medium', 'High', 'Unscored'],
    ordered=True
))

customer_agg.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  customer_agg['merchant_count_per_month'].replace([np.inf, -np.inf], np.nan, inplace=True)


Unnamed: 0,user_id,first_txn_date,last_txn_date,txn_count,total_spent,avg_spent,max_spent,merchant_count,merchant_city_count,merchant_state_count,...,spend_per_txn,merchant_count_per_month,card_account_age_years,card_time_to_expiry_years,debt_to_income_ratio,income_to_debt_ratio,spend_to_limit_ratio,weekend_spend_ratio,risk_of_default_score,risk_level
0,0,2010-01-01 13:10:00,2019-10-31 19:14:00,12795,625799.67,48.909705,1128.47,518,288,43,...,48.909705,4.390595,12.167009,0.079398,0.607233,1.646813,35.556799,0.278277,0.400538,Medium
1,1,2010-01-01 11:58:00,2019-10-31 16:15:00,10073,336187.37,33.375099,937.15,189,76,21,...,33.375099,1.601974,12.167009,2.080767,0.321583,3.109618,26.264638,0.289232,0.397044,Medium
2,2,2010-01-01 06:47:00,2019-10-31 16:21:00,10612,291534.27,27.472132,519.02,146,37,10,...,27.472132,1.237503,14.417522,0.999316,2.945677,0.339481,39.396523,0.280503,0.660909,High
3,3,2010-01-02 17:15:00,2019-10-31 14:30:00,6001,280685.46,46.773114,990.2,162,66,23,...,46.773114,1.373885,12.670773,2.16564,0.668969,1.494838,4318.237846,0.283297,0.48743,Medium
4,4,2010-01-01 06:45:00,2019-10-31 21:26:00,15043,595722.36,39.6013,1624.15,449,187,37,...,39.6013,3.805747,10.417522,-5.672827,1.509361,0.662532,30.866444,0.290495,0.621818,Medium


In [13]:
new_features1 = customer_agg[['user_id','weekend_spend_ratio', 'risk_of_default_score', 'risk_level']]

In [14]:
monthly_total_spend = (
    df.set_index('date')['amount']
      .resample('MS')
      .sum()
      .rename('total_monthly_spend')
      .to_frame()
)

monthly_total_spend['monthly_diff'] = monthly_total_spend['total_monthly_spend'].diff().fillna(0)
overall_diff = monthly_total_spend['monthly_diff']
overall_months = monthly_total_spend.index

user_monthly_spend = (
    df.groupby(['user_id', pd.Grouper(key='date', freq='MS')])['amount']
      .sum()
      .reset_index()
      .rename(columns={'date': 'month', 'amount': 'user_monthly_spend'})
)

def _corr_with_overall(group):
    aligned = (
        group.set_index('month')
             .reindex(overall_months, fill_value=0)['user_monthly_spend']
    )
    user_diff = aligned.diff().fillna(0)
    if np.isclose(user_diff.var(), 0) or np.isclose(overall_diff.var(), 0):
        return 0.0
    return user_diff.corr(overall_diff)

monthly_correlations = (
    user_monthly_spend.groupby('user_id')
    .apply(_corr_with_overall)
    .rename('correlation_with_trend')
    .reset_index()
)

corr_threshold = 0.25
monthly_correlations['trend_category'] = np.where(
    monthly_correlations['correlation_with_trend'] >= corr_threshold,
    'Follows Trend',
    'Does Not Follow Trend'
)

trend_summary = (
    monthly_correlations.merge(
        customer_agg[['user_id', 'total_spent']],
        on='user_id',
        how='left'
)
    .groupby('trend_category')
    .agg(
        user_count=('user_id', 'nunique'),
        avg_total_spent=('total_spent', 'mean')
    )
    .round({'avg_total_spent': 2})
    .reindex(['Follows Trend', 'Does Not Follow Trend'])
)

display(monthly_total_spend.reset_index())
display(monthly_correlations.head())
display(trend_summary)

  .apply(_corr_with_overall)


Unnamed: 0,date,total_monthly_spend,monthly_diff
0,2010-01-01,4372532.12,0.00
1,2010-02-01,4103170.24,-269361.88
2,2010-03-01,4539853.38,436683.14
3,2010-04-01,4407951.33,-131902.05
4,2010-05-01,4610601.80,202650.47
...,...,...,...
113,2019-06-01,4960598.66,-88493.78
114,2019-07-01,5043365.33,82766.67
115,2019-08-01,5028081.01,-15284.32
116,2019-09-01,4850069.83,-178011.18


Unnamed: 0,user_id,correlation_with_trend,trend_category
0,0,0.232929,Does Not Follow Trend
1,1,0.285515,Follows Trend
2,2,0.18836,Does Not Follow Trend
3,3,0.139144,Does Not Follow Trend
4,4,0.144842,Does Not Follow Trend


Unnamed: 0_level_0,user_count,avg_total_spent
trend_category,Unnamed: 1_level_1,Unnamed: 2_level_1
Follows Trend,312,588324.41
Does Not Follow Trend,907,428090.74


In [15]:
new_features2 = monthly_correlations[['user_id','correlation_with_trend', 'trend_category']]

In [16]:
all_df = all_df.merge(new_features1, on='user_id', how='inner')

all_df = all_df.merge(new_features2, on='user_id', how='inner')

In [18]:
all_df.to_csv('final_features.csv', index=False)

In [22]:
check_df = pd.read_csv('final_features.csv')

In [26]:
check_df.shape

(1219, 31)