## FE - Strategy - 1

#### INFO

- End Columns: 35
- Extracted Features: 16
    - Hour
    - DayOfWeek
    - MonthOfYear
    - TimeSinceLastTransaction
    - AvgTransactionAmountWeek
    - AvgTransactionAmountMonth
    - AmountToCreditLimitRatio
    - IncomeToSpendingRatioZip
    - IncomeToSpendingRatioPerson
    - DebtToIncomeRatio
    - CardUsageRatio
    - YearsToRetirement
    - Account Age (Days)
    - Age Group
    - Is Retired
    - Bad PIN Error

In [1]:
import pandas as pd
import numpy as np

# Load the cleaned transaction table (Parquet) that this notebook builds features from.
original_df = pd.read_parquet("../0 - Data/2 - Clean/clean_transactions.pq")
# NOTE(review): the previews in this notebook show the same transaction repeated on
# consecutive rows, differing only in card attributes (Card Brand / Card Type /
# Credit Limit). This looks like an upstream join that multiplied rows - confirm,
# because every per-user feature below is computed over those repeated rows.
original_df.head(2)

Unnamed: 0,User,Card,Year,Month,Day,Amount,Use Chip,Merchant Name,Merchant City,Merchant State,...,Zipcode,Latitude,Longitude,Per Capita Income - Zipcode,Yearly Income - Person,Total Debt,FICO Score,Num Credit Cards,Datetime,Is Fraud
0,0,0,2018,1,2,130.95,Chip Transaction,5817218446178736267,La Verne,CA,...,91750,34.15,-117.76,29278.0,59696.0,127613.0,787,5,2018-01-02 06:28:00,0
1,0,0,2018,1,2,130.95,Chip Transaction,5817218446178736267,La Verne,CA,...,91750,34.15,-117.76,29278.0,59696.0,127613.0,787,5,2018-01-02 06:28:00,0


Drop Columns with Low Correlation to Fraud

In [2]:
# Columns removed up front; they are not used by any feature built below.
columns_to_drop = [
    "Merchant City",
    "Merchant State",
    "Year",
    "Month",
    "Day",
    "Person",
    "Zip",
    "CARD INDEX",
    "Card Number",
    "CVV",
    "Expires",
    "Address",
    "Apartment",
    "City",
    "State",
    "Zipcode",
    "Card on Dark Web",
]

# Work on a new frame so the loaded table stays available as `original_df`.
df = original_df.drop(columns=columns_to_drop)
df.head(2)

Unnamed: 0,User,Card,Amount,Use Chip,Merchant Name,MCC,Errors?,Card Brand,Card Type,Has Chip,...,Gender,Latitude,Longitude,Per Capita Income - Zipcode,Yearly Income - Person,Total Debt,FICO Score,Num Credit Cards,Datetime,Is Fraud
0,0,0,130.95,Chip Transaction,5817218446178736267,5912,0,Visa,Debit,1,...,Female,34.15,-117.76,29278.0,59696.0,127613.0,787,5,2018-01-02 06:28:00,0
1,0,0,130.95,Chip Transaction,5817218446178736267,5912,0,Visa,Debit,1,...,Female,34.15,-117.76,29278.0,59696.0,127613.0,787,5,2018-01-02 06:28:00,0


Time Based Features

In [3]:
# Ensure 'Datetime' has a datetime dtype; values that cannot be parsed become NaT.
df['Datetime'] = pd.to_datetime(df['Datetime'], errors='coerce')

# Calendar components of the transaction timestamp.
df = df.assign(
    Hour=df['Datetime'].dt.hour,
    DayOfWeek=df['Datetime'].dt.dayofweek,  # Monday=0, Sunday=6
    MonthOfYear=df['Datetime'].dt.month,
)

Time Since the Last Transaction

In [4]:
# Order every user's rows chronologically so each row can be compared with the one before it.
df = df.sort_values(by=['User', 'Datetime'])

# Gap to the same user's previous row, in minutes.
# Rows without a computable gap (e.g. a user's first row) get 0.
df['TimeSinceLastTransaction'] = (
    df.groupby('User')['Datetime']
    .diff()
    .dt.total_seconds()
    .div(60)
    .fillna(0)
)

df.head(2)

Unnamed: 0,User,Card,Amount,Use Chip,Merchant Name,MCC,Errors?,Card Brand,Card Type,Has Chip,...,Yearly Income - Person,Total Debt,FICO Score,Num Credit Cards,Datetime,Is Fraud,Hour,DayOfWeek,MonthOfYear,TimeSinceLastTransaction
6780,0,3,123.97,Swipe Transaction,2027553650310142703,5541,0,Visa,Debit,1,...,59696.0,127613.0,787,5,2018-01-01 05:36:00,0,5,0,1,0.0
6781,0,3,123.97,Swipe Transaction,2027553650310142703,5541,0,Visa,Debit,1,...,59696.0,127613.0,787,5,2018-01-01 05:36:00,0,5,0,1,0.0


AVG Transaction Amount / Week / Month

In [5]:
def _rolling_amount_mean(frame: pd.DataFrame, window: str) -> pd.Series:
    """Per-user mean of 'Amount' over a trailing calendar window.

    The previous implementation used ``rolling(window=7)`` / ``rolling(window=30)``,
    which averages the last 7 / 30 *rows* regardless of how much time they span.
    A time offset such as ``'7D'`` makes the window cover actual elapsed time,
    which is what the "Week" / "Month" feature names promise.

    Args:
        frame: Table with 'User', 'Datetime' (datetime dtype) and 'Amount' columns.
        window: pandas time offset for the look-back period, e.g. ``'7D'``.

    Returns:
        Series indexed like ``frame`` (rows whose 'Datetime' is NaT are absent and
        therefore become NaN on assignment). The window ends at, and includes,
        the current row.
    """
    # Time-based rolling needs a NaT-free, per-group monotonic time column.
    usable = (
        frame.loc[frame['Datetime'].notna(), ['User', 'Datetime', 'Amount']]
        .sort_values(['User', 'Datetime'])
    )
    rolled = (
        usable.groupby('User')
        .rolling(window, on='Datetime', min_periods=1)['Amount']
        .mean()
    )
    # Drop the 'User' level so the result aligns with the frame's own row index.
    # NOTE(review): alignment on assignment assumes the row index is unique - confirm.
    return rolled.droplevel(0)


df['AvgTransactionAmountWeek'] = _rolling_amount_mean(df, '7D')
df['AvgTransactionAmountMonth'] = _rolling_amount_mean(df, '30D')

df.head(2)

Unnamed: 0,User,Card,Amount,Use Chip,Merchant Name,MCC,Errors?,Card Brand,Card Type,Has Chip,...,FICO Score,Num Credit Cards,Datetime,Is Fraud,Hour,DayOfWeek,MonthOfYear,TimeSinceLastTransaction,AvgTransactionAmountWeek,AvgTransactionAmountMonth
6780,0,3,123.97,Swipe Transaction,2027553650310142703,5541,0,Visa,Debit,1,...,787,5,2018-01-01 05:36:00,0,5,0,1,0.0,123.97,123.97
6781,0,3,123.97,Swipe Transaction,2027553650310142703,5541,0,Visa,Debit,1,...,787,5,2018-01-01 05:36:00,0,5,0,1,0.0,123.97,123.97


Transaction Amount to Credit Card Limit Ratio

In [6]:
def handle_inf(col: str, frame: "pd.DataFrame | None" = None) -> None:
    """Replace non-finite values in a ratio column with finite stand-ins, in place.

    * ``+inf`` becomes the largest finite value in the column.
    * ``-inf`` becomes the smallest finite value in the column (previously it was
      also mapped to the maximum, which flipped the sign of extreme negatives).
    * ``NaN`` becomes the largest finite value, as before.

    If the column holds no finite value at all, the non-finite entries end up NaN.

    Args:
        col: Name of the column to clean.
        frame: DataFrame to modify. Defaults to the notebook-level ``df`` so
            existing calls ``handle_inf("SomeColumn")`` keep working.
    """
    target = df if frame is None else frame
    column = target[col]

    # Bounds computed from finite entries only.
    finite_only = column.replace([np.inf, -np.inf], np.nan)
    upper = finite_only.max()
    lower = finite_only.min()

    cleaned = column.mask(column == np.inf, upper).mask(column == -np.inf, lower)
    target[col] = cleaned.mask(cleaned.isna(), upper)

# Transaction amount as a fraction of the card's credit limit.
df['AmountToCreditLimitRatio'] = df['Amount'].div(df['Credit Limit'])
# A zero credit limit yields a non-finite ratio; the helper swaps those for finite values.
handle_inf(col="AmountToCreditLimitRatio")

Income to Spending Ratio / Zipcode / Person

In [7]:
# Zip-code per-capita income relative to the transaction amount.
df['IncomeToSpendingRatioZip'] = df['Per Capita Income - Zipcode'].div(df['Amount'])
handle_inf(col='IncomeToSpendingRatioZip')

# The cardholder's own yearly income relative to the transaction amount.
df['IncomeToSpendingRatioPerson'] = df['Yearly Income - Person'].div(df['Amount'])
handle_inf(col='IncomeToSpendingRatioPerson')

Debt to Income Ratio

In [8]:
# Total debt relative to the cardholder's yearly income.
df['DebtToIncomeRatio'] = df['Total Debt'] / df['Yearly Income - Person']
# A zero income makes this division non-finite. Route the column through the same
# helper as the other ratio features so no inf values reach the saved feature file.
# The call changes nothing when every ratio is already finite.
handle_inf('DebtToIncomeRatio')

Card Usage Ratio

In [9]:
# Number of credit cards held by the user divided by the cards issued on this account.
df['CardUsageRatio'] = df['Num Credit Cards'].div(df['Cards Issued'])

Years to Retirement

In [10]:
# Years remaining until retirement; negative once the current age exceeds the retirement age.
df['YearsToRetirement'] = df['Retirement Age'].sub(df['Current Age'])

Account Age

In [11]:
# Parse the account opening date so it can be subtracted from the transaction timestamp.
df['Acct Open Date'] = pd.to_datetime(df['Acct Open Date'])

# Whole days between account opening and the transaction.
df['Account Age (Days)'] = df['Datetime'].sub(df['Acct Open Date']).dt.days

Age Groups

In [12]:
# Bucket cardholders by age. pd.cut uses right-closed intervals by default, so the
# buckets are (0, 25], (25, 35], (35, 45], (45, 60] and everything above 60.
# The top edge is open-ended: with the previous cap of 100, anyone older than 100
# fell outside every bin and got NaN instead of '60+'.
df['Age Group'] = pd.cut(
    df['Current Age'],
    bins=[0, 25, 35, 45, 60, np.inf],
    labels=['18-25', '26-35', '36-45', '46-60', '60+'],
)

Retirement

In [13]:
# True when the cardholder has reached (or passed) their retirement age.
df['Is Retired'] = df['Current Age'].ge(df['Retirement Age'])

df.head()

Unnamed: 0,User,Card,Amount,Use Chip,Merchant Name,MCC,Errors?,Card Brand,Card Type,Has Chip,...,AvgTransactionAmountMonth,AmountToCreditLimitRatio,IncomeToSpendingRatioZip,IncomeToSpendingRatioPerson,DebtToIncomeRatio,CardUsageRatio,YearsToRetirement,Account Age (Days),Age Group,Is Retired
6780,0,3,123.97,Swipe Transaction,2027553650310142703,5541,0,Visa,Debit,1,...,123.97,0.005103,236.170041,481.535855,2.137714,2.5,13,5601,46-60,False
6781,0,3,123.97,Swipe Transaction,2027553650310142703,5541,0,Visa,Debit,1,...,123.97,0.005643,236.170041,481.535855,2.137714,2.5,13,1371,46-60,False
6782,0,3,123.97,Swipe Transaction,2027553650310142703,5541,0,Visa,Debit,1,...,123.97,0.002671,236.170041,481.535855,2.137714,2.5,13,5298,46-60,False
6783,0,3,123.97,Swipe Transaction,2027553650310142703,5541,0,Visa,Credit,0,...,123.97,0.009998,236.170041,481.535855,2.137714,5.0,13,5479,46-60,False
6784,0,3,123.97,Swipe Transaction,2027553650310142703,5541,0,Mastercard,Debit (Prepaid),1,...,123.97,4.4275,236.170041,481.535855,2.137714,5.0,13,3409,46-60,False


Is Bad PIN in Errors

In [14]:
# Flag every transaction whose error field mentions a bad PIN.
# Exact equality (`== "Bad PIN"`) only matched rows where that was the sole error;
# containment also catches rows where it appears together with other errors.
# NOTE(review): assumes multiple errors are stored in one string - confirm the format.
# astype(str) keeps the check safe for non-string placeholders such as 0 or NaN.
df['Bad PIN Error'] = df['Errors?'].astype(str).str.contains('Bad PIN', regex=False)

Select the columns to drop

In [15]:
# Display the full column list to help choose which columns to drop in the next cell.
df.columns

Index(['User', 'Card', 'Amount', 'Use Chip', 'Merchant Name', 'MCC', 'Errors?',
       'Card Brand', 'Card Type', 'Has Chip', 'Cards Issued', 'Credit Limit',
       'Acct Open Date', 'Year PIN last Changed', 'Current Age',
       'Retirement Age', 'Birth Year', 'Birth Month', 'Gender', 'Latitude',
       'Longitude', 'Per Capita Income - Zipcode', 'Yearly Income - Person',
       'Total Debt', 'FICO Score', 'Num Credit Cards', 'Datetime', 'Is Fraud',
       'Hour', 'DayOfWeek', 'MonthOfYear', 'TimeSinceLastTransaction',
       'AvgTransactionAmountWeek', 'AvgTransactionAmountMonth',
       'AmountToCreditLimitRatio', 'IncomeToSpendingRatioZip',
       'IncomeToSpendingRatioPerson', 'DebtToIncomeRatio', 'CardUsageRatio',
       'YearsToRetirement', 'Account Age (Days)', 'Age Group', 'Is Retired',
       'Bad PIN Error'],
      dtype='object')

In [16]:
# Remove the columns that are not kept in the final feature table.
df = df.drop(
    columns=[
        "User",
        "Card",
        "Merchant Name",
        "Errors?",
        "Card Brand",
        "Acct Open Date",
        "Year PIN last Changed",
        "Birth Year",
        "Birth Month",
    ]
)
df.head(2)

Unnamed: 0,Amount,Use Chip,MCC,Card Type,Has Chip,Cards Issued,Credit Limit,Current Age,Retirement Age,Gender,...,AmountToCreditLimitRatio,IncomeToSpendingRatioZip,IncomeToSpendingRatioPerson,DebtToIncomeRatio,CardUsageRatio,YearsToRetirement,Account Age (Days),Age Group,Is Retired,Bad PIN Error
6780,123.97,Swipe Transaction,5541,Debit,1,2,24295.0,53,66,Female,...,0.005103,236.170041,481.535855,2.137714,2.5,13,5601,46-60,False,False
6781,123.97,Swipe Transaction,5541,Debit,1,2,21968.0,53,66,Female,...,0.005643,236.170041,481.535855,2.137714,2.5,13,1371,46-60,False,False


#### Save the Featured Data

In [17]:
# Write the engineered feature table to a Parquet file.
# The target directory is not created here, so it has to exist already.
df.to_parquet("../0 - Data/3 - featured/ft_strategy_1.pq")