## Strategy

#### INFO

- End Columns: 35
- Extracted Features: 9
    - Hour
    - DayOfWeek
    - MonthOfYear
    - Is Weekend
    - Account Age (Days)
    - Debt to Income Ratio
    - Credit Utilization
    - Age Group
    - Is Retired

In [1]:
import pandas as pd
import numpy as np

# Read the cleaned transactions table from its parquet file and preview it.
# NOTE(review): the saved preview for this cell shows five rows that agree on
# every visible field - presumably an upstream join repeats each transaction;
# verify against the cleaning step.
clean_transactions_path = "../0 - Data/2 - Clean/clean_transactions.pq"
original_df = pd.read_parquet(clean_transactions_path)
original_df.head()

Unnamed: 0,User,Card,Year,Month,Day,Amount,Use Chip,Merchant Name,Merchant City,Merchant State,...,Zipcode,Latitude,Longitude,Per Capita Income - Zipcode,Yearly Income - Person,Total Debt,FICO Score,Num Credit Cards,Datetime,Is Fraud
0,0,0,2018,1,2,130.95,Chip Transaction,5817218446178736267,La Verne,CA,...,91750,34.15,-117.76,29278.0,59696.0,127613.0,787,5,2018-01-02 06:28:00,0
1,0,0,2018,1,2,130.95,Chip Transaction,5817218446178736267,La Verne,CA,...,91750,34.15,-117.76,29278.0,59696.0,127613.0,787,5,2018-01-02 06:28:00,0
2,0,0,2018,1,2,130.95,Chip Transaction,5817218446178736267,La Verne,CA,...,91750,34.15,-117.76,29278.0,59696.0,127613.0,787,5,2018-01-02 06:28:00,0
3,0,0,2018,1,2,130.95,Chip Transaction,5817218446178736267,La Verne,CA,...,91750,34.15,-117.76,29278.0,59696.0,127613.0,787,5,2018-01-02 06:28:00,0
4,0,0,2018,1,2,130.95,Chip Transaction,5817218446178736267,La Verne,CA,...,91750,34.15,-117.76,29278.0,59696.0,127613.0,787,5,2018-01-02 06:28:00,0


#### Drop Unnecessary Columns

In [2]:
# Columns removed from the loaded frame before any feature is derived.
columns_to_drop = [
    "User",
    "Card",
    "Merchant Name",
    "Merchant State",
    "Merchant City",
    "Person",
    "Zip",
    "CARD INDEX",
    "Card Number",
    "CVV",
    "Expires",
    "Address",
    "Apartment",
    "City",
    "State",
    "Zipcode",
    "Errors?",
    "Card on Dark Web",
]

# `drop` returns a new frame, so `original_df` keeps all of its columns.
df = original_df.drop(columns=columns_to_drop)
df.head()

Unnamed: 0,Year,Month,Day,Amount,Use Chip,MCC,Card Brand,Card Type,Has Chip,Cards Issued,...,Gender,Latitude,Longitude,Per Capita Income - Zipcode,Yearly Income - Person,Total Debt,FICO Score,Num Credit Cards,Datetime,Is Fraud
0,2018,1,2,130.95,Chip Transaction,5912,Visa,Debit,1,2,...,Female,34.15,-117.76,29278.0,59696.0,127613.0,787,5,2018-01-02 06:28:00,0
1,2018,1,2,130.95,Chip Transaction,5912,Visa,Debit,1,2,...,Female,34.15,-117.76,29278.0,59696.0,127613.0,787,5,2018-01-02 06:28:00,0
2,2018,1,2,130.95,Chip Transaction,5912,Visa,Debit,1,2,...,Female,34.15,-117.76,29278.0,59696.0,127613.0,787,5,2018-01-02 06:28:00,0
3,2018,1,2,130.95,Chip Transaction,5912,Visa,Credit,0,1,...,Female,34.15,-117.76,29278.0,59696.0,127613.0,787,5,2018-01-02 06:28:00,0
4,2018,1,2,130.95,Chip Transaction,5912,Mastercard,Debit (Prepaid),1,1,...,Female,34.15,-117.76,29278.0,59696.0,127613.0,787,5,2018-01-02 06:28:00,0


#### Time Features Extraction

In [3]:
# Derive calendar features from the transaction timestamp.
transaction_time = df['Datetime']
df['Hour'] = transaction_time.dt.hour
df['MonthOfYear'] = transaction_time.dt.month
df['DayOfWeek'] = transaction_time.dt.dayofweek  # 0 is Monday, 6 is Sunday

# Weekend flag as 0/1 integers (Saturday = 5, Sunday = 6). The comparison is
# vectorized instead of calling a Python lambda once per row; the resulting
# values are the same.
df['Is Weekend'] = (df['DayOfWeek'] >= 5).astype('int64')
df.head()

Unnamed: 0,Year,Month,Day,Amount,Use Chip,MCC,Card Brand,Card Type,Has Chip,Cards Issued,...,Total Debt,FICO Score,Num Credit Cards,Datetime,Is Fraud,Transaction Hour,Transaction Day,Transaction Month,Transaction Day of Week,Is Weekend
0,2018,1,2,130.95,Chip Transaction,5912,Visa,Debit,1,2,...,127613.0,787,5,2018-01-02 06:28:00,0,6,2,1,1,0
1,2018,1,2,130.95,Chip Transaction,5912,Visa,Debit,1,2,...,127613.0,787,5,2018-01-02 06:28:00,0,6,2,1,1,0
2,2018,1,2,130.95,Chip Transaction,5912,Visa,Debit,1,2,...,127613.0,787,5,2018-01-02 06:28:00,0,6,2,1,1,0
3,2018,1,2,130.95,Chip Transaction,5912,Visa,Credit,0,1,...,127613.0,787,5,2018-01-02 06:28:00,0,6,2,1,1,0
4,2018,1,2,130.95,Chip Transaction,5912,Mastercard,Debit (Prepaid),1,1,...,127613.0,787,5,2018-01-02 06:28:00,0,6,2,1,1,0


#### Account Age

In [4]:
# Make sure the account opening date is a datetime column (already-parsed
# values pass through pd.to_datetime unchanged).
account_opened = pd.to_datetime(df['Acct Open Date'])
df['Acct Open Date'] = account_opened

# Whole days elapsed between account opening and the transaction timestamp.
time_since_opening = df['Datetime'] - account_opened
df['Account Age (Days)'] = time_since_opening.dt.days

df.head()

Unnamed: 0,Year,Month,Day,Amount,Use Chip,MCC,Card Brand,Card Type,Has Chip,Cards Issued,...,FICO Score,Num Credit Cards,Datetime,Is Fraud,Transaction Hour,Transaction Day,Transaction Month,Transaction Day of Week,Is Weekend,Account Age (Days)
0,2018,1,2,130.95,Chip Transaction,5912,Visa,Debit,1,2,...,787,5,2018-01-02 06:28:00,0,6,2,1,1,0,5602
1,2018,1,2,130.95,Chip Transaction,5912,Visa,Debit,1,2,...,787,5,2018-01-02 06:28:00,0,6,2,1,1,0,1372
2,2018,1,2,130.95,Chip Transaction,5912,Visa,Debit,1,2,...,787,5,2018-01-02 06:28:00,0,6,2,1,1,0,5299
3,2018,1,2,130.95,Chip Transaction,5912,Visa,Credit,0,1,...,787,5,2018-01-02 06:28:00,0,6,2,1,1,0,5480
4,2018,1,2,130.95,Chip Transaction,5912,Mastercard,Debit (Prepaid),1,1,...,787,5,2018-01-02 06:28:00,0,6,2,1,1,0,3410


#### Debt to Income Ratio

In [5]:
# Ratio of the cardholder's total debt to their yearly income.
# NOTE(review): a yearly income of 0 produces inf (or NaN for 0 / 0) here,
# whereas the credit-utilization cell guards its zero denominator - confirm
# whether zero incomes occur and how they should be treated.
total_debt = df['Total Debt']
yearly_income = df['Yearly Income - Person']
df['Debt to Income Ratio'] = total_debt.div(yearly_income)
df.head()

Unnamed: 0,Year,Month,Day,Amount,Use Chip,MCC,Card Brand,Card Type,Has Chip,Cards Issued,...,Num Credit Cards,Datetime,Is Fraud,Transaction Hour,Transaction Day,Transaction Month,Transaction Day of Week,Is Weekend,Account Age (Days),Debt to Income Ratio
0,2018,1,2,130.95,Chip Transaction,5912,Visa,Debit,1,2,...,5,2018-01-02 06:28:00,0,6,2,1,1,0,5602,2.137714
1,2018,1,2,130.95,Chip Transaction,5912,Visa,Debit,1,2,...,5,2018-01-02 06:28:00,0,6,2,1,1,0,1372,2.137714
2,2018,1,2,130.95,Chip Transaction,5912,Visa,Debit,1,2,...,5,2018-01-02 06:28:00,0,6,2,1,1,0,5299,2.137714
3,2018,1,2,130.95,Chip Transaction,5912,Visa,Credit,0,1,...,5,2018-01-02 06:28:00,0,6,2,1,1,0,5480,2.137714
4,2018,1,2,130.95,Chip Transaction,5912,Mastercard,Debit (Prepaid),1,1,...,5,2018-01-02 06:28:00,0,6,2,1,1,0,3410,2.137714


#### Credit Utilization

In [6]:
# Share of the card's credit limit taken up by this single transaction.
# Rows whose credit limit is 0 get a fixed value of 1 rather than the result
# of a division by zero.
credit_limit = df['Credit Limit']
limit_is_zero = credit_limit == 0
amount_over_limit = df['Amount'] / credit_limit
df['Credit Utilization'] = np.where(limit_is_zero, 1, amount_over_limit)
df.head()

Unnamed: 0,Year,Month,Day,Amount,Use Chip,MCC,Card Brand,Card Type,Has Chip,Cards Issued,...,Datetime,Is Fraud,Transaction Hour,Transaction Day,Transaction Month,Transaction Day of Week,Is Weekend,Account Age (Days),Debt to Income Ratio,Credit Utilization
0,2018,1,2,130.95,Chip Transaction,5912,Visa,Debit,1,2,...,2018-01-02 06:28:00,0,6,2,1,1,0,5602,2.137714,0.00539
1,2018,1,2,130.95,Chip Transaction,5912,Visa,Debit,1,2,...,2018-01-02 06:28:00,0,6,2,1,1,0,1372,2.137714,0.005961
2,2018,1,2,130.95,Chip Transaction,5912,Visa,Debit,1,2,...,2018-01-02 06:28:00,0,6,2,1,1,0,5299,2.137714,0.002821
3,2018,1,2,130.95,Chip Transaction,5912,Visa,Credit,0,1,...,2018-01-02 06:28:00,0,6,2,1,1,0,5480,2.137714,0.01056
4,2018,1,2,130.95,Chip Transaction,5912,Mastercard,Debit (Prepaid),1,1,...,2018-01-02 06:28:00,0,6,2,1,1,0,3410,2.137714,4.676786


#### Age Groups

In [7]:
# Bucket cardholders by 'Current Age'. pd.cut intervals are right-inclusive by
# default: (0, 25], (25, 35], (35, 45], (45, 60], (60, inf).
# The last edge is open-ended so that ages above 100 receive the '60+' label
# instead of falling outside every bin and silently becoming NaN.
age_bin_edges = [0, 25, 35, 45, 60, np.inf]
age_bin_labels = ['18-25', '26-35', '36-45', '46-60', '60+']
df['Age Group'] = pd.cut(df['Current Age'], bins=age_bin_edges, labels=age_bin_labels)
df.head()

Unnamed: 0,Year,Month,Day,Amount,Use Chip,MCC,Card Brand,Card Type,Has Chip,Cards Issued,...,Is Fraud,Transaction Hour,Transaction Day,Transaction Month,Transaction Day of Week,Is Weekend,Account Age (Days),Debt to Income Ratio,Credit Utilization,Age Group
0,2018,1,2,130.95,Chip Transaction,5912,Visa,Debit,1,2,...,0,6,2,1,1,0,5602,2.137714,0.00539,46-60
1,2018,1,2,130.95,Chip Transaction,5912,Visa,Debit,1,2,...,0,6,2,1,1,0,1372,2.137714,0.005961,46-60
2,2018,1,2,130.95,Chip Transaction,5912,Visa,Debit,1,2,...,0,6,2,1,1,0,5299,2.137714,0.002821,46-60
3,2018,1,2,130.95,Chip Transaction,5912,Visa,Credit,0,1,...,0,6,2,1,1,0,5480,2.137714,0.01056,46-60
4,2018,1,2,130.95,Chip Transaction,5912,Mastercard,Debit (Prepaid),1,1,...,0,6,2,1,1,0,3410,2.137714,4.676786,46-60


#### Retirement

In [8]:
# Boolean flag: True where the cardholder's current age has reached their
# retirement age.
# NOTE(review): this compares 'Current Age', not the age at the time of the
# transaction - confirm that is the intended definition.
current_age = df['Current Age']
retirement_age = df['Retirement Age']
df['Is Retired'] = retirement_age <= current_age
df.head()

Unnamed: 0,Year,Month,Day,Amount,Use Chip,MCC,Card Brand,Card Type,Has Chip,Cards Issued,...,Transaction Hour,Transaction Day,Transaction Month,Transaction Day of Week,Is Weekend,Account Age (Days),Debt to Income Ratio,Credit Utilization,Age Group,Is Retired
0,2018,1,2,130.95,Chip Transaction,5912,Visa,Debit,1,2,...,6,2,1,1,0,5602,2.137714,0.00539,46-60,False
1,2018,1,2,130.95,Chip Transaction,5912,Visa,Debit,1,2,...,6,2,1,1,0,1372,2.137714,0.005961,46-60,False
2,2018,1,2,130.95,Chip Transaction,5912,Visa,Debit,1,2,...,6,2,1,1,0,5299,2.137714,0.002821,46-60,False
3,2018,1,2,130.95,Chip Transaction,5912,Visa,Credit,0,1,...,6,2,1,1,0,5480,2.137714,0.01056,46-60,False
4,2018,1,2,130.95,Chip Transaction,5912,Mastercard,Debit (Prepaid),1,1,...,6,2,1,1,0,3410,2.137714,4.676786,46-60,False


In [10]:
# List every column of the feature frame as a final check.
# NOTE(review): the saved output for this cell lists 'Transaction Hour',
# 'Transaction Day', 'Transaction Month' and 'Transaction Day of Week', which
# the cells above do not create (they create 'Hour', 'MonthOfYear' and
# 'DayOfWeek'); the output looks stale. This cell is also numbered In [10]
# while the later save cell is In [9] - re-run the notebook top to bottom on a
# fresh kernel.
df.columns

Index(['Year', 'Month', 'Day', 'Amount', 'Use Chip', 'MCC', 'Card Brand',
       'Card Type', 'Has Chip', 'Cards Issued', 'Credit Limit',
       'Acct Open Date', 'Year PIN last Changed', 'Current Age',
       'Retirement Age', 'Birth Year', 'Birth Month', 'Gender', 'Latitude',
       'Longitude', 'Per Capita Income - Zipcode', 'Yearly Income - Person',
       'Total Debt', 'FICO Score', 'Num Credit Cards', 'Datetime', 'Is Fraud',
       'Transaction Hour', 'Transaction Day', 'Transaction Month',
       'Transaction Day of Week', 'Is Weekend', 'Account Age (Days)',
       'Debt to Income Ratio', 'Credit Utilization', 'Age Group',
       'Is Retired'],
      dtype='object')

#### Save the Featured Data

In [9]:
# The save step is currently disabled, so running the notebook does not write
# the feature frame to disk. Uncomment the line below to persist it.
# df.to_parquet("../0 - Data/featured_transactions.pq")