# Feature Engineering

In [1]:
#load package
import pandas as pd
import seaborn as sns
import numpy as np
import plotly.express as px
import warnings
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score,precision_score, recall_score, f1_score,confusion_matrix

In [2]:
# Show every column when a DataFrame is rendered (no column truncation)
pd.options.display.max_columns = None

In [3]:
# Read the duplicate-free transactions from Parquet into a DataFrame
df = pd.read_parquet(path='transaction_no_duplicate.parquet')

In [4]:
# Render the loaded frame as the cell output (pandas truncates the middle rows)
df

Unnamed: 0,accountNumber,customerId,creditLimit,availableMoney,transactionDateTime,transactionAmount,merchantName,acqCountry,merchantCountryCode,posEntryMode,posConditionCode,merchantCategoryCode,currentExpDate,accountOpenDate,dateOfLastAddressChange,cardCVV,enteredCVV,cardLast4Digits,transactionType,currentBalance,cardPresent,expirationDateKeyInMatch,isFraud
0,737265056,737265056,5000,5000.00,2016-08-13 14:27:32,98.55,Uber,US,US,02,01,rideshare,2023-06-01,2015-03-14,2015-03-14,414,414,1803,PURCHASE,0.00,False,False,False
1,737265056,737265056,5000,5000.00,2016-10-11 05:05:54,74.51,AMC #191138,US,US,09,01,entertainment,2024-02-01,2015-03-14,2015-03-14,486,486,767,PURCHASE,0.00,True,False,False
2,737265056,737265056,5000,5000.00,2016-11-08 09:18:39,7.47,Play Store,US,US,09,01,mobileapps,2025-08-01,2015-03-14,2015-03-14,486,486,767,PURCHASE,0.00,False,False,False
3,737265056,737265056,5000,5000.00,2016-12-10 02:14:50,7.47,Play Store,US,US,09,01,mobileapps,2025-08-01,2015-03-14,2015-03-14,486,486,767,PURCHASE,0.00,False,False,False
4,830329091,830329091,5000,5000.00,2016-03-24 21:04:46,71.18,Tim Hortons #947751,US,US,02,01,fastfood,2029-10-01,2015-08-06,2015-08-06,885,885,3143,PURCHASE,0.00,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
786358,732852505,732852505,50000,48904.96,2016-12-22 18:44:12,119.92,Lyft,US,US,90,01,rideshare,2022-12-01,2012-08-23,2012-08-23,936,936,3783,PURCHASE,1095.04,False,False,False
786359,732852505,732852505,50000,48785.04,2016-12-25 16:20:34,18.89,hulu.com,US,US,09,01,online_subscriptions,2023-08-01,2012-08-23,2012-08-23,939,939,3388,PURCHASE,1214.96,False,False,False
786360,732852505,732852505,50000,48766.15,2016-12-27 15:46:24,49.43,Lyft,US,US,02,01,rideshare,2025-08-01,2012-08-23,2012-08-23,936,936,3783,PURCHASE,1233.85,False,False,False
786361,732852505,732852505,50000,48716.72,2016-12-29 00:30:55,49.89,walmart.com,US,US,09,99,online_retail,2022-07-01,2012-08-23,2012-08-23,939,939,3388,PURCHASE,1283.28,False,False,False


In [5]:
# Report rows x columns (the row count is lower because we start from the de-duplicated transactions)
"Dataset Dimensions: {}x{}".format(*df.shape)

'Dataset Dimensions: 740001x23'

In [6]:
# Count missing values per column (fully-null columns were already removed in previous steps)
df.isna().sum()

accountNumber                  0
customerId                     0
creditLimit                    0
availableMoney                 0
transactionDateTime            0
transactionAmount              0
merchantName                   0
acqCountry                  4289
merchantCountryCode          686
posEntryMode                3814
posConditionCode             383
merchantCategoryCode           0
currentExpDate                 0
accountOpenDate                0
dateOfLastAddressChange        0
cardCVV                        0
enteredCVV                     0
cardLast4Digits                0
transactionType              680
currentBalance                 0
cardPresent                    0
expirationDateKeyInMatch       0
isFraud                        0
dtype: int64

In [7]:
# Columns whose missing entries are replaced by the literal string 'Missing'
columns_to_fill_with_missing = ['acqCountry', 'posEntryMode', 'merchantCountryCode', 'transactionType', 'posConditionCode']

# Fill all of them in one multi-column assignment instead of looping column by column
df[columns_to_fill_with_missing] = df[columns_to_fill_with_missing].fillna('Missing')

# Drop Columns

**accountNumber and customerId match 1:1, so one of them can be dropped.**

In [8]:
# accountNumber duplicates customerId, so keep only customerId
df = df.drop(columns=["accountNumber"])

**Generate a `matchCVV` flag that is True when cardCVV equals enteredCVV.**

In [9]:
# True where the CVV typed at checkout equals the CVV on the card
df["matchCVV"] = df["cardCVV"].eq(df["enteredCVV"])

In [10]:
# The raw CVV columns are no longer needed once matchCVV exists
df = df.drop(columns=["cardCVV", "enteredCVV"])

# Add New Features

To prevent data leakage, we use solely historical data to build relevant features, particularly those related to fraud history.

# Time related Features

## Age of account

In [11]:
# Whole days elapsed between account opening and the transaction
df["accountAge"] = df["transactionDateTime"].sub(df["accountOpenDate"]).dt.days

## Time since address was changed

In [12]:
# Whole days elapsed between the last address change and the transaction
df["sinceDateOfLastAddressChange"] = df["transactionDateTime"].sub(df["dateOfLastAddressChange"]).dt.days

## Months between currentExpDate and transactionDateTime

In [13]:
# Calendar months from the transaction's month to the card's expiry month
# (year difference converted to months, plus the month-of-year difference)
df['monthsToExpiration'] = (
    12 * (df['currentExpDate'].dt.year - df['transactionDateTime'].dt.year)
    + (df['currentExpDate'].dt.month - df['transactionDateTime'].dt.month)
)

## General date properties

In [14]:
# Calendar components of the transaction timestamp: (new column, .dt attribute)
for feature_name, dt_attribute in (
    ('transactionMonth', 'month'),
    ('transactionDay', 'day'),
    ('transactionHour', 'hour'),
    ('transactionWeekday', 'weekday'),
):
    df[feature_name] = getattr(df['transactionDateTime'].dt, dt_attribute)

## Morning, afternoon, evening, night

In [15]:
# morning, afternoon, evening, night
def time_of_day(hour):
    """Map an hour of the day to a coarse day-part label.

    Returns "Morning" for 6 <= hour < 12, "Afternoon" for 12 <= hour < 18,
    "Evening" for 18 <= hour < 24, and "Night" for every other value
    (including values that fail all comparisons, such as NaN).
    """
    day_parts = (
        ("Morning", 6, 12),
        ("Afternoon", 12, 18),
        ("Evening", 18, 24),
    )
    for label, start, stop in day_parts:
        if start <= hour < stop:
            return label
    return "Night"

# Label every transaction with its day part, derived from the hour
df['time_of_day'] = df['transactionHour'].map(time_of_day)



## Peak Hours

In [16]:
# Flag transactions whose hour falls in the peak window 9..18 (both bounds inclusive,
# i.e. 09:00 through 18:59). Series.between is vectorised and inclusive by default,
# so it gives the same booleans as the per-row `9 <= x <= 18` check.
df['is_peak_hour'] = df['transactionHour'].between(9, 18)


## Weekday or Weekend

In [17]:
# Flag weekend transactions (weekday 5 = Saturday, 6 = Sunday).
# A vectorised comparison already yields booleans; no per-row lambda is needed.
df['is_weekend'] = df['transactionWeekday'] >= 5


## Time Since Last Transaction

In [18]:
# Seconds elapsed since the same customer's previous transaction.
# Rows must be in chronological order within each customer for diff() to be meaningful.
df = df.sort_values(by=['customerId', 'transactionDateTime'])
df['transaction_time_diff'] = df.groupby('customerId')['transactionDateTime'].diff().dt.total_seconds()

# A customer's first transaction has no predecessor, so its diff is NaN:
# fill it with that customer's mean gap.
# NOTE(review): the per-customer mean is taken over all of the customer's rows,
# including later transactions, so this fill value is not purely historical.
df['transaction_time_diff'] = df.groupby('customerId')['transaction_time_diff'].transform(
    lambda x: x.fillna(x.mean())
)

# Customers with a single transaction have no gap at all (their mean is NaN):
# fall back to the global mean. fillna is called directly on the column rather than
# through Series.transform, which would first attempt an element-wise apply.
df['transaction_time_diff'] = df['transaction_time_diff'].fillna(df['transaction_time_diff'].mean())


## Minimum time (seconds) between transactions in a 30-day span

In [19]:
def _min_gap_last_30_days(customer_rows):
    """Smallest transaction_time_diff (in seconds) in the trailing 30-day window of one customer."""
    window = customer_rows.rolling("30D", on="transactionDateTime")
    return window["transaction_time_diff"].min()


# Per-customer rolling minimum; the group key level is stripped afterwards so the
# result carries the original row labels and aligns with df on assignment.
df["min_time_diff_month"] = (
    df.groupby("customerId")
    .apply(_min_gap_last_30_days)
    .reset_index("customerId")
    .drop(columns="customerId")
    .sort_index()
    .squeeze()
)

# Customer related Features

## Historical average fraud rate of the customer (expanding average)

By using the shift() function, the code ensures that the current transaction's "isFraud" value is excluded from the expanding (cumulative) average, so each row only sees the customer's earlier transactions.

In [20]:
# Share of the customer's *earlier* transactions that were fraudulent.
# shift() drops the current row's label from the expanding mean, so only history is used.
# transform() returns a result indexed like df, so the assignment aligns row by row
# without depending on whether groupby.apply prepends the group key to the index.
df["avgFraud"] = (
    df.groupby("customerId")["isFraud"]
    .transform(lambda labels: labels.shift().expanding().mean())
    .fillna(0)  # a customer's first transaction has no history -> 0
)


## Fraud History Flag

In [21]:
def _had_earlier_fraud(labels):
    """True for every row that comes after the first fraud in this customer's sequence."""
    frauds_before_row = labels.cumsum().shift(fill_value=0)
    return frauds_before_row > 0


# Mark all of a customer's records that follow their first fraud occurrence
df['customer_fraud_flag'] = df.groupby('customerId')['isFraud'].transform(_had_earlier_fraud)


## Customer Transaction Amount Features

In [22]:
# Express each transaction amount as a fraction of the customer's credit limit
# NOTE(review): a creditLimit of 0 would produce inf/NaN here; confirm none exist.
df['relative_amount'] = df['transactionAmount'].div(df['creditLimit'])


## Customer Transaction Statistics Engineering (rolling statistics over a 30-day window)

In [23]:
def _customer_rolling_amount(frame, stat, window="30D"):
    """Rolling statistic of transactionAmount per customer over a trailing time window.

    Parameters
    ----------
    frame : DataFrame sorted by transactionDateTime within each customerId.
    stat : name of the rolling reduction to call ("min", "max", "mean", "std", ...).
    window : time offset of the trailing window (default 30 days).

    Returns
    -------
    Series carrying frame's original row labels, so it can be assigned back as a column.
    """
    return (
        frame.groupby("customerId")
        .apply(
            lambda rows: getattr(
                rows.rolling(window, on="transactionDateTime")["transactionAmount"], stat
            )()
        )
        .reset_index("customerId")
        .drop("customerId", axis=1)
        .sort_index()
        .squeeze()
    )


# Sort the data by customerId and transactionDateTime
df = df.sort_values(['customerId', 'transactionDateTime'])

# Rolling min, max, mean and std of the amount over a 30-day window (current row included)
for stat_name in ("min", "max", "mean", "std"):
    df[f"rolling_{stat_name}_month"] = _customer_rolling_amount(df, stat_name)

# Change in amount versus the same customer's previous transaction
df['transaction_rolling_diff'] = df.groupby('customerId')['transactionAmount'].diff()

# rolling_std_month is NaN when the window holds a single transaction, and
# transaction_rolling_diff is NaN on a customer's first transaction.
# Fill with the customer's own mean first, then with the global mean for customers
# whose values are all NaN.
# NOTE(review): both means are computed over the full data, including later transactions.
for feature in ('rolling_std_month', 'transaction_rolling_diff'):
    df[feature] = df.groupby('customerId')[feature].transform(
        lambda x: x.fillna(x.mean())
    )
    df[feature] = df[feature].fillna(df[feature].mean())

In [24]:
# Peek at the first rows of the customer-level rolling features
df.head()[['customerId', 'transactionDateTime', 'transactionAmount', 'rolling_min_month',
           'rolling_max_month', 'rolling_mean_month', 'rolling_std_month', 'transaction_rolling_diff']]

Unnamed: 0,customerId,transactionDateTime,transactionAmount,rolling_min_month,rolling_max_month,rolling_mean_month,rolling_std_month,transaction_rolling_diff
541900,100088067,2016-01-12 00:59:52,205.13,205.13,205.13,205.13,103.54883,-2.745775
541901,100088067,2016-01-12 19:49:41,46.43,46.43,205.13,125.78,112.217846,-158.7
541902,100088067,2016-01-17 15:41:46,378.67,46.43,378.67,210.076667,166.175228,332.24
541903,100088067,2016-02-02 19:57:18,66.07,46.43,378.67,174.075,153.603226,-312.6
541904,100088067,2016-02-03 04:48:54,141.6,46.43,378.67,167.58,133.814754,75.53


## Customer Transaction Outliers


In [25]:
# Flag amounts more than 3 rolling standard deviations away from the customer's 30-day rolling mean
df['is_outlier_customer'] = (
    df['transactionAmount'].lt(df['rolling_mean_month'] - 3 * df['rolling_std_month'])
    | df['transactionAmount'].gt(df['rolling_mean_month'] + 3 * df['rolling_std_month'])
)


# Merchant related Features

## Merchant Statistics (rolling statistics over a 30-day window)

In [26]:
def _merchant_rolling_amount(frame, stat, window="30D"):
    """Rolling statistic of transactionAmount per merchant over a trailing time window.

    Parameters
    ----------
    frame : DataFrame sorted by transactionDateTime within each merchantName.
    stat : name of the rolling reduction to call ("min", "max", "mean", "std", ...).
    window : time offset of the trailing window (default 30 days).

    Returns
    -------
    Series carrying frame's original row labels, so it can be assigned back as a column.
    """
    return (
        frame.groupby("merchantName")
        .apply(
            lambda rows: getattr(
                rows.rolling(window, on="transactionDateTime")["transactionAmount"], stat
            )()
        )
        .reset_index("merchantName")
        .drop("merchantName", axis=1)
        .sort_index()
        .squeeze()
    )


# Sort the data by merchantName and transactionDateTime
df = df.sort_values(['merchantName', 'transactionDateTime'])

# Rolling min, max, mean and std of the amount over a 30-day window for each merchant
for stat_name in ("min", "max", "mean", "std"):
    df[f"merchant_rolling_{stat_name}_month"] = _merchant_rolling_amount(df, stat_name)

# Change in amount versus the previous transaction at the same merchant
df['merchant_transaction_rolling_diff'] = df.groupby('merchantName')['transactionAmount'].diff()

# The rolling std is NaN for single-transaction windows, and the diff is NaN on a
# merchant's first transaction. Fill with the merchant's own mean first, then with
# the global mean for merchants whose values are all NaN.
# NOTE(review): both means are computed over the full data, including later transactions.
for feature in ('merchant_rolling_std_month', 'merchant_transaction_rolling_diff'):
    df[feature] = df.groupby('merchantName')[feature].transform(
        lambda x: x.fillna(x.mean())
    )
    df[feature] = df[feature].fillna(df[feature].mean())

# Verify the new features
df[['merchantName', 'transactionDateTime', 'transactionAmount', 'merchant_rolling_min_month', 
    'merchant_rolling_max_month', 'merchant_rolling_mean_month', 'merchant_rolling_std_month', 
    'merchant_transaction_rolling_diff']].head()


Unnamed: 0,merchantName,transactionDateTime,transactionAmount,merchant_rolling_min_month,merchant_rolling_max_month,merchant_rolling_mean_month,merchant_rolling_std_month,merchant_transaction_rolling_diff
530787,1st BBQ,2016-01-01 01:56:16,78.91,78.91,78.91,78.91,150.270168,0.210786
182990,1st BBQ,2016-01-01 03:50:57,378.92,78.91,378.92,228.915,212.139105,300.01
479512,1st BBQ,2016-01-01 18:46:43,2.42,2.42,378.92,153.416667,199.001244,-376.5
416551,1st BBQ,2016-01-01 19:23:12,134.84,2.42,378.92,148.7725,162.749101,132.42
297574,1st BBQ,2016-01-01 22:23:06,128.21,2.42,378.92,144.66,141.244524,-6.63


## Merchant outlier transaction 

In [27]:
# Flag amounts more than 3 rolling standard deviations away from the merchant's 30-day rolling mean
df['is_outlier_merchant'] = (
    df['transactionAmount'].lt(df['merchant_rolling_mean_month'] - 3 * df['merchant_rolling_std_month'])
    | df['transactionAmount'].gt(df['merchant_rolling_mean_month'] + 3 * df['merchant_rolling_std_month'])
)

## Merchant Fraud (expanding average)

By using the shift() function, the code ensures that the current transaction's "isFraud" value is excluded from the expanding (cumulative) average, so each row only sees the merchant's earlier transactions.

In [28]:
# Share of the merchant's *earlier* transactions that were fraudulent.
# shift() drops the current row's label from the expanding mean, so only history is used.
# transform() returns a result indexed like df, so the assignment aligns row by row
# without depending on whether groupby.apply prepends the group key to the index.
df["MerchantavgFraud"] = (
    df.groupby("merchantName")["isFraud"]
    .transform(lambda labels: labels.shift().expanding().mean())
    .fillna(0)  # a merchant's first transaction has no history -> 0
)

## Merchant Popularity

In [29]:
# Number of transactions recorded for each merchant, broadcast to every row of that merchant.
# NOTE: counted over the whole dataset, so it includes transactions later than the current row.
amounts_by_merchant = df.groupby('merchantName')['transactionAmount']
df['merchant_transaction_count'] = amounts_by_merchant.transform('count')

## Merchant Diversity

In [30]:
# Number of distinct merchants each customer has transacted with, broadcast to the customer's rows.
# NOTE: computed over the whole dataset, so it includes merchants first seen after the current row.
merchants_by_customer = df.groupby('customerId')['merchantName']
df['unique_merchants'] = merchants_by_customer.transform('nunique')

# Other Features

## Customer-Merchant Pair Frequency

In [31]:
# Number of transactions for each (customer, merchant) pair, broadcast to every row of the pair
amounts_by_pair = df.groupby(['customerId', 'merchantName'])['transactionAmount']
df['customer_merchant_count'] = amounts_by_pair.transform('count')


## Rare Interaction


In [32]:
# True when the customer has fewer than 2 transactions with this merchant
df['rare_interaction'] = df['customer_merchant_count'].lt(2)

## Missing value feature

**I believe that the missing values in acqCountry, posEntryMode, merchantCountryCode, transactionType, and posConditionCode might contain information related to fraud.**

In [33]:
# Columns in which a missing value was encoded as the literal string "Missing"
columns_to_check = ['acqCountry', 'posEntryMode', 'merchantCountryCode', 'transactionType', 'posConditionCode']

# Boolean flag: True when at least one of those columns equals "Missing" for the row.
# eq(...).any(axis=1) is the vectorised equivalent of testing every row with a Python
# generator, and avoids a row-wise apply over the whole frame.
df['is_missing_flag'] = df[columns_to_check].eq("Missing").any(axis=1)


In [34]:
# Render the feature-enriched frame as the cell output (pandas truncates the middle rows)
df

Unnamed: 0,customerId,creditLimit,availableMoney,transactionDateTime,transactionAmount,merchantName,acqCountry,merchantCountryCode,posEntryMode,posConditionCode,merchantCategoryCode,currentExpDate,accountOpenDate,dateOfLastAddressChange,cardLast4Digits,transactionType,currentBalance,cardPresent,expirationDateKeyInMatch,isFraud,matchCVV,accountAge,sinceDateOfLastAddressChange,monthsToExpiration,transactionMonth,transactionDay,transactionHour,transactionWeekday,time_of_day,is_peak_hour,is_weekend,transaction_time_diff,min_time_diff_month,avgFraud,customer_fraud_flag,relative_amount,rolling_min_month,rolling_max_month,rolling_mean_month,rolling_std_month,transaction_rolling_diff,is_outlier_customer,merchant_rolling_min_month,merchant_rolling_max_month,merchant_rolling_mean_month,merchant_rolling_std_month,merchant_transaction_rolling_diff,is_outlier_merchant,MerchantavgFraud,merchant_transaction_count,unique_merchants,customer_merchant_count,rare_interaction,is_missing_flag
530787,332699562,7500,7500.00,2016-01-01 01:56:16,78.91,1st BBQ,US,US,09,01,food,2026-09-01,2015-05-14,2015-05-14,3271,PURCHASE,0.00,True,False,False,True,232,232,128,1,1,1,4,Night,False,False,563305.800000,563305.800000,0.000000,False,0.010521,78.91,78.91,78.910000,123.686859,-0.041455,False,78.91,78.91,78.910000,150.270168,0.210786,False,0.000000,777,29,1,True,False
182990,248663206,5000,5000.00,2016-01-01 03:50:57,378.92,1st BBQ,US,US,05,08,food,2026-08-01,2013-05-27,2013-05-27,5485,PURCHASE,0.00,True,False,False,True,949,949,127,1,1,3,4,Night,False,False,15786.712425,15786.712425,0.000000,False,0.075784,378.92,378.92,378.920000,141.922028,-0.137039,False,78.91,378.92,228.915000,212.139105,300.010000,False,0.000000,777,475,10,False,False
479512,302778428,10000,10000.00,2016-01-01 18:46:43,2.42,1st BBQ,US,US,05,08,food,2028-11-01,2013-10-08,2013-10-08,7370,PURCHASE,0.00,True,False,False,True,815,815,154,1,1,18,4,Evening,True,False,587404.886792,587404.886792,0.000000,False,0.000242,2.42,2.42,2.420000,132.489609,0.665094,False,2.42,378.92,153.416667,199.001244,-376.500000,False,0.000000,777,47,2,False,False
416551,568638109,50000,50000.00,2016-01-01 19:23:12,134.84,1st BBQ,US,US,05,08,food,2022-03-01,2014-12-22,2014-12-22,4685,PURCHASE,0.00,True,False,False,True,375,375,74,1,1,19,4,Evening,False,False,617382.380000,617382.380000,0.000000,False,0.002697,134.84,134.84,134.840000,116.196776,-1.201000,False,2.42,378.92,148.772500,162.749101,132.420000,False,0.000000,777,40,1,True,False
297574,747568560,10000,9985.71,2016-01-01 22:23:06,128.21,1st BBQ,US,US,02,01,food,2030-06-01,2015-07-08,2015-07-08,1300,PURCHASE,14.29,True,False,False,True,177,177,173,1,1,22,4,Evening,False,False,6278.000000,6278.000000,0.000000,False,0.012821,14.29,128.21,71.250000,80.553605,113.920000,False,2.42,378.92,144.660000,141.244524,-6.630000,False,0.000000,777,192,1,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
778053,198092241,7500,2811.33,2016-12-26 21:55:07,5.80,williamssonoma.com,US,US,05,01,furniture,2029-10-01,2014-12-14,2016-12-02,3156,PURCHASE,4688.67,True,False,False,True,743,24,154,12,26,21,0,Evening,False,False,40382.000000,84.000000,0.012188,True,0.000773,0.00,806.32,122.153974,126.488999,-59.070000,False,0.00,639.92,131.321892,148.001549,-35.690000,False,0.004255,475,52,126,False,False
608159,599847715,2500,1000.34,2016-12-27 02:16:57,0.00,williamssonoma.com,US,US,80,01,furniture,2023-06-01,2014-04-03,2016-12-21,9104,ADDRESS_VERIFICATION,1499.66,False,False,False,True,999,6,78,12,27,2,1,Night,False,False,21783.000000,53.000000,0.010343,True,0.000000,0.00,1470.26,164.640935,199.883869,-723.030000,False,0.00,639.92,127.866053,147.533974,-5.800000,False,0.004246,475,71,52,False,False
608162,599847715,2500,844.56,2016-12-27 06:48:55,27.41,williamssonoma.com,US,US,02,01,furniture,2020-11-01,2014-04-03,2016-12-21,9104,PURCHASE,1655.44,True,False,False,True,999,6,47,12,27,6,1,Morning,False,False,6001.000000,53.000000,0.010323,True,0.010964,0.00,1470.26,163.061560,198.969898,-11.170000,False,0.00,639.92,118.812632,142.658222,27.410000,False,0.004237,475,71,52,False,False
13223,393869032,15000,6188.68,2016-12-27 19:21:15,67.62,williamssonoma.com,US,US,05,08,furniture,2032-10-01,2011-05-14,2016-04-18,3261,PURCHASE,8811.32,True,False,False,True,2054,253,190,12,27,19,1,Evening,False,False,181978.000000,2689.000000,0.021277,True,0.004508,5.17,222.01,88.400000,79.579252,-64.570000,False,0.00,639.92,119.615676,143.401419,40.210000,False,0.004228,475,39,9,False,False


In [35]:
# Per-column count of missing values after feature engineering
df.isna().sum()

customerId                           0
creditLimit                          0
availableMoney                       0
transactionDateTime                  0
transactionAmount                    0
merchantName                         0
acqCountry                           0
merchantCountryCode                  0
posEntryMode                         0
posConditionCode                     0
merchantCategoryCode                 0
currentExpDate                       0
accountOpenDate                      0
dateOfLastAddressChange              0
cardLast4Digits                      0
transactionType                      0
currentBalance                       0
cardPresent                          0
expirationDateKeyInMatch             0
isFraud                              0
matchCVV                             0
accountAge                           0
sinceDateOfLastAddressChange         0
monthsToExpiration                   0
transactionMonth                     0
transactionDay           

# Saving

In [36]:
# DataFrame.info() prints its summary itself and returns None;
# wrapping it in print() would append a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 740001 entries, 530787 to 750392
Data columns (total 54 columns):
 #   Column                             Non-Null Count   Dtype         
---  ------                             --------------   -----         
 0   customerId                         740001 non-null  int64         
 1   creditLimit                        740001 non-null  int64         
 2   availableMoney                     740001 non-null  float64       
 3   transactionDateTime                740001 non-null  datetime64[ns]
 4   transactionAmount                  740001 non-null  float64       
 5   merchantName                       740001 non-null  object        
 6   acqCountry                         740001 non-null  object        
 7   merchantCountryCode                740001 non-null  object        
 8   posEntryMode                       740001 non-null  object        
 9   posConditionCode                   740001 non-null  object        
 10  merchantCategoryCode

In [37]:
# Persist the column dtypes next to the data so they can be inspected or reapplied on load
df.dtypes.to_csv(path_or_buf="transactions_features_dtypes.csv")
# Write the engineered feature table to Parquet using the pyarrow engine
df.to_parquet(path='transactions_features.parquet', engine='pyarrow')