In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from xverse.transformer import WOE

In [6]:
data = pd.read_csv('cleaned_data.csv')

In [7]:
data.head()

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult,Amount_log,Value_log
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2018-11-15T02:18:49Z,2,0,6.908755,6.908755
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2018-11-15T02:19:08Z,2,0,,3.044522
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2018-11-15T02:44:21Z,2,0,6.216606,6.216606
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2018-11-15T03:32:55Z,2,0,9.903538,9.989711
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2018-11-15T03:34:21Z,2,0,,6.46925


In [8]:
data.isnull().sum()

Unnamed: 0,0
TransactionId,0
BatchId,0
AccountId,0
SubscriptionId,0
CustomerId,0
ProviderId,0
ProductId,0
ProductCategory,0
ChannelId,0
Amount,0


In [9]:
data['Is_Positive_Amount'] = data['Amount'] > 0
data['Amount_log'] = np.where(data['Amount'] > 0, np.log1p(data['Amount']), 0)

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [10]:
data.isnull().sum()

Unnamed: 0,0
TransactionId,0
BatchId,0
AccountId,0
SubscriptionId,0
CustomerId,0
ProviderId,0
ProductId,0
ProductCategory,0
ChannelId,0
Amount,0


In [11]:
# Aggregate Features
agg_features = data.groupby('CustomerId')['Amount'].agg([
    ('Total_Transaction_Amount', 'sum'),
    ('Avg_Transaction_Amount', 'mean'),
    ('Transaction_Count', 'count'),
    ('Std_Transaction_Amount', 'std')
]).reset_index()

In [12]:
data = data.merge(agg_features, on='CustomerId', how='left')

In [13]:
# Convert TransactionStartTime to datetime
data['TransactionStartTime'] = pd.to_datetime(data['TransactionStartTime'])

# Extract features
data['Transaction_Hour'] = data['TransactionStartTime'].dt.hour
data['Transaction_Day'] = data['TransactionStartTime'].dt.day
data['Transaction_Month'] = data['TransactionStartTime'].dt.month
data['Transaction_Year'] = data['TransactionStartTime'].dt.year

In [14]:
# Label Encoding for ordinal encoding
label_encoder = LabelEncoder()
categorical_cols = ['ProviderId', 'ProductCategory', 'ChannelId']
for col in categorical_cols:
    data[col + '_Encoded'] = label_encoder.fit_transform(data[col])

# One-Hot Encoding
data = pd.get_dummies(data, columns=categorical_cols, drop_first=True)

In [15]:
data.head()

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,ProductId,Amount,Value,TransactionStartTime,PricingStrategy,...,ProviderId_ProviderId_5,ProviderId_ProviderId_6,ProductCategory_airtime,ProductCategory_data_bundles,ProductCategory_financial_services,ProductCategory_tv,ProductCategory_utility_bill,ChannelId_ChannelId_2,ChannelId_ChannelId_3,ChannelId_ChannelId_5
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,ProductId_10,1000.0,1000,2018-11-15 02:18:49+00:00,2,...,False,True,True,False,False,False,False,False,True,False
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,ProductId_6,-20.0,20,2018-11-15 02:19:08+00:00,2,...,False,False,False,False,True,False,False,True,False,False
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,ProductId_1,500.0,500,2018-11-15 02:44:21+00:00,2,...,False,True,True,False,False,False,False,False,True,False
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,ProductId_21,20000.0,21800,2018-11-15 03:32:55+00:00,2,...,False,False,False,False,False,False,True,False,True,False
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,ProductId_6,-644.0,644,2018-11-15 03:34:21+00:00,2,...,False,False,False,False,True,False,False,True,False,False


In [16]:
data.isnull().sum()

Unnamed: 0,0
TransactionId,0
BatchId,0
AccountId,0
SubscriptionId,0
CustomerId,0
ProductId,0
Amount,0
Value,0
TransactionStartTime,0
PricingStrategy,0


In [17]:
data['Std_Transaction_Amount'].fillna(data['Std_Transaction_Amount'].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Std_Transaction_Amount'].fillna(data['Std_Transaction_Amount'].mean(), inplace=True)


In [19]:
scaler = StandardScaler()
numerical_cols = ['Total_Transaction_Amount', 'Avg_Transaction_Amount',
                  'Transaction_Count', 'Std_Transaction_Amount', 'Amount_log', 'Value_log']

# Standardize numerical columns
data[numerical_cols] = scaler.fit_transform(data[numerical_cols])