In [1]:

import pandas as pd
import os, sys

In [2]:
sys.path.append(os.path.abspath(os.path.join('..', 'src')))

In [3]:
from feature_engineering import FeatureEngineering

In [4]:
data = pd.read_csv("../data/data.csv")

In [5]:
data.head()

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2018-11-15T02:18:49Z,2,0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2018-11-15T02:19:08Z,2,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2018-11-15T02:44:21Z,2,0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2018-11-15T03:32:55Z,2,0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2018-11-15T03:34:21Z,2,0


In [9]:
df_copy = data.copy().reset_index()
    
    # Identify columns to exclude and categorical columns to encode
cols_to_drop = ['ProductId', 'BatchId', 'AccountId', 'ProviderId', 'SubscriptionId', 
                    'Value', 'CountryCode', 'CurrencyCode']
cat_features = ['ProductCategory', 'ChannelId']

# Drop the identified columns
df_copy.drop(columns=cols_to_drop, inplace=True)

In [10]:
fe = FeatureEngineering()

In [11]:
aggregated_df = fe.create_aggregate_features(df_copy)

In [13]:
# Create transaction-based features
df_with_transaction_features = fe.create_transaction_features(aggregated_df)

In [14]:
# Extract time features
df_with_time_features = fe.extract_time_features(df_with_transaction_features)

In [15]:
# Encode categorical features
df_encoded = fe.encode_categorical_features(df_with_time_features, cat_features)

In [16]:
# Handle missing values
df_cleaned = fe.handle_missing_values(df_encoded)

In [18]:
 # Identify numerical columns to normalize, excluding specified columns like 'Amount' and 'FraudResult'
numeric_cols = df_cleaned.select_dtypes(include='number').columns.tolist()
exclude_cols = ['Amount', 'FraudResult']  # Add any other columns you wish to exclude from normalization
numeric_cols = [col for col in numeric_cols if col not in exclude_cols]

    # Normalize numerical features
df_normalized = fe.normalize_numerical_features(df_cleaned, numeric_cols, method='normalize')

In [21]:
df_normalized.head(10)

Unnamed: 0_level_0,index,CustomerId,Amount,TransactionStartTime,PricingStrategy,FraudResult,Total_Transaction_Amount,Average_Transaction_Amount,Transaction_Count,Std_Transaction_Amount,...,ProductCategory_financial_services,ProductCategory_movies,ProductCategory_other,ProductCategory_ticket,ProductCategory_transport,ProductCategory_tv,ProductCategory_utility_bill,ChannelId_ChannelId_2,ChannelId_ChannelId_3,ChannelId_ChannelId_5
TransactionId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TransactionId_76871,0.0,CustomerId_4406,1000.0,2018-11-15 02:18:49+00:00,0.5,0.0,0.557522,0.047184,0.028851,0.000919,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
TransactionId_73770,1e-05,CustomerId_4406,-20.0,2018-11-15 02:19:08+00:00,0.5,0.0,0.557522,0.047184,0.028851,0.000919,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
TransactionId_26203,2.1e-05,CustomerId_4683,500.0,2018-11-15 02:44:21+00:00,0.5,0.0,0.556944,0.047137,0.000244,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
TransactionId_380,3.1e-05,CustomerId_988,20000.0,2018-11-15 03:32:55+00:00,0.5,0.0,0.558153,0.047749,0.009046,0.005187,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
TransactionId_28195,4.2e-05,CustomerId_988,-644.0,2018-11-15 03:34:21+00:00,0.5,0.0,0.558153,0.047749,0.009046,0.005187,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
TransactionId_23223,5.2e-05,CustomerId_1432,2000.0,2018-11-15 03:35:10+00:00,0.5,0.0,0.556949,0.047303,0.0,0.005449,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
TransactionId_118063,6.3e-05,CustomerId_2858,10000.0,2018-11-15 03:44:31+00:00,1.0,0.0,0.557434,0.047439,0.006846,0.00166,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
TransactionId_100640,7.3e-05,CustomerId_2858,-500.0,2018-11-15 03:45:13+00:00,0.5,0.0,0.557434,0.047439,0.006846,0.00166,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
TransactionId_51905,8.4e-05,CustomerId_598,500.0,2018-11-15 04:14:59+00:00,0.5,0.0,0.556981,0.047305,0.000733,0.00101,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
TransactionId_130161,9.4e-05,CustomerId_1053,600.0,2018-11-15 04:31:48+00:00,0.5,0.0,0.557011,0.0473,0.001467,0.000577,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [22]:
# Save extracted and cleaned features to csv
df_normalized.to_csv('../data/extracted_features.csv')