In [2]:
import sys
sys.path.append('../scripts')

from scripts.load_data import load_data
from scripts.feature_engineering import create_aggregate_features, extract_temporal_features, encode_categorical, handle_missing_values, scale_features, calculate_woe_iv

# Feature-engineering walkthrough: load the raw CSV, then run each helper
# imported from scripts.feature_engineering in turn, printing a preview
# (.head()) after every step so intermediate results can be inspected.

# Load the data
data = load_data('../data/data.csv')

# Step 1: Create aggregate features
# The result is kept in its own variable; it is only previewed here and is not
# passed to any later step in this cell.
agg_features = create_aggregate_features(data)
print("Aggregated features:")
print(agg_features.head())

# Step 2: Extract temporal features
# `data` is rebound to the helper's return value; the preview below reads the
# 'transaction_hour' and 'transaction_day' columns alongside the source
# 'TransactionStartTime' column.
data = extract_temporal_features(data)
print("Data with extracted temporal features:")
print(data[['TransactionStartTime', 'transaction_hour', 'transaction_day']].head())

# Step 3: Ensure the necessary columns are categorical before WoE calculation
# Dtypes are printed before and after the cast so the conversion can be verified.
print(f"Before conversion: ProductCategory type: {data['ProductCategory'].dtype}, ChannelId type: {data['ChannelId'].dtype}")

# Convert columns to categorical
data['ProductCategory'] = data['ProductCategory'].astype('category')
data['ChannelId'] = data['ChannelId'].astype('category')

print(f"After conversion: ProductCategory type: {data['ProductCategory'].dtype}, ChannelId type: {data['ChannelId'].dtype}")

# Step 4: Calculate WoE and IV for original categorical variables
# 'FraudResult' is passed as the target column name; the helper returns a pair
# that is unpacked into the transformed frame and the IV values.
# NOTE(review): in the recorded run of this cell the call below raised
# "ValueError: Error during WoE/IV calculation: The input data must be pandas
# dataframe. But the input provided is <class 'str'>", so nothing after it
# produced output. The cause appears to be inside calculate_woe_iv (defined in
# scripts/feature_engineering, not visible here) - TODO confirm and fix there.
categorical_columns = ['ProductCategory', 'ChannelId']
woe_features, iv_values = calculate_woe_iv(data, 'FraudResult', categorical_columns)
print("WoE-transformed data:")
print(woe_features.head())

print("Information Value of features:")
print(iv_values)

# Step 5: Encode remaining categorical variables (if any)
# An empty column list is passed, so presumably nothing is one-hot encoded at
# this point - verify against encode_categorical.
data_encoded = encode_categorical(woe_features, [], method='one-hot')  # Skip encoding for WoE transformed columns
print("Encoded categorical variables:")
print(data_encoded.head())

# Step 6: Handle missing values
# Uses the helper's 'mean' strategy on the output of step 5.
data_filled = handle_missing_values(data_encoded, strategy='mean')
print("Data after handling missing values:")
print(data_filled.head())

# Step 7: Normalize/Standardize numerical features
# Only 'Amount' and 'Value' are scaled, using the helper's 'normalize' method;
# the preview prints those same two columns.
numerical_columns = ['Amount', 'Value']
data_normalized = scale_features(data_filled, numerical_columns, method='normalize')
print("Normalized numerical features:")
print(data_normalized[['Amount', 'Value']].head())


Data loaded successfully. Shape: (95662, 16)
Aggregated features:
        CustomerId  total_amount    avg_amount  transaction_count  \
0     CustomerId_1      -10000.0 -10000.000000                  1   
1    CustomerId_10      -10000.0 -10000.000000                  1   
2  CustomerId_1001       20000.0   4000.000000                  5   
3  CustomerId_1002        4225.0    384.090909                 11   
4  CustomerId_1003       20000.0   3333.333333                  6   

   std_transaction_amount  
0                0.000000  
1                0.000000  
2             6558.963333  
3              560.498966  
4             6030.478146  
Data with extracted temporal features:
       TransactionStartTime  transaction_hour  transaction_day
0 2018-11-15 02:18:49+00:00                 2               15
1 2018-11-15 02:19:08+00:00                 2               15
2 2018-11-15 02:44:21+00:00                 2               15
3 2018-11-15 03:32:55+00:00                 3               15

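FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.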

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  agg_features['std_transaction_amount'].fillna(0, inplace=True)


After conversion: ProductCategory type: category, ChannelId type: category
Feature DataFrame shape: (95662, 2), types:
ProductCategory    category
ChannelId          category
dtype: object
Target Series shape: (95662,), type: int64


ValueError: Error during WoE/IV calculation: The input data must be pandas dataframe. But the input provided is <class 'str'>