## Import the scripts and preprocess the data

In [1]:
import pandas as pd
import sys
from pathlib import Path
import numpy as np


# Make the project's src/ directory importable. The notebook is expected to run
# from a direct subdirectory of the project root, so the root is the parent of
# the current working directory. The membership check keeps the cell
# idempotent: re-running it no longer appends duplicate entries to sys.path.
_src_dir = str(Path().resolve().parent / "src")
if _src_dir not in sys.path:
    sys.path.append(_src_dir)



In [2]:
# Load the raw transactions. The path is resolved from the project root (the
# parent of the notebook's working directory, the same convention used for the
# src/ import path) instead of a machine-specific absolute path, so the
# notebook runs on any checkout of the project.
df = pd.read_csv(Path().resolve().parent / "data" / "raw" / "data.csv")


In [3]:
# List the column names of the raw transactions frame.
df.columns

Index(['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId',
       'CurrencyCode', 'CountryCode', 'ProviderId', 'ProductId',
       'ProductCategory', 'ChannelId', 'Amount', 'Value',
       'TransactionStartTime', 'PricingStrategy', 'FraudResult'],
      dtype='object')

In [4]:
# (rows, columns) of the raw transactions frame.
df.shape

(95662, 16)

In [5]:
# Names of every column stored with 'object' or 'category' dtype.
cat_cols = list(df.select_dtypes(include=['object', 'category']).columns)

In [6]:
# Keep only the categorical columns with fewer than 100 distinct values;
# the distinct counts are computed for all candidate columns in one call.
unique_counts = df[cat_cols].nunique()
low_card_cat_cols = unique_counts[unique_counts < 100].index.tolist()

print("✅ Selected categorical features:", low_card_cat_cols)

✅ Selected categorical features: ['CurrencyCode', 'ProviderId', 'ProductId', 'ProductCategory', 'ChannelId']


In [7]:
import pandas as pd
from  main import main



In [9]:

import importlib

import Feature

# Reload FIRST, then bind the names. `from Feature import ...` copies the
# objects into this namespace at import time; doing it before the reload would
# leave build_pipeline / FEATURES / TARGET pointing at the stale, pre-reload
# definitions even though the module itself was refreshed.
Feature = importlib.reload(Feature)
from Feature import build_pipeline, FEATURES, TARGET

<module 'Feature' from '/home/samrawit/credit-risk-model/src/Feature.py'>

In [10]:
# Run the project's `main` entry point (imported from the `main` module) on the
# raw transactions; it prints the initial columns, the engineered columns and
# the final shape.
# NOTE(review): the log shows 3742 output rows for 95662 transactions --
# presumably one row per CustomerId; confirm in main.py.
processed_data = main(df)  

Initial columns: ['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId', 'CurrencyCode', 'CountryCode', 'ProviderId', 'ProductId', 'ProductCategory', 'ChannelId', 'Amount', 'Value', 'TransactionStartTime', 'PricingStrategy', 'FraudResult']
Engineered columns: ['CustomerId', 'Total_Transaction_Amount', 'Average_Transaction_Amount', 'Transaction_Count', 'Std_Transaction_Amount']
Final processed data shape: (3742, 4)


In [8]:
# Wrap the output of main() in a DataFrame.
# NOTE(review): no column names are supplied, so the columns end up labelled
# 0..3 (see the display below) and the engineered feature names are lost --
# confirm what main() returns and consider passing `columns=`.
processed_df = pd.DataFrame(processed_data)

In [9]:
# Display the processed frame (pandas truncates the middle rows in the output).
processed_df

Unnamed: 0,0,1,2,3
0,-0.066891,-0.153364,-0.253459,0.000000
1,-0.066891,-0.153364,-0.253459,0.000000
2,-0.055849,-0.069870,-0.212186,-0.105976
3,-0.061655,-0.091435,-0.150278,-0.168036
4,-0.055849,-0.073846,-0.201868,-0.111444
...,...,...,...,...
3737,-0.055849,-0.073846,-0.201868,-0.110846
3738,-0.055849,-0.069870,-0.212186,-0.104047
3739,0.136968,-0.061611,0.778355,-0.020708
3740,-0.012049,-0.044962,-0.088369,-0.127968


In [10]:
# Persist the processed features. The destination is resolved from the project
# root (parent of the notebook's working directory) rather than a
# machine-specific absolute path, and the folder is created if it is missing so
# the write does not fail on a fresh checkout.
processed_path = Path().resolve().parent / "data" / "Processed" / "processed_data.csv"
processed_path.parent.mkdir(parents=True, exist_ok=True)
processed_df.to_csv(processed_path, index=False)

In [11]:
# Re-load the raw transactions for the Feature-pipeline run below. The path is
# built from the project root (parent of the notebook's working directory)
# instead of a machine-specific absolute path; CSV_PATH is kept as a plain
# string so any code expecting a str path keeps working.
CSV_PATH = str(Path().resolve().parent / "data" / "raw" / "data.csv")
df = pd.read_csv(CSV_PATH)
df.head()
    

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2018-11-15T02:18:49Z,2,0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2018-11-15T02:19:08Z,2,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2018-11-15T02:44:21Z,2,0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2018-11-15T03:32:55Z,2,0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2018-11-15T03:34:21Z,2,0


In [11]:
# Split the frame into predictors (everything except TARGET) and the label.
X, y = df.drop(columns=[TARGET]), df[TARGET]


In [12]:
# Build the preprocessing pipeline provided by the project's Feature module.
pipe = build_pipeline()
# Fit the pipeline and transform X in a single pass. y is forwarded to the fit;
# whether any step actually uses it depends on build_pipeline (not visible here).
# NOTE(review): the pipeline is fitted on the full dataset -- if a train/test
# split happens later, fit on the training portion only to avoid leakage.
X_ready = pipe.fit_transform(X, y)


In [13]:
from scipy import sparse

def get_feature_names_from_column_transformer(column_transformer):
    """Collect output feature names from a fitted ColumnTransformer-like object.

    Iterates over ``column_transformer.transformers_`` -- an iterable of
    ``(name, transformer, columns)`` triples -- and builds a flat list of
    names in the same order:

    * entries whose transformer is the string ``'drop'`` contribute nothing
      (regardless of the entry's name, not only for ``'remainder'``);
    * transformers exposing ``get_feature_names_out`` are asked for their
      output names given the input ``columns``;
    * everything else (e.g. ``'passthrough'`` or transformers without that
      method, or whose call fails) falls back to the input column names.

    Parameters
    ----------
    column_transformer : object
        Any object with a ``transformers_`` attribute as described above.

    Returns
    -------
    list
        Feature names, one per output column reported by the transformers.
    """
    feature_names = []

    for _name, transformer, columns in column_transformer.transformers_:
        # Dropped column groups produce no output columns at all.
        if isinstance(transformer, str) and transformer == 'drop':
            continue

        # A single column may be specified as a bare string; wrap it so that
        # list.extend() below does not split the name into characters.
        if isinstance(columns, str):
            columns = [columns]

        names = columns
        if hasattr(transformer, 'get_feature_names_out'):
            try:
                names = transformer.get_feature_names_out(columns)
            except Exception:
                # Best-effort fallback to the input names. Catching Exception
                # (not a bare except) lets KeyboardInterrupt/SystemExit
                # propagate instead of being silently swallowed.
                names = columns
        feature_names.extend(names)
    return feature_names

# Recover column labels from the fitted 'prep' step of the pipeline.
preprocessor = pipe.named_steps['prep']
feature_names = get_feature_names_from_column_transformer(preprocessor)

# Sparse output goes through pandas' sparse accessor; any other output uses
# the regular DataFrame constructor.
df_ready = (
    pd.DataFrame.sparse.from_spmatrix(X_ready, columns=feature_names)
    if sparse.issparse(X_ready)
    else pd.DataFrame(X_ready, columns=feature_names)
)

print("✅ Shape after preprocessing:", X_ready.shape)
df_ready.head()

✅ Shape after preprocessing: (95662, 51)


Unnamed: 0,Amount_sum,Amount_mean,Amount_count,Amount_std,TransactionStartTime_hour,TransactionStartTime_day,TransactionStartTime_month,TransactionStartTime_year,CurrencyCode_UGX,ChannelId_ChannelId_1,...,ProductId_ProductId_9,ProductCategory_airtime,ProductCategory_data_bundles,ProductCategory_financial_services,ProductCategory_movies,ProductCategory_other,ProductCategory_ticket,ProductCategory_transport,ProductCategory_tv,ProductCategory_utility_bill
0,0.170118,-0.067623,-0.311831,-0.167524,-2.15553,-0.100739,0.848684,-0.994246,1.0,0,...,0,1.0,0,0.0,0,0,0,0,0,0.0
1,0.170118,-0.067623,-0.311831,-0.167524,-2.15553,-0.100739,0.848684,-0.994246,1.0,0,...,0,0.0,0,1.0,0,0,0,0,0,0.0
2,0.165122,-0.072568,-0.444993,-0.201719,-2.15553,-0.100739,0.848684,-0.994246,1.0,0,...,0,1.0,0,0.0,0,0,0,0,0,0.0
3,0.175567,-0.008155,-0.40402,-0.008737,-1.949214,-0.100739,0.848684,-0.994246,1.0,0,...,0,0.0,0,0.0,0,0,0,0,0,1.0
4,0.175567,-0.008155,-0.40402,-0.008737,-1.949214,-0.100739,0.848684,-0.994246,1.0,0,...,0,0.0,0,1.0,0,0,0,0,0,0.0


In [14]:
# Persist the pipeline-transformed features. The destination is resolved from
# the project root (parent of the notebook's working directory) rather than a
# machine-specific absolute path, and the folder is created if it is missing.
ready_path = Path().resolve().parent / "data" / "Processed" / "processed_NEWdata.csv"
ready_path.parent.mkdir(parents=True, exist_ok=True)
df_ready.to_csv(ready_path, index=False)