## Data cleaning and preprocessing

In [19]:
import sys
import os
sys.path.append('../scripts')
from data_preprocessor import *

In [None]:
# Load the raw transactions dataset through the project's DataLoader helper.
file_path = '../data/raw/data.csv'
raw_data = DataLoader.load_data(file_path)

# NOTE(review): load_data appears to return None when loading fails (confirm
# in data_preprocessor). Stop here with a clear message rather than letting
# the attribute access below fail with an opaque AttributeError.
if raw_data is None:
    raise RuntimeError(f"Could not load dataset from {file_path!r}")

# Preview the first ten rows of the dataset.
raw_data.head(10)

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256.0,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000.0,2018-11-15T02:18:49Z,2.0,0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,,ProviderId_4,ProductId_6,financial_services,ChannelId_2,3679.0,20.0,2018-11-15T02:19:08Z,2.0,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256.0,,ProductId_1,airtime,ChannelId_3,500.0,500.0,2018-11-15T02:44:21Z,2.0,0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256.0,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,,2018-11-15T03:32:55Z,2.0,0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644.0,2018-11-15T03:34:21Z,2.0,0
5,TransactionId_23223,BatchId_25954,AccountId_1078,SubscriptionId_4238,CustomerId_1432,UGX,256.0,ProviderId_6,ProductId_3,airtime,ChannelId_3,3679.0,,2018-11-15T03:35:10Z,2.0,0
6,TransactionId_118063,BatchId_118460,AccountId_2442,SubscriptionId_1980,CustomerId_2858,UGX,256.0,ProviderId_5,ProductId_3,airtime,ChannelId_3,10000.0,10000.0,2018-11-15T03:44:31Z,2.0,0
7,TransactionId_100640,BatchId_38561,AccountId_4841,SubscriptionId_3829,CustomerId_2858,UGX,256.0,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-500.0,,2018-11-15T03:45:13Z,2.0,0
8,TransactionId_51905,BatchId_93774,AccountId_272,SubscriptionId_4731,CustomerId_598,UGX,256.0,ProviderId_6,ProductId_10,airtime,ChannelId_3,3679.0,500.0,2018-11-15T04:14:59Z,2.0,0
9,TransactionId_130161,BatchId_82409,AccountId_710,SubscriptionId_920,CustomerId_1053,UGX,256.0,ProviderId_1,ProductId_15,financial_services,ChannelId_3,600.0,600.0,2018-11-15T04:31:48Z,2.0,0


In [21]:
# Dimensions of the raw frame as (rows, columns).
raw_data.shape

(95662, 16)

In [22]:
# Per-column non-null counts and dtypes; used here to spot columns with missing values.
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95662 entries, 0 to 95661
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   TransactionId         95662 non-null  object 
 1   BatchId               95662 non-null  object 
 2   AccountId             95660 non-null  object 
 3   SubscriptionId        95662 non-null  object 
 4   CustomerId            95662 non-null  object 
 5   CurrencyCode          95662 non-null  object 
 6   CountryCode           95660 non-null  float64
 7   ProviderId            95660 non-null  object 
 8   ProductId             95662 non-null  object 
 9   ProductCategory       95662 non-null  object 
 10  ChannelId             95662 non-null  object 
 11  Amount                95662 non-null  float64
 12  Value                 95659 non-null  float64
 13  TransactionStartTime  95661 non-null  object 
 14  PricingStrategy       95659 non-null  float64
 15  FraudResult        

In [23]:
# Distinct values present in the ChannelId column of the raw data.
raw_data['ChannelId'].unique()

array(['ChannelId_3', 'ChannelId_2', 'ChannelId_1', 'ChannelId_5'],
      dtype=object)

## Apply the data cleaning pipeline

In [24]:
# Wrap the raw frame in the project's DataCleaner; the cleaning steps below are called on this object.
# NOTE(review): whether DataCleaner copies raw_data or works on it in place is
# not visible from this notebook — confirm before reusing raw_data afterwards.
cleaner = DataCleaner(raw_data)

In [25]:
# Ask the cleaner which columns it treats as categorical and which as numerical,
# then echo both lists so the split can be reviewed.
categorical_vars, numerical_vars = cleaner.identify_variable_types()
print(f"Categorical Variables: {categorical_vars}")
print(f"Numerical Variables: {numerical_vars}")


Categorical Variables: ['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId', 'CurrencyCode', 'ProviderId', 'ProductId', 'ProductCategory', 'ChannelId', 'TransactionStartTime']
Numerical Variables: ['CountryCode', 'Amount', 'Value', 'PricingStrategy', 'FraudResult']


In [26]:
# Fill missing values in both variable groups via the cleaner's fill_missing_values step.
# NOTE(review): categorical_vars includes TransactionStartTime (still an object
# column at this point), so its missing entry is filled by whatever strategy is
# used for categoricals — confirm that is intended for a timestamp.
df = cleaner.fill_missing_values(categorical_vars, numerical_vars)


In [12]:
# NOTE(review): constant-column removal is switched off and this cell's execution
# count is out of sequence. Either re-enable the step deliberately or delete the cell.
#df = cleaner.remove_constant_columns()


In [27]:
# Hand the timestamp column(s) to the cleaner's convert_to_datetime step.
date_columns = ["TransactionStartTime"]  # Modify based on your dataset
df = cleaner.convert_to_datetime(date_columns)


In [28]:
# Treat outliers only in feature columns. FraudResult is presumably the 0/1
# fraud label, so it is left out: capping or otherwise altering a rare positive
# class would corrupt the target values.
# NOTE(review): CountryCode and PricingStrategy look like codes rather than
# continuous measurements — confirm whether they should be excluded as well.
outlier_columns = [col for col in numerical_vars if col != "FraudResult"]
df = cleaner.handle_outliers(outlier_columns)


In [29]:
# Count the remaining missing values per column (should show 0 or minimal missing values).
df.isna().sum()


TransactionId           0
BatchId                 0
AccountId               0
SubscriptionId          0
CustomerId              0
CurrencyCode            0
CountryCode             0
ProviderId              0
ProductId               0
ProductCategory         0
ChannelId               0
Amount                  0
Value                   0
TransactionStartTime    0
PricingStrategy         0
FraudResult             0
dtype: int64

In [30]:
# Re-check non-null counts and dtypes after the cleaning steps.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95662 entries, 0 to 95661
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype              
---  ------                --------------  -----              
 0   TransactionId         95662 non-null  object             
 1   BatchId               95662 non-null  object             
 2   AccountId             95662 non-null  object             
 3   SubscriptionId        95662 non-null  object             
 4   CustomerId            95662 non-null  object             
 5   CurrencyCode          95662 non-null  object             
 6   CountryCode           95662 non-null  float64            
 7   ProviderId            95662 non-null  object             
 8   ProductId             95662 non-null  object             
 9   ProductCategory       95662 non-null  object             
 10  ChannelId             95662 non-null  object             
 11  Amount                95662 non-null  float64            
 12  Valu

In [31]:
# Persist the cleaned frame produced by the pipeline steps above.
# The raw CSV is deliberately NOT reloaded here: doing so would rebind `df` to
# the uncleaned data and write it out under the "cleaned" file name.
if df is not None:  # Ensure there is data to save
    DataLoader.save_data(df, "../data/processed", "cleaned_df.parquet")  # Save data


Dataset saved successfully at: ../data/processed\cleaned_df.parquet
