In [17]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer

In [18]:
def_feature = pd.read_csv("input/Xente_Variable_Definitions.csv")
df = pd.read_csv("input/training.csv")
data = df.copy()
X_test = pd.read_csv("input/test.csv")
sample_submission = pd.read_csv("input/sample_submission.csv")

In [19]:
class CreateExpenseFeature(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X["Expense"] = X["Amount"] < 0
        return X.drop("Amount", axis=1)


class OneHotEncoderWrapper(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return pd.get_dummies(X, columns=self.columns)


class DropUniqueColumns(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        unique_counts = X.nunique()
        self.drop_cols = unique_counts[unique_counts == 1].index.tolist()
        return self

    def transform(self, X, y=None):
        print("Dropping columns with one unique value:", self.drop_cols)
        return X.drop(columns=self.drop_cols)


class ConvertIdColumns(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        id_cols = X.filter(like="Id").columns.tolist()
        X[id_cols] = (
            X[id_cols]
            .astype(str)
            .apply(lambda x: x.str.replace(x.name + "_", ""))
            .astype(int)
        )
        return X


class ExtractDateTimeFeatures(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X["TransactionStartTime"] = pd.to_datetime(
            X["TransactionStartTime"], format="%Y-%m-%dT%H:%M:%SZ"
        )
        X["TransactionDayOfWeek"] = X["TransactionStartTime"].dt.dayofweek
        X["TransactionDayOfMonth"] = X["TransactionStartTime"].dt.day
        X["TransactionHour"] = X["TransactionStartTime"].dt.hour
        X["TransactionMinute"] = X["TransactionStartTime"].dt.minute
        return X.drop("TransactionStartTime", axis=1)


class ComputeMeanStdAmount(BaseEstimator, TransformerMixin):
    def __init__(self, mean_amount_features, std_amount_features):
        self.mean_amount_features = mean_amount_features
        self.std_amount_features = std_amount_features

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        for feature in self.mean_amount_features:
            X[f"{feature}_mean_amount"] = X.groupby(feature)["Value"].transform("mean")
        for feature in self.std_amount_features:
            X[f"{feature}_std_amount"] = X.groupby(feature)["Value"].transform("std")
        return X


class ColumnNormalizer(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        self.scaler = StandardScaler()
        self.scaler.fit(X[[self.column]])
        return self

    def transform(self, X, y=None):
        X = X.copy()
        X[self.column] = self.scaler.transform(X[[self.column]])
        return X

In [20]:
# Preprocessing steps
preprocessing_steps = [
    ("create_expense_feature", CreateExpenseFeature()),
    ("drop_unique_columns", DropUniqueColumns()),
    ("convert_id_columns", ConvertIdColumns()),
    ("extract_datetime_features", ExtractDateTimeFeatures()),
]

# Column transformations
normalize_columns = ColumnNormalizer("Value")

categorical_columns = ["ProductCategory", "ChannelId", "ProviderId", "PricingStrategy"]
one_hot_encoder = ColumnTransformer(
    [
        (
            "one_hot_encoder",
            OneHotEncoderWrapper(categorical_columns),
            categorical_columns,
        )
    ],
    remainder="passthrough",
)

mean_amount_features = [
    "AccountId",
    "SubscriptionId",
]
std_amount_features = [
    # The list of std_amount_features
]

# Compute mean and standard deviation of transactions for each feature
compute_mean_std_amount = ComputeMeanStdAmount(
    mean_amount_features=mean_amount_features,
    std_amount_features=std_amount_features,
)

# Combine preprocessing steps, column transformations, and feature computations
pipeline = Pipeline(
    steps=[
        ("preprocessing", Pipeline(preprocessing_steps)),
        ("normalize_columns", normalize_columns),
        ("one_hot_encoder", OneHotEncoderWrapper(categorical_columns)),
        ("compute_mean_std_amount", compute_mean_std_amount),
    ]
)

In [21]:
processed_data = pipeline.fit_transform(data)

# Get the transformed column names
transformed_columns = (
    data.columns.tolist()
    + pipeline.named_steps["one_hot_encoder"]
    .transform(pd.DataFrame(columns=categorical_columns))
    .columns.tolist()
    + pipeline.named_steps["compute_mean_std_amount"].mean_amount_features
    + pipeline.named_steps["compute_mean_std_amount"].std_amount_features
)

# Convert the result back to a DataFrame with updated column names
processed_data = pd.DataFrame(processed_data, columns=transformed_columns)

Dropping columns with one unique value: ['CurrencyCode', 'CountryCode']


In [23]:
processed_data.columns

Index(['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId',
       'CurrencyCode', 'CountryCode', 'ProviderId', 'ProductId',
       'ProductCategory', 'ChannelId', 'Amount', 'Value',
       'TransactionStartTime', 'PricingStrategy', 'FraudResult', 'Expense',
       'AccountId', 'SubscriptionId'],
      dtype='object')