In [28]:
import pandas as pd
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from datetime import datetime

from Utils import *

In [29]:
# Glossary describing each variable of the dataset.
def_feature = pd.read_csv("input/Xente_Variable_Definitions.csv")

# Labelled transactions: `df` stays untouched as the raw reference, `data` is
# the working copy (it still contains "FraudResult" at this point), and
# `y_train` is the target column.
df = pd.read_csv("input/training.csv")
y_train = df["FraudResult"]
data = df.copy()

# Unlabelled transactions; `X_valid` keeps a copy of them exactly as loaded.
X_test = pd.read_csv("input/test.csv")
X_valid = X_test.copy()

sample_submission = pd.read_csv("input/sample_submission.csv")

# Small preprocessing

In [30]:
# Columns holding exactly one unique value are constant and get dropped from
# both frames.
unique_counts = data.nunique()
drop_cols = unique_counts[unique_counts == 1].index.tolist()
print("Dropping column with one unique values: ", drop_cols)

# Remaining columns whose name contains "Id" (case-sensitive substring match).
id_cols = [col for col in data.filter(like="Id").columns if col not in drop_cols]
print("Converting columns with Id: ", id_cols)


def _preprocess_transactions(frame, drop_cols, id_cols):
    """Apply the frame-independent cleaning steps to a transactions frame.

    The same steps are needed for the training and the test frame, so they
    live in one place instead of being copy-pasted.

    Parameters
    ----------
    frame : pandas.DataFrame
        Raw transactions. Must contain ``drop_cols``, ``id_cols``,
        "TransactionStartTime", "Amount" and "Value".
    drop_cols : list of str
        Columns to remove.
    id_cols : list of str
        Columns whose values get the "<column name>_" prefix stripped before
        being cast to int.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``frame`` itself is not modified.

    Raises
    ------
    KeyError
        If a required column is missing.
    ValueError
        If an id value is not an integer once the prefix is removed, or a
        timestamp does not match the expected format.
    """
    out = frame.drop(columns=drop_cols)

    # Strip the "<column name>_" prefix and keep the numeric remainder.
    # regex=False: the prefix is a literal string, not a pattern.
    out[id_cols] = (
        out[id_cols]
        .astype(str)
        .apply(lambda col: col.str.replace(col.name + "_", "", regex=False))
        .astype(int)
    )

    # Replace the raw timestamp by calendar/time-of-day features.
    start_time = pd.to_datetime(
        out["TransactionStartTime"], format="%Y-%m-%dT%H:%M:%SZ"
    )
    out["TransactionDayOfWeek"] = start_time.dt.dayofweek
    out["TransactionDayOfMonth"] = start_time.dt.day
    out["TransactionHour"] = start_time.dt.hour
    out["TransactionMinute"] = start_time.dt.minute
    out = out.drop(columns="TransactionStartTime")

    # Only the sign of "Amount" is kept, as a 0/1 flag (1 when Amount < 0).
    out["Expense"] = (out["Amount"] < 0).astype(int)
    out = out.drop(columns="Amount")

    # Continuous value should be float.
    out["Value"] = out["Value"].astype(float)
    return out


data = _preprocess_transactions(data, drop_cols, id_cols)
X_test = _preprocess_transactions(X_test, drop_cols, id_cols)

# Integer-encode "ProductCategory". The codes are learned on the training
# frame and re-used for the test frame: factorizing each frame on its own
# numbers categories by order of first appearance, so the same category could
# end up with different codes in train and test.
category_codes, category_levels = pd.factorize(data["ProductCategory"])
data["ProductCategory"] = (category_codes + 1).astype(int)

category_to_code = {
    level: code for code, level in enumerate(category_levels, start=1)
}
# Categories absent from the training frame (and missing values) get code 0,
# the same code the training encoding gives to missing values (-1 + 1).
X_test["ProductCategory"] = (
    X_test["ProductCategory"].map(category_to_code).fillna(0).astype(int)
)

Dropping column with one unique values:  ['CurrencyCode', 'CountryCode']
Converting columns with Id:  ['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId', 'ProviderId', 'ProductId', 'ChannelId']


# Scenario 1: With All columns

In [31]:
# Remove the target from the feature frame before evaluating. The drop is
# written so that re-running this cell does not raise a KeyError once the
# column is already gone.
data = data.drop(columns="FraudResult", errors="ignore")
# mi_score = make_mi_scores(data, y_train)
train_X, val_X, train_y, val_y = train_test_split(data, y_train, random_state=0)
# NOTE(review): the last recorded run of this cell failed with
# "NameError: name 'comparemodels' is not defined" - confirm that `Utils`
# actually exports `comparemodels` (and under this exact name).
referenceresult = comparemodels(train_X, train_y, val_X, X_test, "The reference")
referenceresult

NameError: name 'comparemodels' is not defined

# Scenario 2: Dropping Unique ID columns

In [5]:
# Identifier columns removed from both frames for this scenario.
dropping_cols = ["TransactionId", "BatchId", "AccountId", "CustomerId", "SubscriptionId"]

# One drop per frame: a column-by-column loop copies the frame once per column
# and leaves the frames half-modified if one of the names is missing.
data = data.drop(columns=dropping_cols)
X_test = X_test.drop(columns=dropping_cols)

# TODO: evaluate this scenario (drop FraudResult from `data` before)

# Scenario 3: Adding new features

## Analysing features that impact fraud

In [6]:
# Columns that get one-hot encoded later on.
categorical_col = ["ProviderId", "ProductCategory", "ChannelId", "PricingStrategy"]
# Columns used as grouping keys for the per-group "Value" statistics.
mean_std_col = ['ProductId', 'Value', 'Expense', "ProviderId", "ProductCategory", "ChannelId", "PricingStrategy"]


def _add_value_stats_by_group(frame, group_cols):
    """Add per-group mean and standard deviation of "Value" as new columns.

    For every column ``c`` in ``group_cols`` two columns are added:
    ``"<c>_mean_amount"`` and ``"<c>_std_amount"``, holding the mean and the
    standard deviation of "Value" over all rows that share the row's value of
    ``c``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame containing "Value" and every column of ``group_cols``.
        It is modified in place.
    group_cols : list of str
        Columns to group by.

    Returns
    -------
    pandas.DataFrame
        The same ``frame`` object, for convenient re-assignment.
    """
    for feature in group_cols:
        values_by_group = frame.groupby(feature)["Value"]
        group_means = values_by_group.mean()
        # pandas computes the sample standard deviation by default, so a group
        # with a single row yields NaN here.
        group_stds = values_by_group.std()
        # Series.map looks every key up in the per-group result in one
        # vectorized step instead of calling a Python lambda once per row.
        frame[f"{feature}_mean_amount"] = frame[feature].map(group_means)
        frame[f"{feature}_std_amount"] = frame[feature].map(group_stds)
    return frame


data = _add_value_stats_by_group(data, mean_std_col)
# NOTE(review): the test statistics are computed from the test frame itself,
# not re-used from the training frame, so the same group can carry different
# feature values in train and test - confirm this is intended.
X_test = _add_value_stats_by_group(X_test, mean_std_col)

In [7]:
# One-hot encode the categorical features of both frames.
train_cols_before = set(data.columns)
test_cols_before = set(X_test.columns)
data = pd.get_dummies(data, columns=categorical_col)
X_test = pd.get_dummies(X_test, columns=categorical_col)

# Indicator columns created by each call, in the order pandas produced them.
train_dummies = [col for col in data.columns if col not in train_cols_before]
test_dummies = [col for col in X_test.columns if col not in test_cols_before]

# A category present in only one frame produces an indicator column in that
# frame only; give the other frame an all-False counterpart so both expose the
# same indicators. "PricingStrategy_3" is additionally kept as a placeholder in
# both frames, as this cell previously hard-coded it.
# Existing columns are never overwritten: an unconditional `frame[col] = False`
# would wipe real indicator values if the category does occur in that frame.
for col in test_dummies + ["PricingStrategy_3"]:
    if col not in data.columns:
        data[col] = False
for col in train_dummies + ["PricingStrategy_3"]:
    if col not in X_test.columns:
        X_test[col] = False

In [8]:
# Cast every boolean column to 0/1 integers, first in the training frame and
# then in the test frame.
for frame in (data, X_test):
    bool_cols = frame.select_dtypes(include=bool).columns.tolist()
    frame[bool_cols] = frame[bool_cols].astype(int)

In [9]:
# Display the (rows, columns) shape of the training frame at this point.
data.shape

(95662, 47)