In [1]:
%load_ext kedro.ipython

In [2]:
import pandas as pd
import numpy as np

from imblearn.under_sampling import RandomUnderSampler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler

In [3]:
catalog.list()


[1m[[0m
    [32m'fraud_train_raw'[0m,
    [32m'fraud_test_raw'[0m,
    [32m'raw_merged'[0m,
    [32m'raw_merged_prepared'[0m,
    [32m'fraud_train_unprocessed'[0m,
    [32m'fraud_test_unprocessed'[0m,
    [32m'fraud_val_unprocessed'[0m,
    [32m'fraud_train_processed'[0m,
    [32m'fraud_test_processed'[0m,
    [32m'fraud_val_processed'[0m,
    [32m'ml_model'[0m,
    [32m'classification_report'[0m,
    [32m'parameters'[0m,
    [32m'params:predictor_cols'[0m,
    [32m'params:target_col'[0m,
    [32m'params:top_categories'[0m,
    [32m'params:top_categories.cities'[0m,
    [32m'params:top_categories.states'[0m,
    [32m'params:test_size'[0m,
    [32m'params:undersampling'[0m,
    [32m'params:undersampling.desired_proportion'[0m,
    [32m'params:undersampling.total_samples'[0m,
    [32m'params:preprocess_features'[0m,
    [32m'params:preprocess_features.numeric_features'[0m,
    [32m'params:preprocess_features.numeric_features.standard'[

In [4]:
predictor_cols = catalog.load("params:predictor_cols")
predictor_cols

[1m[[0m[32m'trans_date_trans_time'[0m, [32m'category'[0m, [32m'amt'[0m, [32m'city'[0m, [32m'state'[0m, [32m'dob'[0m, [32m'is_fraud'[0m[1m][0m

In [5]:
train_df = catalog.load("fraud_train_raw")

# Select only columns in `predictor_cols`
train_df = train_df[predictor_cols]
train_df

Unnamed: 0,trans_date_trans_time,category,amt,city,state,dob,is_fraud
0,2019-01-01 00:00:18,misc_net,4.97,Moravian Falls,NC,1988-03-09,0
1,2019-01-01 00:00:44,grocery_pos,107.23,Orient,WA,1978-06-21,0
2,2019-01-01 00:00:51,entertainment,220.11,Malad City,ID,1962-01-19,0
3,2019-01-01 00:01:16,gas_transport,45.00,Boulder,MT,1967-01-12,0
4,2019-01-01 00:03:06,misc_pos,41.96,Doe Hill,VA,1986-03-28,0
...,...,...,...,...,...,...,...
1296670,2020-06-21 12:12:08,entertainment,15.56,Hatch,UT,1961-11-24,0
1296671,2020-06-21 12:12:19,food_dining,51.70,Tuscarora,MD,1979-12-11,0
1296672,2020-06-21 12:12:32,food_dining,105.93,High Rolls Mountain Park,NM,1967-08-30,0
1296673,2020-06-21 12:13:36,food_dining,74.90,Manderson,SD,1980-08-18,0


In [6]:
train_df.dtypes


trans_date_trans_time     object
category                  object
amt                      float64
city                      object
state                     object
dob                       object
is_fraud                   int64
dtype: object

In [7]:
test_df = catalog.load("fraud_test_raw")

# Select only columns in `predictor_cols`
test_df = test_df[predictor_cols]
test_df

Unnamed: 0,trans_date_trans_time,category,amt,city,state,dob,is_fraud
0,2020-06-21 12:14:25,personal_care,2.86,Columbia,SC,1968-03-19,0
1,2020-06-21 12:14:33,personal_care,29.84,Altonah,UT,1990-01-17,0
2,2020-06-21 12:14:53,health_fitness,41.28,Bellmore,NY,1970-10-21,0
3,2020-06-21 12:15:15,misc_pos,60.05,Titusville,FL,1987-07-25,0
4,2020-06-21 12:15:17,travel,3.19,Falmouth,MI,1955-07-06,0
...,...,...,...,...,...,...,...
555714,2020-12-31 23:59:07,health_fitness,43.77,Luray,MO,1966-02-13,0
555715,2020-12-31 23:59:09,kids_pets,111.84,Lake Jackson,TX,1999-12-27,0
555716,2020-12-31 23:59:15,kids_pets,86.88,Burbank,WA,1981-11-29,0
555717,2020-12-31 23:59:24,travel,7.99,Mesa,ID,1965-12-15,0


In [8]:
test_df.dtypes


trans_date_trans_time     object
category                  object
amt                      float64
city                      object
state                     object
dob                       object
is_fraud                   int64
dtype: object

In [9]:
train_df["dataset"] = "train"
test_df["dataset"] = "test"

raw_df = pd.concat([train_df, test_df], axis=0)

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
print()
print(f"Combined shape: {raw_df.shape}")

Train shape: (1296675, 8)
Test shape: (555719, 8)

Combined shape: (1852394, 8)


## Preprocess

In [10]:
raw_df["trans_date_trans_time"] = pd.to_datetime(raw_df["trans_date_trans_time"])
raw_df["dob"] = pd.to_datetime(raw_df["dob"])

# Day of week
# days = {0: "Mon", 1: "Tue", 2: "Wed", 3: "Thu", 4: "Fri", 5: "Sat", 6: "Sun"}
# raw_df["dayofweek"] = raw_df["trans_date_trans_time"].dt.dayofweek.map(days)
raw_df["dayofweek"] = raw_df["trans_date_trans_time"].dt.dayofweek
# Hour
raw_df["hour"] = raw_df["trans_date_trans_time"].dt.hour
# Age
raw_df["age"] = raw_df["trans_date_trans_time"].dt.year - raw_df["dob"].dt.year

raw_df.head()

Unnamed: 0,trans_date_trans_time,category,amt,city,state,dob,is_fraud,dataset,dayofweek,hour,age
0,2019-01-01 00:00:18,misc_net,4.97,Moravian Falls,NC,1988-03-09,0,train,1,0,31
1,2019-01-01 00:00:44,grocery_pos,107.23,Orient,WA,1978-06-21,0,train,1,0,41
2,2019-01-01 00:00:51,entertainment,220.11,Malad City,ID,1962-01-19,0,train,1,0,57
3,2019-01-01 00:01:16,gas_transport,45.0,Boulder,MT,1967-01-12,0,train,1,0,52
4,2019-01-01 00:03:06,misc_pos,41.96,Doe Hill,VA,1986-03-28,0,train,1,0,33


### `category`

In [11]:
train_df["category"].nunique()

[1;36m14[0m

### `city`

In [12]:
train_df["city"].nunique()

[1;36m894[0m

In [13]:
city_percentile = 99

In [14]:
city_threshold = np.percentile(train_df["city"].value_counts(), q=city_percentile)
print(city_threshold)
(train_df["city"].value_counts() >= city_threshold).sum()

4198.169999999973


[1;36m9[0m

In [15]:
cities = (
    train_df["city"]
    .value_counts()[train_df["city"].value_counts() >= city_threshold]
    .index.tolist()
)
cities


[1m[[0m
    [32m'Birmingham'[0m,
    [32m'San Antonio'[0m,
    [32m'Utica'[0m,
    [32m'Phoenix'[0m,
    [32m'Meridian'[0m,
    [32m'Thomas'[0m,
    [32m'Conway'[0m,
    [32m'Cleveland'[0m,
    [32m'Warren'[0m
[1m][0m

In [16]:
top_cities = catalog.load("params:top_categories.cities")

assert cities == top_cities

### Update `city` column in combined raw data

In [17]:
raw_df["city"] = raw_df["city"].apply(lambda x: "Other" if x not in cities else x)
raw_df["city"].value_counts()


city
Other          [1;36m1788846[0m
Birmingham        [1;36m8040[0m
San Antonio       [1;36m7312[0m
Utica             [1;36m7309[0m
Phoenix           [1;36m7297[0m
Meridian          [1;36m7289[0m
Warren            [1;36m6584[0m
Conway            [1;36m6574[0m
Cleveland         [1;36m6572[0m
Thomas            [1;36m6571[0m
Name: count, dtype: int64

### `state`

In [18]:
state_percentile = 80

In [19]:
train_df["state"].nunique()

[1;36m51[0m

In [20]:
state_threshold = np.percentile(train_df["state"].value_counts(), q=state_percentile)
print(state_threshold)
(train_df["state"].value_counts() >= state_threshold).sum()

31714.0


[1;36m11[0m

In [21]:
states = (
    train_df["state"]
    .value_counts()[train_df["state"].value_counts() >= state_threshold]
    .index.tolist()
)
states

[1m[[0m[32m'TX'[0m, [32m'NY'[0m, [32m'PA'[0m, [32m'CA'[0m, [32m'OH'[0m, [32m'MI'[0m, [32m'IL'[0m, [32m'FL'[0m, [32m'AL'[0m, [32m'MO'[0m, [32m'MN'[0m[1m][0m

In [22]:
top_states = catalog.load("params:top_categories.states")

assert states == top_states

### Update `state` column in combined raw data

In [23]:
raw_df["state"] = raw_df["state"].apply(lambda x: "Other" if x not in states else x)
raw_df["state"].value_counts()


state
Other    [1;36m988741[0m
TX       [1;36m135269[0m
NY       [1;36m119419[0m
PA       [1;36m114173[0m
CA        [1;36m80495[0m
OH        [1;36m66627[0m
MI        [1;36m65825[0m
IL        [1;36m62212[0m
FL        [1;36m60775[0m
AL        [1;36m58521[0m
MO        [1;36m54904[0m
MN        [1;36m45433[0m
Name: count, dtype: int64

## Undersampling + Split Data into Train and Validation

In [24]:
raw_df = raw_df.drop(["trans_date_trans_time", "dob"], axis=1)

train_df = (
    raw_df[raw_df["dataset"] == "train"].drop("dataset", axis=1).reset_index(drop=True)
)
test_df = (
    raw_df[raw_df["dataset"] == "test"].drop("dataset", axis=1).reset_index(drop=True)
)

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

Train shape: (1296675, 8)
Test shape: (555719, 8)


In [25]:
train_df["is_fraud"].value_counts(normalize=True)


is_fraud
[1;36m0[0m    [1;36m0.994211[0m
[1;36m1[0m    [1;36m0.005789[0m
Name: proportion, dtype: float64

In [26]:
target_col = catalog.load("params:target_col")

X = train_df.drop(target_col, axis=1)
y = train_df[target_col]

In [27]:
# Calculate the desired number of fraud cases based on the desired proportion
undersampling_params = catalog.load("params:undersampling")
desired_proportion = undersampling_params["desired_proportion"]
total_samples = undersampling_params["total_samples"]

fraud_samples = int(total_samples * desired_proportion)
print(f"Total number of fraud samples: {fraud_samples}")

Total number of fraud samples: 7000


In [28]:
parameters = catalog.load("parameters")
random_state = parameters["random_state"]


# Create RandomUnderSampler with the desired sampling strategy
rus = RandomUnderSampler(sampling_strategy={0: total_samples - fraud_samples, 1: fraud_samples}, random_state=random_state)

# Apply random undersampling to the original dataset
X_resampled, y_resampled = rus.fit_resample(X, y)

print(f"X_resampled.shape: {X_resampled.shape}")
print(f"y_resampled.shape: {y_resampled.shape}")

X_resampled.shape: (35000, 7)
y_resampled.shape: (35000,)


In [29]:
test_size = catalog.load("params:test_size")
print(test_size)

X_train, X_val, y_train, y_val = train_test_split(
    X_resampled, y_resampled, stratify=y_resampled, test_size=test_size
)

print(f"X_train.shape: {X_train.shape}")
print(f"X_val.shape: {X_val.shape}")
print(f"y_train.shape: {y_train.shape}")
print(f"y_val.shape: {y_val.shape}")

0.2
X_train.shape: (28000, 7)
X_val.shape: (7000, 7)
y_train.shape: (28000,)
y_val.shape: (7000,)


In [30]:
X_train["is_fraud"] = y_train
X_val["is_fraud"] = y_val

In [31]:
X_train["is_fraud"].value_counts(normalize=True)


is_fraud
[1;36m0[0m    [1;36m0.8[0m
[1;36m1[0m    [1;36m0.2[0m
Name: proportion, dtype: float64

In [32]:
X_val["is_fraud"].value_counts(normalize=True)


is_fraud
[1;36m0[0m    [1;36m0.8[0m
[1;36m1[0m    [1;36m0.2[0m
Name: proportion, dtype: float64

## Scale and Transform

### Numeric Features

In [33]:
numeric_features = catalog.load("params:preprocess_features.numeric_features")
numeric_features

[1m{[0m[32m'standard'[0m: [1m[[0m[32m'amt'[0m[1m][0m, [32m'minmax'[0m: [1m[[0m[32m'age'[0m, [32m'hour'[0m, [32m'dayofweek'[0m[1m][0m[1m}[0m

In [34]:
standard_features = numeric_features["standard"]
minmax_features = numeric_features["minmax"]

In [35]:
standard_transformer = Pipeline(steps=[("scaler", StandardScaler())], verbose=True)
minmax_transformer = Pipeline(steps=[("minmax", MinMaxScaler())], verbose=True)

### Categorical Features

In [36]:
categorical_features = catalog.load("params:preprocess_features.categorical_features")
categorical_features

[1m{[0m[32m'onehot'[0m: [1m[[0m[32m'category'[0m, [32m'city'[0m, [32m'state'[0m[1m][0m[1m}[0m

In [37]:
onehot_features = categorical_features["onehot"]

In [38]:
onehot_transformer = Pipeline(steps=[("onehot", OneHotEncoder(sparse_output=False,handle_unknown="ignore"))], verbose=True)

In [39]:
X_train, y_train = X_train.drop(target_col, axis=1), X_train[target_col]
X_val, y_val = X_val.drop(target_col, axis=1), X_val[target_col]

In [40]:
X_train

Unnamed: 0,category,amt,city,state,dayofweek,hour,age
287032,misc_net,665.68,Other,Other,0,23,93
145004,misc_pos,22.79,Other,CA,1,5,60
542042,misc_net,719.30,Other,Other,0,23,81
552188,health_fitness,45.72,Other,TX,5,12,35
204248,health_fitness,61.79,Other,MI,6,19,45
...,...,...,...,...,...,...,...
1256269,shopping_pos,4.79,Other,Other,6,7,94
782694,personal_care,10.51,Other,Other,5,20,32
720112,entertainment,212.86,Other,AL,6,17,47
1170126,home,42.01,Other,Other,6,19,86


### `ColumnTransformer`

In [41]:
preprocessor = ColumnTransformer(
    transformers=[
        ("numeric_standard", standard_transformer, standard_features),
        ("numeric_minmax", minmax_transformer, minmax_features),
        ("categorical_onehot", onehot_transformer, onehot_features),
    ]
)

In [42]:
cat_feats = []
for feat in onehot_features:
    c = X_train[onehot_features][feat].unique().tolist()
    cat_feats.extend(c)

print(len(cat_feats))    
cat_feats

36



[1m[[0m
    [32m'misc_net'[0m,
    [32m'misc_pos'[0m,
    [32m'health_fitness'[0m,
    [32m'food_dining'[0m,
    [32m'home'[0m,
    [32m'gas_transport'[0m,
    [32m'entertainment'[0m,
    [32m'shopping_net'[0m,
    [32m'kids_pets'[0m,
    [32m'grocery_net'[0m,
    [32m'shopping_pos'[0m,
    [32m'grocery_pos'[0m,
    [32m'travel'[0m,
    [32m'personal_care'[0m,
    [32m'Other'[0m,
    [32m'Phoenix'[0m,
    [32m'Conway'[0m,
    [32m'Birmingham'[0m,
    [32m'Cleveland'[0m,
    [32m'San Antonio'[0m,
    [32m'Warren'[0m,
    [32m'Utica'[0m,
    [32m'Thomas'[0m,
    [32m'Meridian'[0m,
    [32m'Other'[0m,
    [32m'CA'[0m,
    [32m'TX'[0m,
    [32m'MI'[0m,
    [32m'IL'[0m,
    [32m'AL'[0m,
    [32m'PA'[0m,
    [32m'FL'[0m,
    [32m'OH'[0m,
    [32m'NY'[0m,
    [32m'MO'[0m,
    [32m'MN'[0m
[1m][0m

In [43]:
processed_train_df = preprocessor.fit_transform(X_train)
processed_train_df = pd.DataFrame(processed_train_df, columns=[*standard_features, *minmax_features, *cat_feats])
processed_train_df["is_fraud"] = y_train.to_list()
processed_train_df

[Pipeline] ............ (step 1 of 1) Processing scaler, total=   0.0s
[Pipeline] ............ (step 1 of 1) Processing minmax, total=   0.0s
[Pipeline] ............ (step 1 of 1) Processing onehot, total=   0.0s


Unnamed: 0,amt,age,hour,dayofweek,misc_net,misc_pos,health_fitness,food_dining,home,gas_transport,...,MI,IL,AL,PA,FL,OH,NY,MO,MN,is_fraud
0,1.760416,0.963415,1.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
1,-0.476746,0.560976,0.217391,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,1.947006,0.817073,1.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
3,-0.396953,0.256098,0.521739,0.833333,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
4,-0.341032,0.378049,0.826087,1.000000,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27995,-0.539383,0.975610,0.304348,1.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
27996,-0.519479,0.219512,0.869565,0.833333,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
27997,0.184669,0.402439,0.739130,1.000000,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
27998,-0.409863,0.878049,0.826087,1.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0


In [44]:
processed_val_df = preprocessor.transform(X_val)
processed_val_df = pd.DataFrame(processed_val_df, columns=[*standard_features, *minmax_features, *cat_feats])
processed_val_df["is_fraud"] = y_val.to_list()
processed_val_df

Unnamed: 0,amt,age,hour,dayofweek,misc_net,misc_pos,health_fitness,food_dining,home,gas_transport,...,MI,IL,AL,PA,FL,OH,NY,MO,MN,is_fraud
0,0.446250,0.085366,0.347826,0.166667,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
1,-0.466828,0.390244,0.130435,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
2,2.180504,0.560976,1.000000,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,-0.549579,0.853659,0.130435,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
4,0.120049,0.219512,0.173913,0.166667,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6995,-0.292001,0.060976,0.565217,1.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0
6996,2.034942,0.341463,1.000000,0.500000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1
6997,-0.421730,0.060976,0.652174,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0
6998,-0.521636,0.414634,0.043478,1.000000,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
