In [101]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib
import gc
import time

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier


In [102]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [103]:
# Helper function
def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [104]:
identity = pd.read_csv ('/kaggle/input/ieee-fraud-detection/train_identity.csv')
identity = reduce_mem_usage(identity)
identity.shape

Memory usage of dataframe is 45.12 MB
Memory usage after optimization is: 10.00 MB
Decreased by 77.8%


(144233, 41)

In [107]:
identity.dtypes

TransactionID       int32
id_01             float16
id_02             float32
id_03             float16
id_04             float16
id_05             float16
id_06             float16
id_07             float16
id_08             float16
id_09             float16
id_10             float16
id_11             float16
id_12            category
id_13             float16
id_14             float16
id_15            category
id_16            category
id_17             float16
id_18             float16
id_19             float16
id_20             float16
id_21             float16
id_22             float16
id_23            category
id_24             float16
id_25             float16
id_26             float16
id_27            category
id_28            category
id_29            category
id_30            category
id_31            category
id_32             float16
id_33            category
id_34            category
id_35            category
id_36            category
id_37            category
id_38       

In [99]:
transaction = pd.read_csv('/kaggle/input/ieee-fraud-detection/train_transaction.csv')
transaction = reduce_mem_usage(transaction)
transaction.shape

Memory usage of dataframe is 1775.15 MB
Memory usage after optimization is: 487.16 MB
Decreased by 72.6%


(590540, 394)

In [106]:
transaction.dtypes

TransactionID        int32
isFraud               int8
TransactionDT        int32
TransactionAmt     float16
ProductCD         category
card1                int16
card2              float16
card3              float16
card4             category
card5              float16
card6             category
addr1              float16
addr2              float16
dist1              float16
dist2              float16
P_emaildomain     category
R_emaildomain     category
C1                 float16
C2                 float16
C3                 float16
C4                 float16
C5                 float16
C6                 float16
C7                 float16
C8                 float16
C9                 float16
C10                float16
C11                float16
C12                float16
C13                float16
C14                float16
D1                 float16
D2                 float16
D3                 float16
D4                 float16
D5                 float16
D6                 float16
D

In [86]:
train_merged = transaction.merge(identity, how = 'inner', on = 'TransactionID')
train_merged.shape

(144233, 434)

In [87]:
del identity 
del transaction 
gc.collect()

0

In [88]:
(train_merged.isFraud.value_counts()/len(train_merged)).to_frame()

Unnamed: 0_level_0,count
isFraud,Unnamed: 1_level_1
0,0.92153
1,0.07847


In [89]:
null_percentage = (train_merged.isnull().mean()) * 100

# Display the percentage of null values in each column
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print("Percentage of null values in each column:")
    print(null_percentage)

Percentage of null values in each column:
TransactionID       0.000000
isFraud             0.000000
TransactionDT       0.000000
TransactionAmt      0.000000
ProductCD           0.000000
card1               0.000000
card2               0.625377
card3               0.119251
card4               0.127571
card5               0.662816
card6               0.123411
addr1              41.909272
addr2              41.909272
dist1             100.000000
dist2              73.935923
P_emaildomain       9.284283
R_emaildomain       9.117192
C1                  0.000000
C2                  0.000000
C3                  0.000000
C4                  0.000000
C5                  0.000000
C6                  0.000000
C7                  0.000000
C8                  0.000000
C9                  0.000000
C10                 0.000000
C11                 0.000000
C12                 0.000000
C13                 0.000000
C14                 0.000000
D1                  0.151144
D2                 78.426574
D

In [90]:
y_train = train_merged['isFraud']
train = train_merged.drop(columns=['isFraud'])

In [108]:
# filter numerical data
num_features = train.select_dtypes(include=['float16','int16', 'int32'])

# filter categorical data
cat_features = train.select_dtypes(exclude=['category'])

print(num_features)
print(cat_features)

In [117]:
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Create a Pipeline for processing categorical features
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant',)),
    ('encoder', OneHotEncoder(handle_unknown = 'ignore'))
])

# Create a ColumnTransformer to combine both pipelines
preprocessor = ColumnTransformer([
    ('num', num_transformer, num_features.columns),
    ('cat', cat_transformer, cat_features.columns)],remainder='drop'
)

In [118]:
preprocessor.fit(train_merged)
x_train = preprocessor.transform(train_merged)

print('x_train shape:', x_train.shape)
print('y_train shape:', y_train.shape)

x_train shape: (144233, 769302)
y_train shape: (144233,)
  (0, 0)	-1.3940552049075918
  (0, 1)	-1.2647326873692064
  (0, 2)	-0.33604681698727845
  (0, 3)	-1.066246363863179
  (0, 4)	0.7675541205287082
  (0, 5)	-0.6029085292465185
  (0, 6)	-1.9980996347996913
  (0, 7)	1.627122989453093
  (0, 8)	0.18638003779790313
  (0, 10)	-0.10728521314426362
  (0, 11)	-0.11446575021285045
  (0, 12)	-0.0758091101120629
  (0, 13)	-0.11825913110841374
  (0, 15)	-0.11004950400935672
  (0, 16)	-0.08982493543432392
  (0, 17)	-0.10160924484722787
  (0, 19)	-0.1033641558457088
  (0, 20)	-0.10951207597504588
  (0, 21)	-0.0902078943001981
  (0, 22)	-0.11781875646994357
  (0, 23)	-0.1089590401019826
  (0, 24)	-0.30815774980075794
  (0, 25)	-5.583947390789445e-16
  (0, 28)	-2.766419646570402e-16
  (0, 29)	1.4316541474849387e-16
  :	:
  (144232, 650098)	1.0
  (144232, 650476)	1.0
  (144232, 650847)	1.0
  (144232, 748638)	1.0
  (144232, 766555)	1.0
  (144232, 766580)	1.0
  (144232, 766613)	1.0
  (144232, 766674)	1

In [119]:
x_sample, x_valid, y_sample, y_valid = train_test_split(x_train, y_train, 
        test_size=0.8, stratify=y_train, random_state=1)

In [None]:
%%time 

lr_clf = LogisticRegression(max_iter=1000, solver='saga', penalty='elasticnet')
lr_parameters = {
    'l1_ratio':[0, 0.5],
    'C': [0.01, 10 ]
}

lr_grid = GridSearchCV(lr_clf, lr_parameters, cv=5, refit='True', n_jobs=-1, verbose=10, scoring= 'accuracy')
lr_grid.fit(x_train, y_train)
lr_model = lr_grid.best_estimator_

print('Best Parameters:', lr_grid.best_params_)
print('Best CV Score:  ', lr_grid.best_score_)
print('Training Acc:   ', lr_model.score(x_train, y_train))

Fitting 5 folds for each of 4 candidates, totalling 20 fits




In [None]:
%%time 

gbm_clf = GradientBoostingClassifier()
gbm_parameters = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.5],
    'max_depth': [3, 5, 7]
}

gbm_grid = GridSearchCV(gbm_clf, gbm_parameters, cv=5, n_jobs=-1, verbose=10, scoring='accuracy')
gbm_grid.fit(x_train, y_train)
gbm_model = gbm_grid.best_estimator_

print('Best GBM Parameters:', gbm_grid.best_params_)
print('Best GBM CV Score:', gbm_grid.best_score_)
print('Training Acc (GBM):', gbm_model.score(x_train, y_train))

In [None]:
train_new.dtypes
