In [24]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/ieee-fraud-detection/sample_submission.csv
/kaggle/input/ieee-fraud-detection/test_identity.csv
/kaggle/input/ieee-fraud-detection/train_identity.csv
/kaggle/input/ieee-fraud-detection/test_transaction.csv
/kaggle/input/ieee-fraud-detection/train_transaction.csv


In [2]:
train_identity = pd.read_csv("/kaggle/input/ieee-fraud-detection/train_identity.csv")
test_identity = pd.read_csv("/kaggle/input/ieee-fraud-detection/train_identity.csv")
train_transaction = pd.read_csv("/kaggle/input/ieee-fraud-detection/train_transaction.csv")
test_transaction =  pd.read_csv("/kaggle/input/ieee-fraud-detection/test_transaction.csv")
# Merging the datasets
train_df = pd.merge(train_transaction, train_identity, on='TransactionID', how='left')
test_df = pd.merge(test_transaction, test_identity, on='TransactionID', how='left')

In [3]:
print("The number of records in our training dataset: "+str(train_df.shape[0]))
print("The number of features in our training dataset: "+str(train_df.shape[1]))

The number of records in our training dataset: 590540
The number of features in our training dataset: 434


In [4]:
train_df.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


In [5]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 590540 entries, 0 to 590539
Columns: 434 entries, TransactionID to DeviceInfo
dtypes: float64(399), int64(4), object(31)
memory usage: 1.9+ GB


In [6]:
print(f"The total number of null columns in our training dataset is: {train_df.isnull().any().sum()}")
print(f"The total number of null columns in our test dataset is: {test_df.isnull().any().sum()}")
train_df["id_02"].isnull().sum()

The total number of null columns in our training dataset is: 414
The total number of null columns in our test dataset is: 385


449668

In [7]:
train_identity.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144233 entries, 0 to 144232
Data columns (total 41 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   TransactionID  144233 non-null  int64  
 1   id_01          144233 non-null  float64
 2   id_02          140872 non-null  float64
 3   id_03          66324 non-null   float64
 4   id_04          66324 non-null   float64
 5   id_05          136865 non-null  float64
 6   id_06          136865 non-null  float64
 7   id_07          5155 non-null    float64
 8   id_08          5155 non-null    float64
 9   id_09          74926 non-null   float64
 10  id_10          74926 non-null   float64
 11  id_11          140978 non-null  float64
 12  id_12          144233 non-null  object 
 13  id_13          127320 non-null  float64
 14  id_14          80044 non-null   float64
 15  id_15          140985 non-null  object 
 16  id_16          129340 non-null  object 
 17  id_17          139369 non-nul

In [8]:
many_null_cols_test = [col for col in test_df.columns if test_df[col].isnull().sum() / test_df.shape[0] > 0.9]
many_null_cols = [col for col in train_df.columns if train_df[col].isnull().sum() / train_df.shape[0] > 0.9]
# train_df = train_df.drop(columns = ["id_03", "id_04", "id_07", "id_08", "id_09"])

In [9]:
big_top_value_cols = [col for col in train_df.columns if train_df[col].value_counts(dropna=False, normalize=True).values[0] > 0.9]
big_top_value_cols_test = [col for col in test_df.columns if test_df[col].value_counts(dropna=False, normalize=True).values[0] > 0.9]

In [10]:
one_value_cols = [col for col in train_df.columns if train_df[col].nunique() <= 1]
one_value_cols_test = [col for col in test_df.columns if test_df[col].nunique() <= 1]
one_value_cols == one_value_cols_test

False

In [11]:
print(f'There are {len(one_value_cols)} columns in train dataset with one unique value.')
print(f'There are {len(one_value_cols_test)} columns in test dataset with one unique value.')

There are 0 columns in train dataset with one unique value.
There are 41 columns in test dataset with one unique value.


In [12]:
cols_to_drop = list(set(many_null_cols + many_null_cols_test + big_top_value_cols + big_top_value_cols_test + one_value_cols+ one_value_cols_test))
cols_to_drop.remove('isFraud')
len(cols_to_drop)

112

In [13]:
train_df= train_df.drop(cols_to_drop, axis=1)
test_df = test_df.drop(cols_to_drop, axis=1)

In [14]:
def fillmean(columnfloat, df):
    for col in columnfloat:
        df[col] = df[col].fillna(df[col].mean())

In [15]:
def fillmode(columnobj, df):
    for col in columnobj:
        df[col] = df[col].fillna(df[col].mode()[0])

In [16]:
columns_with_null = train_df.columns[train_df.isnull().any()].tolist()
float_columns = [col for col in columns_with_null if train_df[col].dtype == 'float64']
fillmean(float_columns, train_df)
object_columns = [col for col in columns_with_null if train_df[col].dtype == 'object']
fillmode(object_columns, train_df)

In [17]:
train_df.isnull().sum()

TransactionID     0
isFraud           0
TransactionDT     0
TransactionAmt    0
ProductCD         0
                 ..
V335              0
V336              0
V337              0
V338              0
V339              0
Length: 322, dtype: int64

In [18]:
columns_with_null = train_df.columns[train_df.isnull().any()].tolist()
float_columns = [col for col in columns_with_null if train_df[col].dtype == 'float64']
fillmean(float_columns, test_df)
object_columns = [col for col in columns_with_null if train_df[col].dtype == 'object']
fillmode(object_columns, test_df)

In [19]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()


def categorical_labels(categorical_columns, df):
    #'categorical_columns' is a list of categorical column names
    for col in categorical_columns:
        df[col] = label_encoder.fit_transform(df[col])
categorical_columns_train = [col for col in train_df.columns if train_df[col].dtype == 'object']
categorical_columns_test = [col for col in test_df.columns if test_df[col].dtype == 'object']

categorical_labels(categorical_columns_train, train_df)
categorical_labels(categorical_columns_test, test_df)

In [20]:
Y_train = train_df["isFraud"]
X_train = train_df.drop(columns = ["isFraud"])

In [21]:
from sklearn.model_selection import train_test_split
X_train, X_valid, Y_train, Y_valid = train_test_split(X_train, Y_train, test_size=0.2, random_state=42)

In [None]:
n_estimators = [100, 500, 900, 1100, 1500]
max_depth = [2, 3, 5, 10, 15]
booster = ['gbtree', 'gblinear']
learning_rate = [0.05, 0.1, 0.15, 0.20]
min_child_weight = [1, 2, 3, 4]

hyperparameter_grid = {'n_estimators':n_estimators,
                      'max_depth': max_depth,
                      'learning_rate': learning_rate,
                      'min_child_weight':min_child_weight}

In [22]:
import xgboost 
classifier = xgboost.XGBRegressor(objective='binary:logistic', eval_metric='logloss')
classifier.fit(X_train, Y_train)

In [25]:
pred = classifier.predict(X_train)
log_loss1 = metrics.log_loss(Y_train, pred)
print(f"Log loss for the training set:{log_loss1} ")

Log loss for the training set:0.05929823687884408 


In [27]:
pred = classifier.predict(X_valid)
log_loss2  = metrics.log_loss(Y_valid, pred)
print(f"RMSE for the validation set:{log_loss2} ")

RMSE for the validation set:0.0687196274549187 


In [28]:
y_pred = classifier.predict(test_df)

In [29]:
y_pred = pd.DataFrame(y_pred)
print(y_pred.isnull().sum())
test_id = test_df["TransactionID"]
concat = pd.concat([test_id, y_pred], axis = 1)
concat.columns = ["TransactionID", "isFraud"]


0    0
dtype: int64


In [30]:
concat[['TransactionID', 'isFraud']].to_csv('ieee_fraud.csv', index = False)