# Creating DataFrame and Necessary Import Statements

In [1]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pylab as plt
from cycler import cycler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score
from fancyimpute import IterativeImputer
from xgboost import XGBClassifier

In [2]:
!pip install kaggle --upgrade



In [3]:
!echo "{\"username\":\"tombohorig\",\"key\":\"81f7fcd3a57da42eb3f3c20e0417fdfe\"}" > kaggle.json

In [4]:
!sudo mkdir -p ~/.kaggle
!sudo cp /content/kaggle.json ~/.kaggle/kaggle.json

In [5]:
!chmod 600 /root/.kaggle/kaggle.json

In [6]:
!kaggle --version

Kaggle API 1.5.4


In [7]:
!kaggle competitions download -c ieee-fraud-detection

train_transaction.csv.zip: Skipping, found more recently modified local copy (use --force to force download)
test_transaction.csv.zip: Skipping, found more recently modified local copy (use --force to force download)
train_identity.csv.zip: Skipping, found more recently modified local copy (use --force to force download)
sample_submission.csv.zip: Skipping, found more recently modified local copy (use --force to force download)
test_identity.csv.zip: Skipping, found more recently modified local copy (use --force to force download)


In [8]:
np.random.seed(314159)
train_txn = pd.read_csv('/content/train_transaction.csv.zip', compression='zip')
train_id = pd.read_csv('/content/train_identity.csv.zip', compression='zip')

In [9]:
train_txn.shape

(590540, 394)

In [10]:
train_id.shape

(144233, 41)

# Cleaning Data

### Drop columns with null values over a certain threshold

In [76]:
def remove_heavy_null_columns(df, threshold, n_rows):
  cols_over_threshold = []
  for column in df:
    missing_values = df[column].isna().sum()
    if missing_values != 0 and missing_values/n_rows >= threshold:
      cols_over_threshold.append(column)
  df.drop(cols_over_threshold, axis=1, inplace=True)

threshold = 0.5
remove_heavy_null_columns(df=train_txn, threshold=threshold, n_rows=train_txn.shape[0])
remove_heavy_null_columns(df=train_id, threshold=threshold, n_rows=train_txn.shape[0])

### Perform MICE Imputation on remaining Nan Cells
After deciding to use a XGBoost Model and learning that it can deal with Nan Values and learn from them I will not impute these values

In [14]:
# def fill_missing_values(df):
#   # get all column names:
#   print(df.head(20))
#   column_names = df.columns
#   # mice = IterativeImputer()
#   # return pd.DataFrame(mice.fit_transform(df), columns=df.columns)

# # print("BEFORE IMPUTATION:")
# # print(train_id.isna().sum())
# # # print(penguins.isna().sum())

# fill_missing_values(df=train_id)

# # print()
# # print("AFTER IMPUTATION:")
# # print(train_id.isna().sum())
# # # print(penguins.isna().sum())

# Convert columns with String to be One Hot Encoded as converting to numeric in same column will give undesired order precedence

In [12]:
# One hot encodings:
def one_hot_encoding(df, column_name):
  encoding = pd.get_dummies(df[column_name])
  df = df.drop(column_name, axis = 1)
  df = df.join(encoding)  
  return df

Preform Cleaning on train_id

In [77]:
# Transormations:
boolean_found = {'NotFound': 0, 'Found': 1, None: 0}
boolean_new = {'New': 0, 'Found': 1, None: 0}
true_false = {'T': 2, 'F': 1, None: 0}
device_type = {None: 0, 'mobile': 1, 'desktop': 2}
train_id['id_12'].replace(boolean_found, inplace=True)
train_id['id_16'].replace(boolean_found, inplace=True)
train_id['id_27'].replace(boolean_found, inplace=True)
train_id['id_28'].replace(boolean_new, inplace=True)
train_id['id_29'].replace(boolean_found, inplace=True)
train_id['id_35'].replace(true_false, inplace=True)
train_id['id_36'].replace(true_false, inplace=True)
train_id['id_37'].replace(true_false, inplace=True)
train_id['id_38'].replace(true_false, inplace=True)
train_id['DeviceType'].replace(device_type, inplace=True)

# One Hot Encodings:
train_id = one_hot_encoding(df=train_id, column_name='id_15')
train_id = one_hot_encoding(df=train_id, column_name='id_23')
train_id = one_hot_encoding(df=train_id, column_name='id_34')

# I must drop instead of one hot encode b/c they make my df run out of ram:
train_id = train_id.drop('id_30', axis=1) # I am dropping this column b/c it conflicts w/ id_31
# train_id = one_hot_encoding(df=train_id, column_name='id_31')
# train_id = one_hot_encoding(df=train_id, column_name='id_33')
# train_id = one_hot_encoding(df=train_id, column_name='DeviceInfo')
train_id = train_id.drop('id_31', axis=1)
train_id = train_id.drop('id_33', axis=1)
train_id = train_id.drop('DeviceInfo', axis=1)


train_id.head(10)
print(train_id.shape)

(144233, 44)


Preform Cleaning on Transactions

In [78]:
# Transormations:
true_false = {'T': 2, 'F': 1, None: 0}
train_txn['M1'].replace(true_false, inplace=True)
train_txn['M2'].replace(true_false, inplace=True)
train_txn['M3'].replace(true_false, inplace=True)
train_txn['M6'].replace(true_false, inplace=True)

# One Hot Encoding:
train_txn = one_hot_encoding(df=train_txn, column_name='ProductCD')
train_txn = one_hot_encoding(df=train_txn, column_name='card4')
train_txn = one_hot_encoding(df=train_txn, column_name='card6')
train_txn = one_hot_encoding(df=train_txn, column_name='P_emaildomain')

# train_txn['R_emaildomain'] = 'RECIPIENT' + train_txn['R_emaildomain'].astype(str)
# train_txn = one_hot_encoding(df=train_txn, column_name='R_emaildomain')

train_txn['M4'] = 'M4 ' + train_txn['M4'].astype(str)
train_txn = one_hot_encoding(df=train_txn, column_name='M4')

train_txn.head(10)
train_txn.shape

(590540, 291)

# XGBoost Model

### Combine the models together and train/test split


In [79]:
# I've chosen an outer join b/c we previously discusses that it is better to not reduce the rows to the size of train_id but to keep the rows from train_txn:
df = pd.merge(train_txn, train_id, on='TransactionID', how='inner')
df.shape

(144233, 334)

In [80]:
# Train/Test Split:
# split the dataset into train features and target varaible
X = df.drop('isFraud', axis=1)
y = df['isFraud']
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.2)

### Create Model

In [81]:
# Drop V columns even though we said theyre important in class. I just keep running out of RAM.
v_columns = []
for i in range (4,339):
  try:
    df.drop(f'V{i+1}')
  except:
    pass

print(df.shape)

(144233, 334)


In [82]:
# Create model:

xgb_clf = XGBClassifier()
xgb_clf.fit(train_x, train_y)
# Print the accuracy score
print(xgb_clf.score(test_x, test_y))

0.9565639407910701


In [92]:
# Check if better then just guessing no every time:
not_fraud, fraud = df['isFraud'].value_counts()
fraud_propotion = fraud/not_fraud
print(f"The proportion of fraud is {fraud_propotion}.")
print(f"My model is {0.9565639407910701-(1-fraud_propotion)} percent better than guessing yes every time.")

The proportion of fraud is 0.08515216491742843.
My model is 0.041716105708498485 percent better than guessing yes every time.
