1. Download and import necessary modules

In [1]:
pip install category-encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting category-encoders
  Downloading category_encoders-2.6.0-py2.py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.2/81.2 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: category-encoders
Successfully installed category-encoders-2.6.0


In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from numpy.linalg import inv
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import category_encoders as ce
from sklearn.metrics import r2_score
from sklearn.svm import LinearSVC, SVC
import random
from sklearn.metrics import accuracy_score
from sklearn.compose import make_column_transformer
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA

2. Load dataset from Kaggle

In [3]:
! pip install -q kaggle

In [4]:
from google.colab import files

# Bind the return value to a name: leaving `files.upload()` as the last bare
# expression makes the notebook display the uploaded file's bytes, which for
# kaggle.json means the API username and key end up in the saved output.
uploaded = files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"<REDACTED>","key":"<REDACTED>"}'}

In [5]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/

In [6]:
# Restrict the copied credential file to owner read/write only (mode 600).
! chmod 600 ~/.kaggle/kaggle.json

In [7]:
# Download the 'ieee-fraud-detection' competition archive with the Kaggle CLI.
! kaggle competitions download -c 'ieee-fraud-detection'

Downloading ieee-fraud-detection.zip to /content
 90% 106M/118M [00:00<00:00, 213MB/s] 
100% 118M/118M [00:00<00:00, 203MB/s]


In [8]:
! unzip ieee-fraud-detection.zip

Archive:  ieee-fraud-detection.zip
  inflating: sample_submission.csv   
  inflating: test_identity.csv       
  inflating: test_transaction.csv    
  inflating: train_identity.csv      
  inflating: train_transaction.csv   


3. Load dataset for data preprocessing

In [2]:
# Read the two raw training tables shipped with the competition archive.
train_transaction = pd.read_csv('train_transaction.csv')
train_identity = pd.read_csv('train_identity.csv')

# Left-join identity onto transactions by TransactionID so that every
# transaction row is kept even when it has no matching identity record.
train = train_transaction.merge(train_identity, how='left', on='TransactionID')

4. Handle the sparse dataset

In [3]:
# The dataset is sparse: keep only the columns with less than 30% missing
# values and drop every column at or above that threshold.
missing_pct = train.isnull().sum() / len(train) * 100
valid_cols = pd.DataFrame(missing_pct < 30)
valid_cols = valid_cols.loc[valid_cols[0]]  # rows (= column names) that passed the check
train = train[valid_cols.index.tolist()]

In [4]:
# Separate the categorical variables. Some of them are stored as numbers, so
# convert all of them to the `category` dtype for the later processing steps.
categorical_cols = [
    'ProductCD',
    'card1', 'card2', 'card3', 'card4', 'card5', 'card6',
    'addr1', 'addr2',
    'P_emaildomain',
    'M6',
]
train = train.astype(dict.fromkeys(categorical_cols, 'category'))

In [5]:
# Fill missing values in numerical columns with each column's median.
# A single frame-level fillna replaces the per-column
# `train[column].fillna(..., inplace=True)` pattern, which is chained in-place
# assignment: it is deprecated and does not modify `train` under pandas
# Copy-on-Write.
# NOTE(review): the medians are computed on the full dataset, i.e. before the
# development/test split performed later in the notebook — confirm this is
# acceptable.
num_cols = train.select_dtypes(include=np.number).columns.tolist()
medians = train[num_cols].median()
train = train.fillna(medians)  # only columns present in `medians` are filled

5. Split dataset to development dataset and test dataset

In [6]:
# Separate the target (isFraud) from the features, then hold out 20% of the
# rows as the test set. The split is stratified on the target and seeded.
train_Y = train['isFraud']
train_X = train.drop(columns='isFraud')
X_dev, X_test, y_dev, y_test = train_test_split(
    train_X,
    train_Y,
    test_size=0.2,
    random_state=42,
    stratify=train_Y,
)

6. Correlation 

In [7]:
# Drop highly correlated features (|r| >= 0.9) from both the development and
# the test set. The correlations are computed on the development set only.
# numeric_only=True makes the exclusion of the categorical columns explicit
# instead of relying on the deprecated implicit default of DataFrame.corr.
corr_matrix = X_dev.corr(numeric_only=True).abs()

# Keep only the strict upper triangle so each pair is inspected once and the
# diagonal (always 1.0) is ignored; a column is dropped when it correlates
# with any column that precedes it.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper_tri = corr_matrix.where(upper_mask)
is_redundant = (upper_tri >= 0.9).any()
to_drop = is_redundant[is_redundant].index.tolist()

X_dev = X_dev.drop(columns=to_drop)
X_test = X_test.drop(columns=to_drop)

  corr_matrix = X_dev.corr().abs()


7. Encoding and StandardScaler

In [8]:
# Preprocessing plan:
#   * numerical features                     -> StandardScaler
#   * low-cardinality categorical features   -> one-hot encoding
#   * high-cardinality categorical features  -> target encoding
high_card_cols = ['card1', 'card2', 'addr1', 'card5', 'card3', 'addr2', 'P_emaildomain']

# NOTE(review): 'TransactionID' is excluded from the scaled numerical features
# but is routed to the target-encoding transformer — confirm that a row
# identifier is really intended to be a model input.
te_cols = ['TransactionID', *high_card_cols]
num_cols = X_dev.select_dtypes(include=np.number).columns.drop(['TransactionID']).tolist()
cat_cols = X_dev.select_dtypes(exclude=np.number).columns.drop(high_card_cols).tolist()
num_features, ohe_features, te_features = num_cols, cat_cols, te_cols

preprocess = make_column_transformer(
    (StandardScaler(), num_features),
    (OneHotEncoder(handle_unknown='ignore'), ohe_features),
    (ce.TargetEncoder(return_df=True), te_features),
)

# Fit on the development set only, then apply the fitted transformer to both sets.
preprocess.fit(X_dev, y_dev)
X_dev = preprocess.transform(X_dev)
X_test = preprocess.transform(X_test)

8. SMOTE

In [9]:
# Oversample the development set with SMOTE (seeded) to counter the class
# imbalance; the test set is left untouched.
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_dev, y_dev)
X_dev, y_dev = resampled

In [10]:
# Shape (rows, columns) of the development dataset after SMOTE resampling
X_dev.shape

(911804, 125)