In [None]:
# Last amended: 2nd August, 2021
# Myfolder: C:\Users\Administrator\OneDrive\Documents\talkingdata
# Ref: https://www.kaggle.com/nanomathias/feature-engineering-importance-testing
#      https://www.kaggle.com/nuhsikander/lgbm-new-features-corrected
#      https://github.com/harnalashok/h2o
#
# Data source: https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection
# Objectives:
#           i)    Categorical feature engineering
#                 using aggregation

t is important that credit card companies are able to recognize fraudulent credit card transactions so that customers are not charged for items that they did not purchase.

Content

The datasets contains transactions made by credit cards in September 2013 by European cardholders.
This dataset presents transactions that occurred in two days, where we have 492 frauds out of 284,807 transactions. The dataset is highly unbalanced, the positive class (frauds) account for 0.172% of all transactions.

It contains only numerical input variables which are the result of a PCA transformation. Unfortunately, due to confidentiality issues, we cannot provide the original features and more background information about the data. Features V1, V2, … V28 are the principal components obtained with PCA, the only features which have not been transformed with PCA are 'Time' and 'Amount'. Feature 'Time' contains the seconds elapsed between each transaction and the first transaction in the dataset. The feature 'Amount' is the transaction Amount, this feature can be used for example-dependant cost-senstive learning. Feature 'Class' is the response variable and it takes value 1 in case of fraud and 0 otherwise.

Please use the any one of the modeling and hyperparameter tuning techniques  to build predictive model.

In [99]:
# Mount your google drive 
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [27]:
# 1.1 Call libraries
# %reset -f
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random, gc
import os,time

In [28]:
# 1.0 Reset memory
#%reset -f
# 1.1 Call libraries

## A. Data manipulation
import numpy as np
import pandas as pd

# 1.2 for data splitting
from sklearn.model_selection import train_test_split

## B. Transformers for predictors:

# 1.3 Class for imputing missing values
# https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html
from sklearn.impute import SimpleImputer

# 1.4 One hot encode categorical data--Convert to dummy
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html
from sklearn.preprocessing import OneHotEncoder as onehot

# 1.5 Scale numeric data
from sklearn.preprocessing import StandardScaler

## C. Transformer for target:

# 1.6 Label encode target column
from sklearn.preprocessing import LabelEncoder


## D. Composite Transformers:

# 1.7 Class for applying multiple data transformation
#     jobs parallely
from sklearn.compose import ColumnTransformer

# 1.8 Pipeline class: Class for applying multiple
#     data transformations sequentially
from sklearn.pipeline import Pipeline

## E. Estimator

# 1.9 Estimator
# Ref: https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
# User guide: https://scikit-learn.org/stable/modules/tree.html
from sklearn.tree import DecisionTreeClassifier 

# 1.10 To plot pipeline diagram
from sklearn import set_config

In [29]:
# 1.2 Display output from multiple commands
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [30]:
# 2.0 Import warnings module
import warnings
# 2.1 Do not print warnings on screen
warnings.filterwarnings("ignore")

In [42]:
# 1.3 Change folder to where TalkingData is:
#os.chdir("C:\\Users\\Administrator\\OneDrive\\Documents\\talkingdata")
path =  "/content/drive/MyDrive/Credit Card Default/"
os.listdir(path)

['creditcard.csv.zip']

In [111]:
dtypes = {
        'Time' :'float64',
        'V1'   :'float64',
        'V2'   :'float64',
        'V3' :'float64',
        'V4' :'float64',
        'V5' :'float64',
        'V6' :'float64',
        'V7' :'float64',
        'V8' :'float64',
        'V9' :'float64',
        'V10' :'float64',
        'V11' :'float64',
        'V12' :'float64',
        'V13' :'float64',
        'V14' :'float64',
        'V15' :'float64',
        'V16' :'float64',
        'V17' :'float64',
        'V18' :'float64',
        'V19' :'float64',
        'V20' :'float64',
        'V21' :'float64',
        'V22' :'float64',
        'V23' :'float64',
        'V24' :'float64',
        'V25' :'float64',
        'V26' :'float64',
        'V27' :'float64',
        'V28' :'float64',
        'Amount' :'float64',
        'Class' :'float64',
        }
Creditcard_data = pd.read_csv(
                     path + "creditcard.csv.zip",  # Not reading test.csv.zip
                     header=0,  # First row is header-row
                                # 'and' operator returns True if both values are True
                                #  random.random() returns values between (0,1)
                                #  No of rows skipped will be around 60% of total
                      dtype=dtypes,
                      # We read all columns. Here are the column-names
                      #  in the sequence they occur in the train data
                      usecols=['Time','V1','V2','V3', 'V4', 'V5', 'V6', 'V7','V8','V9','V10','V11','V12','V13','V14','V15','V16','V17','V18','V19','V20','V21','V22','V23','V24','V25','V26','V27','V28','Amount','Class']
                    )
                   

In [112]:
Creditcard_data.type

AttributeError: ignored

In [113]:
Creditcard_data.head(5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,-0.5516,-0.617801,-0.99139,-0.311169,1.468177,-0.470401,0.207971,0.025791,0.403993,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,1.612727,1.065235,0.489095,-0.143772,0.635558,0.463917,-0.114805,-0.183361,-0.145783,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,0.624501,0.066084,0.717293,-0.165946,2.345865,-2.890083,1.109969,-0.121359,-2.261857,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,-0.226487,0.178228,0.507757,-0.287924,-0.631418,-1.059647,-0.684093,1.965775,-1.232622,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,-0.822843,0.538196,1.345852,-1.11967,0.175121,-0.451449,-0.237033,-0.038195,0.803487,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [114]:
y=Creditcard_data.pop('Class'),
y[:3]
X=Creditcard_data,


(0         0.0
 1         0.0
 2         0.0
 3         0.0
 4         0.0
          ... 
 284802    0.0
 284803    0.0
 284804    0.0
 284805    0.0
 284806    0.0
 Name: Class, Length: 284807, dtype: float64,)

In [107]:
print(X),
print(y),

(            Time        V1        V2  ...       V27       V28  Amount
212772  138991.0 -0.145459  0.430788  ... -0.038694  0.075474   98.64
135452   81260.0 -2.938168  1.233004  ...  0.321127 -0.173288   90.00
168025  119031.0  1.935455 -0.767707  ...  0.058235 -0.041527   23.88
154295  101091.0 -0.495445 -0.030701  ... -0.098285 -0.105182   15.00
16831    28202.0 -2.114447 -2.627596  ... -0.291826 -0.031685  375.76
...          ...       ...       ...  ...       ...       ...     ...
206981  136470.0 -0.406907  0.385534  ... -0.121069 -0.097896   75.00
183552  125853.0  1.917987 -0.710925  ... -0.057051 -0.091184   11.00
14746    25978.0 -0.317579  0.060677  ...  0.163547  0.190453   45.87
239192  149999.0  1.983333  0.066404  ... -0.027162 -0.075199   21.49
115659   73959.0  0.482968  1.814470  ...  0.122343 -0.067833    1.99

[284807 rows x 30 columns],)


(None,)

(212772    0.0
135452    0.0
168025    0.0
154295    0.0
16831     0.0
         ... 
206981    0.0
183552    0.0
14746     0.0
239192    0.0
115659    0.0
Name: Class, Length: 284807, dtype: float64,)


(None,)

In [126]:
X= np.asarray(X)
X= X.reshape(-1,30)
X.shape
y= np.asarray(y)
y=y.reshape(284807)
y.shape


(284807, 30)

(284807,)

In [127]:
X_train,X_test, y_train, y_test = train_test_split(
                                                    X, 
                                                    y,
                                                    test_size = 0.2 
                                                    )

In [128]:
f"X_train shape: {X_train.shape}" 
print()
f"X_test.shape : {X_test.shape}"  
print()
f"y_train shape: {y_train.shape}" 
print()
f"y_test shape : {y_test.shape}" 

'X_train shape: (227845, 30)'




'X_test.shape : (56962, 30)'




'y_train shape: (227845,)'




'y_test shape : (56962,)'

In [129]:
X_train_c=X_train.copy()
X_test_c=X_test.copy()
y_test_c=y_test.copy()
y_train_c=y_train.copy()

In [132]:
ss = StandardScaler()
Xtr= ss.fit_transform(X_train_c)
Xtr.shape

(227845, 30)

In [133]:
from sklearn.tree import DecisionTreeClassifier 
dt = DecisionTreeClassifier()
dt.fit(Xtr,y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [135]:
from sklearn.metrics import accuracy_score
y_pred= dt.predict(X_test_c)
print(accuracy_score( y_test, y_pred))
print(np.mean(y_test==y_pred))

0.9841648818510585
0.9841648818510585
