In [0]:
# Imports.
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
import numpy as np
import re
import datetime

# Machine Learning.
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier

# Metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score

from sklearn.utils import resample

import warnings


def warn(*args, **kwargs):
    """Accept any arguments and do nothing (stand-in for ``warnings.warn``)."""
    return None


# Swap the module attribute so that calls made through ``warnings.warn`` are
# silently discarded for the rest of the session.
warnings.warn = warn

In [0]:
# Data
# Read the labelled training file and the unlabelled test file, then stack
# them (training rows first) so preprocessing is applied to both at once.
train = pd.read_csv('training.csv')
test = pd.read_csv('test.csv')
train_test_combined = pd.concat([train, test], sort=False)

In [52]:
# Data Shape: (rows, columns) of the training set, the test set and their concatenation.
tuple(frame.shape for frame in (train, test, train_test_combined))

((95662, 16), (45019, 15), (140681, 16))

In [53]:
# Checking for missing data: number of null entries in each column.
train_test_combined.isna().sum()

TransactionId               0
BatchId                     0
AccountId                   0
SubscriptionId              0
CustomerId                  0
CurrencyCode                0
CountryCode                 0
ProviderId                  0
ProductId                   0
ProductCategory             0
ChannelId                   0
Amount                      0
Value                       0
TransactionStartTime        0
PricingStrategy             0
FraudResult             45019
dtype: int64

In [54]:
# Checking dtypes
# info() prints the index range, every column's non-null count and dtype,
# and the frame's memory usage.
train_test_combined.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 140681 entries, 0 to 45018
Data columns (total 16 columns):
TransactionId           140681 non-null object
BatchId                 140681 non-null object
AccountId               140681 non-null object
SubscriptionId          140681 non-null object
CustomerId              140681 non-null object
CurrencyCode            140681 non-null object
CountryCode             140681 non-null int64
ProviderId              140681 non-null object
ProductId               140681 non-null object
ProductCategory         140681 non-null object
ChannelId               140681 non-null object
Amount                  140681 non-null float64
Value                   140681 non-null int64
TransactionStartTime    140681 non-null object
PricingStrategy         140681 non-null int64
FraudResult             95662 non-null float64
dtypes: float64(2), int64(3), object(11)
memory usage: 18.2+ MB


In [0]:
# Removing text from columns and converting them to numerical dtype
def remove_text(string):
    """Return ``string`` with every '<letters>_' run removed.

    Each maximal run of ASCII letters that is immediately followed by an
    underscore is deleted together with that underscore; all other
    characters are kept unchanged.
    """
    return re.sub('[a-zA-Z]+_', '', string)

# Identifier columns that are stored as text. remove_text strips the
# alphabetic '<letters>_' prefix so the remainder can be cast to an integer.
# One loop replaces eight copy-pasted apply/astype pairs, so adding or
# removing a column is a one-line change.
id_columns = [
    'TransactionId',
    'BatchId',
    'AccountId',
    'SubscriptionId',
    'CustomerId',
    'ProviderId',
    'ProductId',
    'ChannelId',
]
for id_column in id_columns:
    train_test_combined[id_column] = (
        train_test_combined[id_column].apply(remove_text).astype('int')
    )

In [0]:
# Handling Date and Time
# Date/time feature extraction is currently disabled. It is kept below as
# comments (rather than as a bare string literal, which is an executed
# statement) for reference only:
#
#   train_test_combined['TransactionStartTime'] = pd.to_datetime(train_test_combined['TransactionStartTime'])
#   train_test_combined['date'] = [d.date() for d in train_test_combined['TransactionStartTime']]
#   train_test_combined['time'] = [d.time() for d in train_test_combined['TransactionStartTime']]

# The raw timestamp column is dropped without being used as a feature.
train_test_combined = train_test_combined.drop(columns=['TransactionStartTime'])

In [57]:
# Checking length is the same: preprocessing so far must not have added or lost rows.
train.shape[0] + test.shape[0] == train_test_combined.shape[0]

True

In [58]:
# Columns removed from the feature set before modelling.
drop = [
    'TransactionId',
    'PricingStrategy',
    'Value',
    'ProductCategory',
    'CountryCode',
    'CurrencyCode',
    'CustomerId',
    'SubscriptionId',
    'AccountId',
    'BatchId',
]
train_test_combined = train_test_combined.drop(drop, axis='columns')
train_test_combined.head()

Unnamed: 0,ProviderId,ProductId,ChannelId,Amount,FraudResult
0,6,10,3,1000.0,0.0
1,4,6,2,-20.0,0.0
2,6,1,3,500.0,0.0
3,1,21,3,20000.0,0.0
4,4,6,2,-644.0,0.0


In [59]:
# Checking for class imbalance
# Rows without a label (NaN) match neither comparison, so they are not counted.
fraud_labels = train_test_combined['FraudResult']
fraud = int((fraud_labels == 1).sum())
not_fraud = int((fraud_labels == 0).sum())
print('Fraud : ', fraud)
print('Not Fraud : ', not_fraud)

Fraud :  193
Not Fraud :  95469


In [60]:
# Splitting processed data into train and test.

# Remove the label from the combined frame, then one-hot encode.
# NOTE(review): every remaining column is numeric at this point, so
# get_dummies appears to leave the frame unchanged (the shapes below still
# show 4 columns) - confirm whether the ID columns were meant to be encoded.
train_test_combined = pd.get_dummies(
    train_test_combined.drop(columns=['FraudResult'])
)

# The combined frame was built as [train rows, test rows], so the first
# len(y) rows are the labelled ones and the remainder are for submission.
y = train['FraudResult']
n_labelled = len(y)
X = train_test_combined.iloc[:n_labelled]
X_sub = train_test_combined.iloc[n_labelled:]

X.shape, X_sub.shape, y.shape

((95662, 4), (45019, 4), (95662,))

In [0]:
from imblearn.over_sampling import SMOTE

# setting up testing and training sets
# Hold out a stratified 5% slice of the original (imbalanced) data; it is
# never passed through SMOTE.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.05, random_state=22, stratify=y
)

# Oversample the minority class on the training portion only.
# `fit_resample` is the supported method name; the older `fit_sample` alias
# has been removed from later imbalanced-learn releases.
sm = SMOTE(random_state=22)
smote_X, smote_y = sm.fit_resample(X_train, y_train)
smote_X = pd.DataFrame(data=smote_X, columns=X_train.columns)
smote_y = pd.DataFrame(data=smote_y, columns=['FraudResult'])

# Build the combined frame from a copy so that adding the label column does
# not also add it to smote_X (plain assignment would alias the same object).
smote_data = smote_X.copy()
smote_data['FraudResult'] = smote_y['FraudResult']

In [62]:
# Rebuild the feature matrix and target from the SMOTE-balanced frame.
# NOTE(review): the split below is carved out of the resampled data, so
# X_test1 contains synthetic rows; scores on it are not comparable to
# scores on the untouched X_test hold-out.
y = smote_data['FraudResult']
X = smote_data.drop(columns=['FraudResult'])

X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X, y, test_size=0.25, random_state=22, stratify=y
)

# Search over the regularisation strength C with 5-fold stratified CV,
# selecting the candidate with the best F1 score.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=22)
grid_parameters = {
    "C": [0.05, 0.5, 1, 5, 30],
    "solver": ['lbfgs'],
}
LR_model = LogisticRegression()

LR_model_GSCV = GridSearchCV(
    LR_model,
    param_grid=grid_parameters,
    scoring='f1',
    cv=kfold,
).fit(X_train1, y_train1)
print('Parameters',LR_model_GSCV.best_params_)

Parameters {'C': 0.05, 'solver': 'lbfgs'}


In [63]:
# Resampled data test: score the tuned model on the split taken from the
# SMOTE-balanced data.
y_pred = LR_model_GSCV.predict(X_test1)
for label, metric in (
    ("Accuracy : ", accuracy_score),
    ("Recall : ", recall_score),
    ("F1 Score : ", f1_score),
):
    print(label, metric(y_test1, y_pred))

Accuracy :  0.9433051071712093
Recall :  0.8888153832583576
F1 Score :  0.9400377824941111


In [64]:
# Unbalanced data test: score the tuned model on the hold-out split that was
# taken before SMOTE was applied.
y_pred = LR_model_GSCV.predict(X_test)
for label, metric in (
    ("Accuracy : ", accuracy_score),
    ("Recall : ", recall_score),
    ("F1 Score : ", f1_score),
):
    print(label, metric(y_test, y_pred))

Accuracy :  0.9966555183946488
Recall :  1.0
F1 Score :  0.5555555555555556


In [0]:
# Pair each test-set TransactionId with the model's prediction for the
# corresponding row of X_sub, then write the result to CSV without the index.
submission_dict = {
    'TransactionId': test['TransactionId'],
    'FraudResult': LR_model_GSCV.predict(X_sub),
}
submission_df = pd.DataFrame(data=submission_dict)
submission_df.to_csv('submission-GridSearch7.csv', index=False)