# Xente Fraud Detection Challenge

Xente is an e-commerce platform and the objective of this competition is to create a machine learning model to detect fraudulent transactions.

### Importing Necessary Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

### Reading the train and test csv files

In [2]:
train = pd.read_csv('training.csv')
test = pd.read_csv('test.csv')

In [3]:
train.FraudResult.value_counts()

0    95469
1      193
Name: FraudResult, dtype: int64

In [None]:
# This shows data is unbalance data, there are only fewer fraudulent transactions amidst 10,000 samples

In [4]:
# Checking the information of the datasets
train.info()
print('-' *50)
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95662 entries, 0 to 95661
Data columns (total 16 columns):
TransactionId           95662 non-null object
BatchId                 95662 non-null object
AccountId               95662 non-null object
SubscriptionId          95662 non-null object
CustomerId              95662 non-null object
CurrencyCode            95662 non-null object
CountryCode             95662 non-null int64
ProviderId              95662 non-null object
ProductId               95662 non-null object
ProductCategory         95662 non-null object
ChannelId               95662 non-null object
Amount                  95662 non-null float64
Value                   95662 non-null int64
TransactionStartTime    95662 non-null object
PricingStrategy         95662 non-null int64
FraudResult             95662 non-null int64
dtypes: float64(1), int64(4), object(11)
memory usage: 11.7+ MB
--------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
Range

### Preprocessing

In [5]:
# Merging the train and test data for easy preprocessing
df = pd.concat([train,test], ignore_index=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140681 entries, 0 to 140680
Data columns (total 16 columns):
AccountId               140681 non-null object
Amount                  140681 non-null float64
BatchId                 140681 non-null object
ChannelId               140681 non-null object
CountryCode             140681 non-null int64
CurrencyCode            140681 non-null object
CustomerId              140681 non-null object
FraudResult             95662 non-null float64
PricingStrategy         140681 non-null int64
ProductCategory         140681 non-null object
ProductId               140681 non-null object
ProviderId              140681 non-null object
SubscriptionId          140681 non-null object
TransactionId           140681 non-null object
TransactionStartTime    140681 non-null object
Value                   140681 non-null int64
dtypes: float64(2), int64(3), object(11)
memory usage: 17.2+ MB


In [6]:
# Checking the first 5 samples in the dataset(df)
df.head()

Unnamed: 0,AccountId,Amount,BatchId,ChannelId,CountryCode,CurrencyCode,CustomerId,FraudResult,PricingStrategy,ProductCategory,ProductId,ProviderId,SubscriptionId,TransactionId,TransactionStartTime,Value
0,AccountId_3957,1000.0,BatchId_36123,ChannelId_3,256,UGX,CustomerId_4406,0.0,2,airtime,ProductId_10,ProviderId_6,SubscriptionId_887,TransactionId_76871,2018-11-15T02:18:49Z,1000
1,AccountId_4841,-20.0,BatchId_15642,ChannelId_2,256,UGX,CustomerId_4406,0.0,2,financial_services,ProductId_6,ProviderId_4,SubscriptionId_3829,TransactionId_73770,2018-11-15T02:19:08Z,20
2,AccountId_4229,500.0,BatchId_53941,ChannelId_3,256,UGX,CustomerId_4683,0.0,2,airtime,ProductId_1,ProviderId_6,SubscriptionId_222,TransactionId_26203,2018-11-15T02:44:21Z,500
3,AccountId_648,20000.0,BatchId_102363,ChannelId_3,256,UGX,CustomerId_988,0.0,2,utility_bill,ProductId_21,ProviderId_1,SubscriptionId_2185,TransactionId_380,2018-11-15T03:32:55Z,21800
4,AccountId_4841,-644.0,BatchId_38780,ChannelId_2,256,UGX,CustomerId_988,0.0,2,financial_services,ProductId_6,ProviderId_4,SubscriptionId_3829,TransactionId_28195,2018-11-15T03:34:21Z,644


In [7]:
df.shape

(140681, 16)

In [8]:
# Columns/fields in df
df.columns

Index(['AccountId', 'Amount', 'BatchId', 'ChannelId', 'CountryCode',
       'CurrencyCode', 'CustomerId', 'FraudResult', 'PricingStrategy',
       'ProductCategory', 'ProductId', 'ProviderId', 'SubscriptionId',
       'TransactionId', 'TransactionStartTime', 'Value'],
      dtype='object')

In [9]:
df.CurrencyCode.unique()

array(['UGX'], dtype=object)

In [None]:
# Since the only unique currency "UGX", it will be dropped

In [10]:
# Drop some columns that are presumed not useful for the model
df = df.drop([ 'BatchId','AccountId', 'ChannelId','ProviderId', 'CustomerId','ProductId','SubscriptionId', 'TransactionId', 'CountryCode',
       'CurrencyCode'], axis=1)

In [11]:
df.head()

Unnamed: 0,Amount,FraudResult,PricingStrategy,ProductCategory,TransactionStartTime,Value
0,1000.0,0.0,2,airtime,2018-11-15T02:18:49Z,1000
1,-20.0,0.0,2,financial_services,2018-11-15T02:19:08Z,20
2,500.0,0.0,2,airtime,2018-11-15T02:44:21Z,500
3,20000.0,0.0,2,utility_bill,2018-11-15T03:32:55Z,21800
4,-644.0,0.0,2,financial_services,2018-11-15T03:34:21Z,644


In [12]:
df.PricingStrategy.unique()

array([2, 4, 1, 0], dtype=int64)

In [13]:
# df.ProviderId=df.ProviderId.str.extract('(\d+)')

In [14]:
df.ProductCategory.unique()

array(['airtime', 'financial_services', 'utility_bill', 'data_bundles',
       'tv', 'transport', 'ticket', 'movies', 'other', 'retail'],
      dtype=object)

### Feature Engineering

New Columns are created to differentiate Credit transactions from debit transactions 

In [15]:
df['CreditTran'] = df["Amount"][df['Amount'] <=0] 

In [16]:
df['DebitTran'] = df["Amount"][df['Amount'] >=0] 

In [17]:
df['CreditTran'].fillna(0, inplace=True)
df['DebitTran'].fillna(0, inplace=True)

In [18]:
df.ProductCategory.unique()

array(['airtime', 'financial_services', 'utility_bill', 'data_bundles',
       'tv', 'transport', 'ticket', 'movies', 'other', 'retail'],
      dtype=object)

In [19]:
df['ProductCategory'][df['ProductCategory']=='retail'] ='other'

In [20]:
df['ProductCategory'][df['ProductCategory']=='financial_services'] =4
df['ProductCategory'][df['ProductCategory']=='airtime'] =3
df['ProductCategory'][df['ProductCategory']=='utility_bill'] =2
df['ProductCategory'][df['ProductCategory']=='transport'] =1
df['ProductCategory'][df['ProductCategory']=='data_bundles'] =0
df['ProductCategory'][df['ProductCategory']=='tv'] =0
df['ProductCategory'][df['ProductCategory']=='ticket'] =0
df['ProductCategory'][df['ProductCategory']=='movies'] =0
df['ProductCategory'][df['ProductCategory']=='other'] =0
df['ProductCategory'].unique()

array([3, 4, 2, 0, 1], dtype=object)

In [21]:
# df = pd.get_dummies(df, columns=['ProductCategory'])

In [22]:
df.drop(['Amount'], axis=1, inplace=True)

In [23]:
 df.head()

Unnamed: 0,FraudResult,PricingStrategy,ProductCategory,TransactionStartTime,Value,CreditTran,DebitTran
0,0.0,2,3,2018-11-15T02:18:49Z,1000,0.0,1000.0
1,0.0,2,4,2018-11-15T02:19:08Z,20,-20.0,0.0
2,0.0,2,3,2018-11-15T02:44:21Z,500,0.0,500.0
3,0.0,2,2,2018-11-15T03:32:55Z,21800,0.0,20000.0
4,0.0,2,4,2018-11-15T03:34:21Z,644,-644.0,0.0


In [24]:
# Converting the date object type to datetime series
df.TransactionStartTime = pd.to_datetime(df.TransactionStartTime)

In [25]:
# Feature engineering to create hour, month and day of the week columns 
df['Hour'] = df['TransactionStartTime'].apply(lambda time: time.hour)
df['Month'] = df['TransactionStartTime'].apply(lambda time: time.month)
df['Day of Week'] = df['TransactionStartTime'].apply(lambda time: time.dayofweek)

In [26]:
df.head()

Unnamed: 0,FraudResult,PricingStrategy,ProductCategory,TransactionStartTime,Value,CreditTran,DebitTran,Hour,Month,Day of Week
0,0.0,2,3,2018-11-15 02:18:49,1000,0.0,1000.0,2,11,3
1,0.0,2,4,2018-11-15 02:19:08,20,-20.0,0.0,2,11,3
2,0.0,2,3,2018-11-15 02:44:21,500,0.0,500.0,2,11,3
3,0.0,2,2,2018-11-15 03:32:55,21800,0.0,20000.0,3,11,3
4,0.0,2,4,2018-11-15 03:34:21,644,-644.0,0.0,3,11,3


In [27]:
# dmap = {0:'Mon',1:'Tue',2:'Wed',3:'Thu',4:'Fri',5:'Sat',6:'Sun'}

In [28]:
# df['Day of Week'] = df['Day of Week'].map(dmap)

In [29]:
# df['Hour'][df['Hour'] 8<='Hour'<=12]df['Hour']=df

In [30]:
# byMonth = df.groupby('Month').count()
# byMonth.head()

In [31]:
df.head()

Unnamed: 0,FraudResult,PricingStrategy,ProductCategory,TransactionStartTime,Value,CreditTran,DebitTran,Hour,Month,Day of Week
0,0.0,2,3,2018-11-15 02:18:49,1000,0.0,1000.0,2,11,3
1,0.0,2,4,2018-11-15 02:19:08,20,-20.0,0.0,2,11,3
2,0.0,2,3,2018-11-15 02:44:21,500,0.0,500.0,2,11,3
3,0.0,2,2,2018-11-15 03:32:55,21800,0.0,20000.0,3,11,3
4,0.0,2,4,2018-11-15 03:34:21,644,-644.0,0.0,3,11,3


In [32]:
df.drop(['TransactionStartTime'], axis=1, inplace=True)

In [33]:
# Splitting df into train and test sets after preproceesing
train2 = df[:95662]
test2 = df[95662:]

In [34]:
test2.drop(['FraudResult'], axis=1, inplace=True)

In [35]:
X = train2.drop(['FraudResult'], axis=1)
y = train2['FraudResult']

In [1]:
# Standardizing, scaling and transformation

In [36]:
from sklearn.preprocessing import StandardScaler
std = StandardScaler()

In [37]:
X = std.fit_transform(X)

### Modeling

In [38]:
from sklearn.model_selection import train_test_split
X_train, X_test,y_train, y_test = train_test_split(X, y, train_size = 0.8, random_state = 3)

In [39]:
from sklearn.linear_model import LogisticRegression

In [40]:
from sklearn.metrics import accuracy_score,precision_score,f1_score,confusion_matrix, classification_report

In [44]:
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier()
rf.fit(X_train,y_train)
pre=rf.predict(X_test)
accuracy=accuracy_score(y_test,pre)
print(accuracy)
print('-'*50)
print(confusion_matrix( y_pred, y_test))
print('\n')
print(classification_report( y_test, pre))

0.9995818742486803
--------------------------------------------------
[[19101    14]
 [    8    10]]


              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     19109
         1.0       0.81      0.88      0.84        24

   micro avg       1.00      1.00      1.00     19133
   macro avg       0.90      0.94      0.92     19133
weighted avg       1.00      1.00      1.00     19133



In [50]:
# gri.best_score_

In [51]:
test2 = std.transform(test2)

In [52]:
predictions = rf.predict(test2)

In [53]:
fraud = pd.DataFrame({'TransactionId':test.TransactionId, 'FraudResult':predictions})

In [54]:
fraud.head()

Unnamed: 0,TransactionId,FraudResult
0,TransactionId_50600,0.0
1,TransactionId_95109,0.0
2,TransactionId_47357,0.0
3,TransactionId_28185,0.0
4,TransactionId_22140,0.0


In [None]:
sss

In [55]:
fraud.to_csv('my_fraud2.csv', index=False)

In [56]:
fraud.FraudResult.value_counts()

0.0    44941
1.0       78
Name: FraudResult, dtype: int64

In [57]:
test.shape

(45019, 15)