# Dataset Fraud E-Commerce

### Import Library

In [None]:

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence all library warnings so they do not clutter the cell outputs.
warnings.filterwarnings('ignore')
# matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8*'
# and 3.8 removed the old names, so prefer the new name when it exists and
# fall back to the legacy name on older matplotlib versions.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
import os

In [None]:
# Let pandas print up to 40 columns before truncating wide frames.
pd.set_option('display.max_columns', 40)

### Import Dataset

In [None]:
# data1: customer-level table, data2: transaction-level table.
# NOTE(review): the '/content/...' paths look like a Colab upload location —
# adjust them when running elsewhere.
data1 = pd.read_csv('/content/Customer_DF (1).csv')
data2= pd.read_csv('/content/cust_transaction_details (1).csv')

In [None]:
# Preview the first rows of the customer table.
data1.head()

In [None]:
# Preview the first rows of the transaction table.
data2.head()

In [None]:
# (rows, columns) of the customer table and of the transaction table.
data1.shape ,data2.shape 

In [None]:
# Column dtypes and non-null counts of the customer table.
data1.info()

In [None]:
# Column dtypes and non-null counts of the transaction table.
data2.info()

## Exploratory Data Analysis (EDA)

### Data Cleaning

In [None]:
# True if at least one customer row is an exact duplicate of an earlier row.
data1.duplicated().any()

In [None]:
# True if at least one transaction row is an exact duplicate of an earlier row.
data2.duplicated().any()

### Check Missing Value

In [None]:
# Per column: does the customer table contain any missing value?
data1.isna().any()

In [None]:
# Per column: does the transaction table contain any missing value?
data2.isna().any()

In [None]:
# Compare the number of distinct customer e-mails in data1 and in data2.
data1['customerEmail'].nunique() , data2['customerEmail'].nunique()

## Data Visualization

### Which payment method is used the most?

In [None]:
# Bar chart of the number of transactions per payment method type.
sns.countplot(data=data2, x='paymentMethodType')
ax = plt.gca()

# Label every bar with its count, centred horizontally on top of the bar.
for bar in ax.patches:
    bar_top = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width()/2.
    ax.text(bar_centre, bar_top, str(int(bar_top)),
            fontsize=12, ha='center', va='bottom')
plt.show()


### Result :
- The most used payment method is card
- The other payment methods are used at roughly the same rate

### Which order state appears most often for each payment method?

In [None]:
# Number of transactions per payment method type, split by order state.
sns.countplot(x='paymentMethodType',hue='orderState',data = data2)
ax = plt.gca()

# Label every bar with its count. A hue level that never occurs for a
# category can yield a bar whose height is NaN; int(NaN) would raise, so NaN
# is mapped to 0 first (same approach as the other annotated plots here).
for p in ax.patches:
    height = np.nan_to_num(p.get_height(), nan=0.0)
    ax.text(p.get_x() + p.get_width()/2., height, '%d' % height,
            fontsize=12, ha='center', va='bottom')
plt.show()

### Which payment providers have the most registration failures?

In [None]:
# Transactions per payment provider, split by whether registering the payment
# method failed.
plt.figure(figsize=(15,5))
sns.countplot(x='paymentMethodProvider',hue = 'paymentMethodRegistrationFailure',data=data2)
# NOTE(review): the labels assume the first hue level means "no failure" —
# confirm against the column's values.
plt.legend (['Success','Fail'])
ax = plt.gca()

# Add the count above every bar.
for p in ax.patches:
    # The second positional argument of np.nan_to_num is `copy`, not the
    # replacement value, so the replacement is passed by keyword.
    height = np.nan_to_num(p.get_height(), nan=0.0)
    # Centre the label on the bar whatever its width (instead of a fixed +0.2).
    ax.text(p.get_x() + p.get_width()/2., p.get_y() + height, '%d' % height,
            fontsize=12, ha='center', va='bottom')
plt.show()

### Total number of payments per Fraud Transactions

In [None]:
# Number of customers per value of No_Payments, split by the Fraud label.
plt.figure(figsize=(15,5))
sns.countplot(x = 'No_Payments',hue = 'Fraud',data=data1)
ax = plt.gca()

# Add the count above every bar.
for p in ax.patches:
    # The second positional argument of np.nan_to_num is `copy`, not the
    # replacement value, so the replacement is passed by keyword.
    height = np.nan_to_num(p.get_height(), nan=0.0)
    # Centre the label on the bar whatever its width (instead of a fixed +0.2).
    ax.text(p.get_x() + p.get_width()/2., p.get_y() + height, '%d' % height,
            fontsize=12, ha='center', va='bottom')
plt.show()

### Result
If the number of payments is greater than 4, the customer is Fraud.

## Distribution of Transaction Amount

In [None]:
# Distribution plot of the individual transaction amounts.
sns.displot(data2['transactionAmount'])

In [None]:
# Transactions whose amount is above 100.
data2[data2['transactionAmount'] > 100]

In [None]:
# Fraud label of one specific customer, looked up by e-mail address.
data1[data1['customerEmail']=='uguzman@yahoo.com']['Fraud']

### Result
There is 1 transaction above 100 USD, and the customer who made it is labelled Fraud.

## Comparison of fraud and non-fraud transactions

In [None]:
# Number of customers per Fraud label in the raw customer table.
sns.countplot(x= 'Fraud',data=data1)
ax = plt.gca()

# Add the count above every bar.
for p in ax.patches:
    # The second positional argument of np.nan_to_num is `copy`, not the
    # replacement value, so the replacement is passed by keyword.
    height = np.nan_to_num(p.get_height(), nan=0.0)
    # Centre the label on the bar whatever its width (instead of a fixed +0.4).
    ax.text(p.get_x() + p.get_width()/2., p.get_y() + height, '%d' % height,
            fontsize=15, ha='center', va='bottom')
plt.show()

## Feature Engineering

We keep the customers of data1 that also appear in data2 and enrich them with per-customer aggregates of data2, because we want to see patterns for fraud.

In [None]:
# Keep only the customers whose e-mail occurs in the transaction table and
# renumber the remaining rows from 0.
has_transactions = data1['customerEmail'].isin(data2['customerEmail'])
final = data1.loc[has_transactions].reset_index(drop=True)
final.shape

In [None]:
# Preview the filtered customer table.
final.head()

We drop the `Unnamed: 0` column, which has no effect on the analysis.

In [None]:
# Remove the 'Unnamed: 0' column from the working table.
final = final.drop(columns=['Unnamed: 0'])

In [None]:
# Total of `transactionFailed` over each customer's transactions.
# Aggregating once per e-mail and mapping the result onto `final` replaces the
# nested loops over hard-coded row counts (143 x 623), so the cell keeps
# working when either table changes size.
_failed_per_customer = data2.groupby('customerEmail')['transactionFailed'].sum()
# Customers without a matching transaction get 0, as in the original loop.
No_transactionsFail = final['customerEmail'].map(_failed_per_customer).fillna(0)
final['No_transactionsFail'] = No_transactionsFail

In [None]:
# Sum of `transactionAmount` over each customer's transactions.
# Aggregating once per e-mail and mapping the result onto `final` replaces the
# nested loops over hard-coded row counts (143 x 623), so the cell keeps
# working when either table changes size.
_amount_per_customer = data2.groupby('customerEmail')['transactionAmount'].sum()
# Customers without a matching transaction get 0, as in the original loop.
Total_transaction_amt = final['customerEmail'].map(_amount_per_customer).fillna(0)
final['Total_transaction_amt'] = Total_transaction_amt

In [None]:
# Total of `paymentMethodRegistrationFailure` over each customer's
# transactions. Aggregating once per e-mail and mapping the result onto
# `final` replaces the nested loops over hard-coded row counts (143 x 623).
_reg_fail_per_customer = (
    data2.groupby('customerEmail')['paymentMethodRegistrationFailure'].sum()
)
# Customers without a matching transaction get 0, as in the original loop.
paymentRegFailure = final['customerEmail'].map(_reg_fail_per_customer).fillna(0)
final['PaymentRegFail'] = paymentRegFailure

In [None]:
def col_make(column_name,category):
    """Count, for every customer in ``final``, the transactions of one category.

    Reads the module-level frames ``final`` (one row per customer) and
    ``data2`` (one row per transaction) and matches them on
    ``customerEmail``.

    Parameters
    ----------
    column_name : str
        Column of ``data2`` to inspect.
    category : object
        Value of ``data2[column_name]`` to count.

    Returns
    -------
    list of int
        One count per row of ``final``, in row order; 0 for customers with
        no matching transaction.
    """
    # Count matching transactions once per e-mail instead of scanning every
    # (customer, transaction) pair over hard-coded row counts (143 x 623).
    matching_emails = data2.loc[data2[column_name] == category, 'customerEmail']
    per_customer = matching_emails.value_counts()
    counts = final['customerEmail'].map(per_customer).fillna(0).astype(int)
    return counts.tolist()

In [None]:
# One count column per payment method type: new column name -> value of
# `paymentMethodType` counted per customer by col_make.
payment_type_columns = {
    'PaypalPayments': 'paypal',
    'ApplePayments': 'apple pay',
    'CardPayments': 'card',
    'BitcoinPayments': 'bitcoin',
}
for feature_name, method_type in payment_type_columns.items():
    final[feature_name] = col_make('paymentMethodType', method_type)

In [None]:
# One count column per order state: new column name -> value of `orderState`
# counted per customer by col_make.
order_state_columns = {
    'OrdersFulfilled': 'fulfilled',
    'OrdersPending': 'pending',
    'OrdersFailed': 'failed',
}
for feature_name, state in order_state_columns.items():
    final[feature_name] = col_make('orderState', state)

In [None]:
# One count column per payment provider: new column name -> value of
# `paymentMethodProvider` counted per customer by col_make.
provider_columns = {
    'JCB_16': 'JCB 16 digit',
    'AmericanExp': 'American Express',
    'VISA_16': 'VISA 16 digit',
    'Discover': 'Discover',
    'Voyager': 'Voyager',
    'VISA_13': 'VISA 13 digit',
    'Maestro': 'Maestro',
    'Mastercard': 'Mastercard',
    'DC_CB': 'Diners Club / Carte Blanche',
    'JCB_15': 'JCB 15 digit',
}
for feature_name, provider in provider_columns.items():
    final[feature_name] = col_make('paymentMethodProvider', provider)

In [None]:
# Per customer: number of transactions that are flagged as failed although
# the order state is 'fulfilled'.
# A single vectorised filter + count replaces the nested loops over
# hard-coded row counts (143 x 623).
_failed_but_fulfilled = data2.loc[
    (data2['orderState'] == 'fulfilled') & (data2['transactionFailed'] == 1),
    'customerEmail',
].value_counts()
# Customers with no such transaction get 0, as in the original loop.
Trns_fail_order_fulfilled = (
    final['customerEmail'].map(_failed_but_fulfilled).fillna(0).astype(int)
)
final['Trns_fail_order_fulfilled'] = Trns_fail_order_fulfilled

In [None]:
# Preview the customer table with the engineered columns added so far.
final.head()

In [None]:
# Per customer: how many OTHER customers in `final` use the same IP address.
# value_counts + map replaces the quadratic loop over a hard-coded 143 rows.
_ip_counts = final['customerIPAddress'].value_counts()
# Subtract 1 so a customer does not count itself. A missing IP address
# matches nobody and is reported as 0 (the original loop produced -1 there).
Duplicate_IP = (
    final['customerIPAddress'].map(_ip_counts).sub(1).fillna(0).astype(int)
)
final['Duplicate_IP'] = Duplicate_IP

In [None]:
# Per customer: how many OTHER customers in `final` use the same billing
# address. value_counts + map replaces the quadratic loop over a hard-coded
# 143 rows.
_address_counts = final['customerBillingAddress'].value_counts()
# Subtract 1 so a customer does not count itself. A missing address matches
# nobody and is reported as 0 (the original loop produced -1 there).
Duplicate_Address = (
    final['customerBillingAddress'].map(_address_counts).sub(1).fillna(0).astype(int)
)
final['Duplicate_Address']=Duplicate_Address

In [None]:
# Dummy-encode the Fraud label; drop_first=True drops the first level's
# indicator column.
final = pd.get_dummies(final,columns=['Fraud'],prefix=['Fraud'],drop_first=True)
# Restore the plain column name.
# NOTE(review): this only has an effect if the generated column is called
# "Fraud_True" — confirm the label values are True/False.
final = final.rename(columns={"Fraud_True": "Fraud"})
final.head()

In [None]:
# Number of customers per Fraud label in the filtered table.
sns.countplot(x= 'Fraud',data=final)
ax = plt.gca()

# Add the count above every bar.
for p in ax.patches:
    # The second positional argument of np.nan_to_num is `copy`, not the
    # replacement value, so the replacement is passed by keyword.
    height = np.nan_to_num(p.get_height(), nan=0.0)
    # Centre the label on the bar whatever its width (instead of a fixed +0.4).
    ax.text(p.get_x() + p.get_width()/2., p.get_y() + height, '%d' % height,
            fontsize=15, ha='center', va='bottom')
plt.show()

In [None]:
# Customers that share their IP address with at least one other customer.
final[final['Duplicate_IP']>0]

There are 4 customers who share the same IP address. Different customers should not normally have the same IP address, so these records are considered fraudulent.

In [None]:
# Customers that share their billing address with at least one other customer.
final[final['Duplicate_Address']>0]

There are 3 records from different customers that share the same customerBillingAddress; they may have been created by the same person.

In [None]:
# Customers per number of fulfilled orders, split by the Fraud label.
sns.countplot(data=final, x='OrdersFulfilled', hue='Fraud')
plt.show()

We can see that when the number of fulfilled orders is greater than 8, the customer is always "Fraud".

In [None]:
# Correlation of every numeric/boolean column with the Fraud label.
# `final` still holds text columns (e-mail, phone, address, ...); pandas >= 2.0
# raises on those instead of silently skipping them, so the numeric and
# boolean columns are selected explicitly.
corr = final.select_dtypes(include=['number', 'bool']).corr()
# Pick the Fraud row by name rather than by position so the plot does not
# depend on the column order.
corr2 = corr.loc[['Fraud']]

plt.figure(figsize=(20, 5))
# The former manual set_ylim adjustment was a workaround for a heatmap
# cropping bug in matplotlib 3.1.1; on fixed versions it only distorts the
# axes, so it is no longer applied.
ax = sns.heatmap(corr2, annot=True, vmin=-1, vmax=1, center=0)
plt.title('Heatmap Correlation for fraud')
plt.show()

The correlations between fraud and the other variables tend to be weak, possibly because the dataset is so small.

### Create variable target

In [None]:
# Feature matrix: everything except the customer identifier/contact columns
# and the target itself. Target vector: the Fraud column.
non_feature_columns = ['customerEmail', 'customerPhone', 'customerDevice',
                       'customerIPAddress', 'customerBillingAddress', 'Fraud']
x = final.drop(columns=non_feature_columns)
y = final['Fraud']

## Split, Train, Test Data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle.
# NOTE(review): no `stratify` is passed, so the Fraud ratio may differ between
# the train and test parts — consider stratify=y.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
# (rows, features) of the training part.
x_train.shape

In [None]:
# (rows, features) of the test part.
x_test.shape

## Machine Learning

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score,precision_score,recall_score,roc_auc_score
from sklearn.model_selection import GridSearchCV , RandomizedSearchCV

### Logistic Regression

In [None]:
# Logistic regression with the liblinear solver, fitted on the training part.
Lr_model = LogisticRegression(solver='liblinear')
Lr_model.fit(x_train,y_train)

In [None]:
# Predicted class labels for the test part.
y_pred_lr= Lr_model.predict(x_test)

In [None]:
# Per-class precision/recall/F1 and the confusion matrix on the test part.
print(classification_report(y_test, y_pred_lr))
sns.heatmap(confusion_matrix(y_test, y_pred_lr), annot = True)
plt.show()

## KNN

In [None]:
# For k = 1..29, fit a k-nearest-neighbours classifier on the training part
# and record its error rate (1 - accuracy) on the test part.
test_error_rate = []
for n_neighbors in range(1, 30):
    candidate = KNeighborsClassifier(n_neighbors=n_neighbors)
    candidate.fit(x_train, y_train)
    candidate_accuracy = accuracy_score(y_test, candidate.predict(x_test))
    test_error_rate.append(1 - candidate_accuracy)

In [None]:
# Test error rate as a function of k (k = 1..29).
plt.figure(figsize=(12,6))
plt.plot(range(1,30),test_error_rate,label='test_error')
plt.legend()
plt.xlabel('k Value')
plt.ylabel('Error')
plt.show()

In [None]:
# k-nearest-neighbours classifier with k = 13, fitted on the training part
# (fit returns the estimator itself).
knn_model = KNeighborsClassifier(n_neighbors=13).fit(x_train, y_train)

In [None]:
# Predicted class labels for the test part.
y_pred_knn = knn_model.predict(x_test)

In [None]:
# Per-class precision/recall/F1 and the confusion matrix on the test part.
print(classification_report(y_test, y_pred_knn))
sns.heatmap(confusion_matrix(y_test, y_pred_knn), annot = True)

## SVM

In [None]:
# Support vector classifier with default settings: fit on the training part,
# then predict class labels for the test part.
svc = SVC().fit(x_train, y_train)
y_pred_svc = svc.predict(x_test)

In [None]:
# Per-class precision/recall/F1 and the confusion matrix on the test part.
print(classification_report(y_test, y_pred_svc))
sns.heatmap(confusion_matrix(y_test, y_pred_svc), annot = True)

## Random Forest

In [None]:
# Random forest with default settings: fit on the training part, then predict
# class labels for the test part.
model_rf = RandomForestClassifier().fit(x_train, y_train)
y_pred_rd = model_rf.predict(x_test)

In [None]:
# Per-class precision/recall/F1 and the confusion matrix on the test part.
print(classification_report(y_test, y_pred_rd))
sns.heatmap(confusion_matrix(y_test, y_pred_rd), annot = True)

In [None]:
# Side-by-side test-set metrics of the four fitted models, in the order
# random forest, KNN, SVM, logistic regression.
# Note: roc_auc_score receives hard class predictions here, not scores.
predictions = [y_pred_rd, y_pred_knn, y_pred_svc, y_pred_lr]

accuracy = [accuracy_score(y_test, predicted) for predicted in predictions]
precision = [precision_score(y_test, predicted) for predicted in predictions]
recall = [recall_score(y_test, predicted) for predicted in predictions]
roc = [roc_auc_score(y_test, predicted) for predicted in predictions]

metric_columns = {
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'ROC_score': roc,
}
compare = pd.DataFrame(metric_columns, index=['rf', 'knn', "svm", 'lr'])
compare.sort_values('Accuracy', ascending=False)

In [None]:
# Hyper-parameter search spaces: model name -> estimator ('model') and the
# grid handed to the search ('params').
# The logistic-regression entry is a list of sub-grids so that only valid
# solver/penalty combinations are tried; invalid pairs fail to fit and only
# produce NaN scores.
_C_VALUES = [1.0, 2.0, 4.0, 6.0, 8.0, 10.0]

model_params = {
     'svm': {
        'model': SVC(),
        'params' : {
            'C' : _C_VALUES,
            'kernel': ['rbf'],
            'gamma' : ['scale', 'auto']
        }
    },
    'knn': {
        'model' : KNeighborsClassifier(),
        'params' : {
            'n_neighbors' : list(range(1, 50)),
            'weights' : ['uniform', 'distance'],
            'algorithm' : ['auto', 'ball_tree', 'kd_tree', 'brute'],
            'n_jobs' : [-1]
        }
    },
    'random_forest': {
        'model': RandomForestClassifier(),
        'params' : {
            'n_estimators': list(range(1, 100, 5)),
            'criterion' : ['gini', 'entropy'],
            # 'auto' was an alias of 'sqrt' for classifiers and was removed in
            # scikit-learn 1.3, so it is not listed.
            'max_features' : ['sqrt', 'log2'],
            'n_jobs' : [-1]
        }
    },
    'logistic_regression' : {
        'model': LogisticRegression(),
        # `multi_class` is deprecated in recent scikit-learn and not needed
        # for a single Fraud target column, so it is not part of the grid.
        'params': [
            # Solvers that support both the l1 and the l2 penalty.
            {
                'C': _C_VALUES,
                'penalty' : ['l1', 'l2'],
                'solver' : ['liblinear', 'saga'],
            },
            # Solvers that only support the l2 penalty.
            {
                'C': _C_VALUES,
                'penalty' : ['l2'],
                'solver' : ['newton-cg', 'lbfgs', 'sag'],
            },
            # elasticnet is only available with saga and needs an l1_ratio.
            {
                'C': _C_VALUES,
                'penalty' : ['elasticnet'],
                'solver' : ['saga'],
                'l1_ratio' : [0.5],
            },
        ]
    }
}

## GridSearchCV

In [None]:
%%time
# Exhaustive 5-fold cross-validated search over every entry of model_params,
# fitted on the training part; keep the best score and parameters per model.
scores = []

for model_name, mp in model_params.items():
    clf =  GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    clf.fit(x_train, y_train)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })
    
# One row per model with its best cross-validation result.
data = pd.DataFrame(scores,columns=['model','best_score','best_params'])
data

## RandomizedSearchCV

In [None]:
%%time
# Randomised 5-fold cross-validated search over every entry of model_params,
# fitted on the training part; keep the best score and parameters per model.
scores = []

for model_name, mp in model_params.items():
    clf_random =  RandomizedSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    clf_random.fit(x_train, y_train)
    scores.append({
        'model': model_name,
        'best_score': clf_random.best_score_,
        'best_params': clf_random.best_params_
    })

# Stored under its own name: the result used to be assigned to `data1`, which
# overwrote the customer dataset loaded at the top of the notebook.
random_search_results = pd.DataFrame(scores,columns=['model','best_score','best_params'])
random_search_results

In [None]:
# Best grid-search parameters of the third model in model_params
# (the 'random_forest' entry).
data['best_params'][2]

In [None]:
# Random forest used for the feature-importance analysis.
# max_features='auto' meant 'sqrt' for classifiers and was removed in
# scikit-learn 1.3, so the equivalent explicit value is used.
model = RandomForestClassifier(n_jobs = -1, n_estimators = 16, max_features = 'sqrt', criterion = 'gini')

In [None]:
# Fit the chosen random forest on the training part.
model.fit(x_train, y_train)

In [None]:
# Importance of each feature according to the fitted forest.
importance = model.feature_importances_

In [None]:
# Feature names, in the column order of x ("kolom" = column).
kolom = list(x.columns)
kolom

In [None]:
# Pair every feature name with its importance.
df_imp = pd.DataFrame({'columns' : kolom,
                      'Importance' : importance})

In [None]:
# Display the feature-importance table.
df_imp

In [None]:
# Bar chart of the feature importances, largest first.
df_imp.set_index('columns').sort_values('Importance',ascending=False
                                       ).plot(kind='bar')
plt.show()

In [None]:
# Importance as a percentage rounded to two decimals ("persen" = percent).
df_imp['persen'] = round(df_imp['Importance']*100,2)

In [None]:
# Feature-importance table sorted from most to least important.
df_imp.sort_values('persen',ascending=False)

## Conclusion 

- The two variables with the most influence on whether a customer is fraudulent are Total Transaction Amount and Number of Payments
- The model should improve if it is trained on more data