In [None]:
# Importing required libraries

import numpy as np
import pandas as pd
pd.set_option('display.max_rows',1000)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load the training and test transaction files into separate frames

df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in ('fraudTrain.csv', 'fraudTest.csv')
)
df_train.head()

#### Exploratory Data Analysis

In [None]:
# (rows, columns) of the train frame, then of the test frame
for frame in (df_train, df_test):
    print(frame.shape)

In [None]:
# Column dtypes and non-null counts of the raw training frame
df_train.info()

In [None]:
# Same summary for the raw test frame
df_test.info()

In [None]:
# Dropping Unnamed column.
# Rebinding (instead of inplace=True) together with errors="ignore" keeps this
# cell idempotent: re-running it after the column is already gone no longer
# raises a KeyError.

df_train = df_train.drop(columns="Unnamed: 0", errors="ignore")
df_test = df_test.drop(columns="Unnamed: 0", errors="ignore")
df_train.head()

In [None]:
# Converting date columns to datetime format and deriving a date-only column.
# `.dt.normalize()` truncates each timestamp to midnight while keeping the
# datetime dtype, which yields the same `trans_date` values as formatting to a
# 'YYYY-MM-DD' string and parsing it back, without the slow string round trip.
# The same three steps are applied to both frames, hence the loop.

for frame in (df_train, df_test):
    frame['trans_date_trans_time'] = pd.to_datetime(frame['trans_date_trans_time'])
    frame['trans_date'] = frame['trans_date_trans_time'].dt.normalize()
    frame['dob'] = pd.to_datetime(frame['dob'])

df_train.head()

In [None]:
# Re-check the training frame's dtypes after the datetime conversion
df_train.info()

In [None]:
# Re-check the test frame's dtypes after the datetime conversion
df_test.info()

In [None]:
# Remove the fields that are not needed for model building.
# The originals (df_train / df_test) are left untouched; the reduced copies
# are bound to df_train1 / df_test1.

drop_cols = ['cc_num','merchant','first','last','street','zip','trans_num','unix_time']

df_train1, df_test1 = (frame.drop(columns=drop_cols) for frame in (df_train, df_test))

df_train1.info()

In [None]:
# Checking class imbalance: number of transactions per class, with each class's
# percentage share annotated on the panel.
#
# The counts are reindexed to [0, 1] so the bar order always matches the
# ['Non-Fraud', 'Fraud'] labels rather than relying on value_counts() sorting
# by frequency, and so a class that is absent shows up as 0 instead of raising
# a KeyError. tight_layout() keeps the stacked panels' titles and tick labels
# from overlapping.

classes_train = df_train1['is_fraud'].value_counts().reindex([0, 1], fill_value=0)
normal_share_train = classes_train[0]/df_train1['is_fraud'].count()*100
fraud_share_train = classes_train[1]/df_train1['is_fraud'].count()*100

classes_test = df_test1['is_fraud'].value_counts().reindex([0, 1], fill_value=0)
normal_share_test = classes_test[0]/df_test1['is_fraud'].count()*100
fraud_share_test = classes_test[1]/df_test1['is_fraud'].count()*100

fig, axes = plt.subplots(2, 1)
panels = [
    ('Train', classes_train, normal_share_train, fraud_share_train),
    ('Test', classes_test, normal_share_test, fraud_share_test),
]
for ax, (title, counts, normal_share, fraud_share) in zip(axes, panels):
    ax.bar(['Non-Fraud','Fraud'], counts, color=['y','b'])
    ax.set_title(title)
    ax.set_ylabel('Number of transactions')
    # Positions are fractions of the axes, so they do not depend on bar height
    ax.annotate("{0:.4}%".format(normal_share),(0.2, 0.5), xycoords='axes fraction')
    ax.annotate("{0:.4}%".format(fraud_share),(0.7, 0.5), xycoords='axes fraction')

fig.tight_layout()
plt.show()

In [None]:
# Card holder's age on the transaction date: days between date of birth and
# transaction date, floor-divided by 365.25
for frame in (df_train1, df_test1):
    days_since_birth = (frame['trans_date'] - frame['dob']).dt.days
    frame['age_at_trans'] = days_since_birth // 365.25

df_train1.head()  # not the cell's last expression, so nothing is rendered for it
df_train1.info()
df_test1.info()

In [None]:
# Training frame summary after adding age_at_trans
df_train1.info()

In [None]:
# Test frame summary after adding age_at_trans
df_test1.info()

In [None]:
# Cities with at least one fraudulent transaction in the training data
fraud_city = df_train1.groupby('city')['is_fraud'].sum().to_frame()
fraud_city = fraud_city.loc[fraud_city['is_fraud'] > 0]
fraud_city.index

In [None]:
# States with at least one fraudulent transaction in the training data
fraud_state = df_train1.groupby('state')['is_fraud'].sum().to_frame()
fraud_state = fraud_state.loc[fraud_state['is_fraud'] > 0]
fraud_state.index

In [None]:
# Jobs with at least one fraudulent transaction in the training data
fraud_job = df_train1.groupby('job')['is_fraud'].sum().to_frame()
fraud_job = fraud_job.loc[fraud_job['is_fraud'] > 0]
fraud_job.index

In [None]:
# Offset between customer and merchant location, in degrees of latitude and
# degrees of longitude: the difference is rounded to 3 decimals, then made
# non-negative.

for frame in (df_train1, df_test1):
    frame['lat_dist'] = (frame['merch_lat'] - frame['lat']).round(3).abs()
    frame['long_dist'] = (frame['merch_long'] - frame['long']).round(3).abs()

df_train1[['merch_lat','lat','lat_dist','merch_long','long','long_dist']].head()

In [None]:
# Training frame summary after adding lat_dist / long_dist
df_train1.info()

In [None]:
# Month number of each transaction, taken from the date-only column

for frame in (df_train1, df_test1):
    frame['trans_month'] = pd.DatetimeIndex(frame['trans_date']).month

df_train1[['trans_date_trans_time','trans_month']].head()

In [None]:
# Training frame summary after adding trans_month
df_train1.info()

In [None]:
# Number of distinct values per column in the training frame
df_train1.nunique()

In [None]:
# Number of distinct values per column in the test frame
df_test1.nunique()

#### Checking if there are any highly skewed categorical variables

In [None]:
round(df_train1['category'].value_counts()/len(df_train1.index)*100,2)

In [None]:
round(df_train1['gender'].value_counts()/len(df_train1.index)*100,2)

In [None]:
round(df_train1['city'].value_counts()/len(df_train1.index)*100,2)

In [None]:
round(df_train1['state'].value_counts()/len(df_train1.index)*100,2)

In [None]:
round(df_train1['job'].value_counts()/len(df_train1.index)*100,2)

In [None]:
# Column overview before the second round of column drops
df_train1.info()

In [None]:
# Distinct values per column in the training frame
df_train1.nunique()

In [None]:
# Remove the variables that are not useful for model building.
# Note: this rebinds drop_cols, replacing the list used for the earlier drop.

drop_cols = [
    'trans_date_trans_time', 'city', 'lat', 'long', 'job',
    'dob', 'merch_lat', 'merch_long', 'trans_date',
]

df_train2 = df_train1.drop(columns=drop_cols)
df_test2 = df_test1.drop(columns=drop_cols)

df_train2.head()

In [None]:
# Encode gender as a binary flag: 'M' -> 1, 'F' -> 0
# (Series.map leaves NaN for any value that is not a key of the mapping)

gender_codes = {'M': 1, 'F': 0}
for frame in (df_train2, df_test2):
    frame['gender'] = frame['gender'].map(gender_codes)

df_train2.head()

In [None]:
# Summary statistics of the numeric features, including upper-tail percentiles
numeric_cols = ['amt','city_pop','age_at_trans','lat_dist','long_dist']
df_train2[numeric_cols].describe(percentiles=[.25, .5, .75, .90, .95, .99])

In [None]:
# Creating dummy variables for Category.
# Train defines the encoding (first category level dropped as the baseline).
# Test is encoded with all of its levels and then reindexed to the training
# columns, so both frames always end up with identical columns in identical
# order: a level missing from test becomes an all-zero column, and the baseline
# level (or a level never seen in train) is discarded. Encoding the two frames
# independently with drop_first=True only lines up when both happen to contain
# exactly the same set of categories.

df_train3 = pd.get_dummies(df_train2,columns=['category'],drop_first=True)
df_test3 = (
    pd.get_dummies(df_test2,columns=['category'])
    .reindex(columns=df_train3.columns, fill_value=0)
)

df_train3.info()

#### Building Model

In [None]:
# Remove the state column from both frames before building the model

df_train4, df_test4 = (frame.drop(columns='state') for frame in (df_train3, df_test3))
df_train4.info()

In [None]:
# Separate the feature columns (X) from the is_fraud target (y) for each split

target = 'is_fraud'

X_train, y_train = df_train4.drop(columns=target), df_train4[target]
X_test, y_test = df_test4.drop(columns=target), df_test4[target]

#### Oversampling data to compensate for the high class imbalance in the data

In [None]:
# Using ADASYN to oversample.
# ADASYN draws its synthetic samples at random; fixing random_state makes the
# resampled training set, and every metric computed from it, reproducible
# across Restart & Run All. The value matches the random_state given to the
# random forest.

from collections import Counter

from imblearn.over_sampling import ADASYN

X_train_res, y_train_res = ADASYN(random_state=100).fit_resample(X_train, y_train)

# Class counts after resampling, as sorted (label, count) pairs
print(sorted(Counter(y_train_res).items()))

##### Building a Random Forest Model

In [None]:
# Importing required library

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc, classification_report
from sklearn.model_selection import GridSearchCV


In [None]:
# Random forest configuration; oob_score=True makes rf.oob_score_ available
# after fitting, and random_state fixes the forest's randomness.
rf = RandomForestClassifier(
    n_estimators=10,
    max_depth=5,
    max_features=12,
    min_samples_leaf=100,
    random_state=100,
    oob_score=True,
)

In [None]:
# Fit the forest on the oversampled training data
rf.fit(X_train_res, y_train_res)

In [None]:
# Out-of-bag score recorded during fitting (enabled by oob_score=True)
rf.oob_score_

In [None]:
# Predict probabilities
y_train_prob = rf.predict_proba(X_train_res)[:, 1]


In [None]:
# Compute ROC curve and ROC area
fpr, tpr, _ = roc_curve(y_train_res, y_train_prob)
roc_auc = auc(fpr, tpr)

In [None]:
# Plotting ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Class predictions for the oversampled training rows (first 10 shown).
# Note: y_train_pred is rebound later to predictions for the original X_train.
y_train_pred = rf.predict(X_train_res)
y_train_pred[:10]

In [None]:
# classification_report is already imported with the other sklearn names above;
# this repeated import is redundant but harmless.
from sklearn.metrics import classification_report
# Precision / recall / F1 on the oversampled training set
print(classification_report(y_train_res, y_train_pred))

#### Predicting on test set

In [None]:
# Class predictions for the test rows (first 10 shown)
y_test_pred = rf.predict(X_test)
y_test_pred[:10]

In [None]:
# Precision / recall / F1 on the test set
print(classification_report(y_test, y_test_pred))

#### Testing model on original data without oversampling

In [None]:
# Score the original (non-oversampled) training rows and the test rows.
# This rebinds y_train_pred, which previously held predictions for X_train_res.
y_train_pred, y_test_pred = (rf.predict(features) for features in (X_train, X_test))

# Training report first, then the test report
for truth, predicted in ((y_train, y_train_pred), (y_test, y_test_pred)):
    print(classification_report(truth, predicted))

In [None]:
# Distinct values per column in the full training frame (before any column drops)
df_train.nunique()

In [None]:
# (rows, columns) of the full training frame
df_train.shape

In [None]:
# Keep only the df_train columns that are carried into the merged frame

merge_cols = ['trans_date_trans_time','amt','trans_num','is_fraud']
df_train_merge = df_train[merge_cols]
df_train_merge.head()

In [None]:
# Creating dataframe of y_train_pred

df_train_pred = pd.DataFrame(y_train_pred)
df_train_pred.head()

In [None]:
# Merging the two dataframes

df_train_final = pd.concat([df_train_merge, df_train_pred],axis=1)
df_train_final.head()

In [None]:
# Renaming column

df_train_final= df_train_final.rename(columns={ 0 : 'is_fraud_pred'})
df_train_final.head()

In [None]:
# Number of training rows per actual class
df_train_final.groupby('is_fraud')['is_fraud'].count()

In [None]:
# Number of training rows per predicted class
df_train_final.groupby('is_fraud_pred')['is_fraud_pred'].count()

In [None]:
# Keep only the df_test columns that are carried into the merged frame

merge_cols = ['trans_date_trans_time','amt','trans_num','is_fraud']
df_test_merge = df_test[merge_cols]
df_test_merge.head()

In [None]:
# Creating dataframe of y_test_pred

df_test_pred = pd.DataFrame(y_test_pred)
df_test_pred.head()

In [None]:
 # Merging the two dataframes

df_test_final = pd.concat([df_test_merge, df_test_pred],axis=1)
df_test_final.head()

In [None]:
# Renaming column

df_test_final= df_test_final.rename(columns={ 0 : 'is_fraud_pred'})
df_test_final.head()

In [None]:
# Number of test rows per actual class
df_test_final.groupby('is_fraud')['is_fraud'].count()

In [None]:
# Number of test rows per predicted class
df_test_final.groupby('is_fraud_pred')['is_fraud_pred'].count()

In [None]:
# Merging train and test dataset for cost benefit analysis

df_merge_final = pd.concat([df_train_final, df_test_final],axis=0)
df_merge_final.head()

In [None]:
df_merge_final.shape

### Cost Benefit Analysis

In [None]:
# Month and year of every transaction, taken from its timestamp

trans_stamps = pd.DatetimeIndex(df_merge_final['trans_date_trans_time'])
df_merge_final['month'] = trans_stamps.month
df_merge_final['year'] = trans_stamps.year
df_merge_final.head()

In [None]:
# Group by variable creation

g = df_merge_final.groupby(["year","month"])

In [None]:
# Number of transactions per month

g.trans_num.count()

In [None]:
# Average number of transactions per month

df_merge_final.trans_num.count()/24

In [None]:
# Average Number of fraudulent transactions per month

df_merge_final[df_merge_final['is_fraud']==1].trans_num.count()/24

In [None]:
# Average amount per fraud transaction

df_merge_final[df_merge_final['is_fraud']==1].amt.mean()

In [None]:
# Average number of transactions per month detected as fraud by the model

df_merge_final[df_merge_final['is_fraud_pred']==1].trans_num.count()/24

In [None]:
# Average number of transactions per month that are fraudulent but are not detected by the model

Undetected_frauds = df_merge_final.loc[(df_merge_final.is_fraud_pred==0) & (df_merge_final.is_fraud==1)]
Undetected_frauds.trans_num.count()/24