In [1]:
import pandas as pd

#Load the test transactions and drop the leading column
#(presumably the row index that was saved with the CSV - TODO confirm)
fraud_test = pd.read_csv('../input/fraud-detection/fraudTest.csv')
fraud_test = fraud_test.drop(columns=[fraud_test.columns[0]])

In [2]:
#Rows of the test set whose label marks the transaction as fraud
frauds_t= fraud_test.loc[fraud_test['is_fraud'].eq(1)]

In [3]:
frauds_t.shape                 #This shows us that out of roughly 550k test records only about 2k are fraud

In [4]:
#Load the training transactions and drop the leading column
#(presumably the row index that was saved with the CSV - TODO confirm)
fraud_train = pd.read_csv('../input/fraud-detection/fraudTrain.csv')
fraud_train = fraud_train.drop(columns=[fraud_train.columns[0]])

In [5]:
fraud_train.head()

In [6]:
#Count of training records that are labelled as fraud
print(int((fraud_train['is_fraud']==1).sum()))

In [7]:
#Count of training records that are labelled as legitimate (not fraud)
print(int((fraud_train['is_fraud']==0).sum()))

In [8]:
fraud_train.info()

In [9]:
fraud_test.info()

In [10]:
#From the .info() outputs above we can see that there is no missing data in either file

In [11]:
fraud_train.hist(bins=50, figsize=(10,12))

In [12]:
fraud_train.describe()

In [13]:
#Stacking the train and test files into one frame for exploration and feature engineering.
#The 'is_fraud' label column is kept (nothing is removed here).
#ignore_index=True gives every row a unique index label instead of repeating 0..n-1 from both files.
X_train=pd.concat([fraud_train, fraud_test], ignore_index=True)

In [14]:
#Exploring the fraud data a little more
#Explicit copy: later cells add columns (age, name, hour, ...) to `fraud`, and assigning into a
#boolean-filtered frame makes pandas emit SettingWithCopyWarning.
fraud=X_train[X_train['is_fraud']==1].copy()

In [15]:
fraud.shape

In [16]:
fraud.hist(bins=50,figsize=(10,12))

In [17]:
import matplotlib.pyplot as plt

plt.hist(fraud['gender'])

In [18]:
#Correlation between the numeric columns only: the frame also holds text columns
#(merchant, category, job, ...) and DataFrame.corr() raises on those in pandas >= 2.0.
correlation_matrix=fraud_train.select_dtypes(include='number').corr()

In [19]:
correlation_matrix['is_fraud'].sort_values(ascending=False)

In [20]:
#Creating a copy of the dataset
#.copy() is required: a plain assignment only creates a second name for the same object, so the
#columns added to X_train in place by later cells would show up in the "backup" as well.
X_train_bk=X_train.copy()

In [21]:
X_train['category'].value_counts()                    #As there are only 14 different categories we can apply One-Hot Encoding on it

In [22]:
fraud['category'].value_counts()

In [23]:
fraud['category'].hist(figsize=(6,6),color='orange',linewidth=1.2,edgecolor='black',bins=14)
plt.xticks(rotation='vertical')
plt.show()

In [24]:
merchant_name=X_train['merchant'].unique()            #There were around 693 merchant so we need to transform this column to numerical value

In [25]:
#Percentage of each merchant's transactions that are fraud
#Fraud transactions per merchant
fraud_percent_per_merchant=fraud[['merchant','trans_num']].groupby(['merchant']).count().reset_index()
fraud_percent_per_merchant.columns=['merchant','count']

#All transactions per merchant
total_percent_per_merchant=X_train[['merchant','trans_num']].groupby(['merchant']).count().reset_index()
total_percent_per_merchant.columns=['merchant','trans_count']

#Bring the merchant's total onto the same row as its fraud count
fraud_percent_per_merchant=fraud_percent_per_merchant.merge(total_percent_per_merchant[['merchant','trans_count']],how='inner',
                                  on='merchant')

#Divide by the MERGED trans_count column. Dividing by total_percent_per_merchant['trans_count'] would align
#the two frames by row position rather than by merchant, pairing a fraud count with another merchant's total
#whenever some merchant has no fraud rows.
fraud_percent_per_merchant['percent']=fraud_percent_per_merchant['count']/fraud_percent_per_merchant['trans_count']*100
fraud_percent_per_merchant.sort_values('count',ascending=False)

In [26]:
total_percent_per_merchant                     #There are few merchants which dont show up to contribute to the total fraud data

From the table above, the fraud reported for a merchant is only about 2% of that merchant's total transactions, so we can ignore this feature for the time being.

In [27]:
#from sklearn.feature_extraction import FeatureHasher

#merchant_hash=FeatureHasher(n_features=10,input_type='string')
#X_train=pd.concat([X_train,pd.DataFrame(merchant_hash.fit_transform(X_train['merchant']).toarray())],axis=1)

In [28]:
X_train.head()

In [29]:
#Percentage of each job's transactions that are fraud
#Fraud transactions per job
fraud_percent_per_job=fraud[['job','trans_num']].groupby(['job']).count().reset_index()
fraud_percent_per_job.columns=['job','count']

#All transactions per job
total_percent_per_job=X_train[['job','trans_num']].groupby(['job']).count().reset_index()
total_percent_per_job.columns=['job','trans_count']

#Bring the job's total onto the same row as its fraud count
fraud_percent_per_job=fraud_percent_per_job.merge(total_percent_per_job[['job','trans_count']],how='inner',
                                  on='job')
#Divide by the MERGED trans_count column. Dividing by total_percent_per_job['trans_count'] would align the
#two frames by row position rather than by job, which is wrong as soon as some job has no fraud rows.
fraud_percent_per_job['percent']=fraud_percent_per_job['count']/fraud_percent_per_job['trans_count']*100
fraud_percent_per_job.sort_values('count',ascending=False)

As with merchants, there are a few jobs that never appear in the fraud data.

In [30]:
X_train['job'].value_counts()

In [31]:
fraud['job'].value_counts()

In [32]:
#Finding out the age of the person involved in the fraud to see how age is impacting the data
from datetime import datetime,date

def age(born, today=None):
    """Return a person's age in completed years.

    Parameters
    ----------
    born : str
        Date of birth formatted as "%Y-%m-%d" (for example "1988-03-09").
    today : datetime.date, optional
        Date at which the age is evaluated. Defaults to the current date,
        which is what the function always used before this parameter existed.
        Pass a fixed date to get results that do not change with the day the
        notebook is executed.

    Returns
    -------
    int
        Number of whole years between `born` and `today`.

    Raises
    ------
    ValueError
        If `born` does not match the "%Y-%m-%d" format.
    """
    born = datetime.strptime(born, "%Y-%m-%d").date()
    if today is None:
        today = date.today()
    # The tuple comparison is True (counts as 1) when this year's birthday has not
    # been reached yet, which takes one year off the plain year difference.
    return today.year - born.year - ((today.month,
                                      today.day) < (born.month,
                                                    born.day))
#fraud['age']=int(datetime.today().strftime('%Y'))-int(fraud['dob'].str[:4].to_numpy())
fraud['age']=fraud['dob'].apply(age)
fraud['age']

In [33]:
fraud['age'].hist()        #From the given chart we can see that 

In [34]:
fraud.age.value_counts()

In [35]:
X_train['age']=X_train.dob.apply(age)

In [36]:
X_train.age.value_counts()

In [37]:
fraud['name']=fraud['first']+' '+fraud['last']
fraud['name'].value_counts()

In [38]:
#X_train=X_train_bk
X_train['last'].value_counts()

In [39]:
#Building the cardholder's full name (first + last) and collecting it together with job and gender
X_train['name']=X_train['first']+' '+X_train['last']
#NOTE(review): `df` is a plain list of three Series and is not referenced again in the visible cells
df=[X_train['job'],X_train['gender'],X_train['name']]

In [40]:
print(X_train.groupby(['name','age','gender'])['job'].count())

In [41]:
print(fraud['job'].value_counts())

In [42]:
fraud['job'].value_counts().plot.hist(figsize=(6,5))
plt.xticks(rotation='vertical')
plt.show()

In [43]:
X_train.info()

In [44]:
#Working with Timestamp field
#Parse the transaction timestamp column into datetimes so the .dt accessors used below are available
X_train['trans_date_trans_time']=pd.to_datetime(X_train['trans_date_trans_time'])
#Hour of the day in which each transaction took place
X_train['hour']=X_train['trans_date_trans_time'].dt.hour

In [45]:
X_train['hour'].value_counts()

In [46]:
import numpy as np
X_train['hour'].plot.hist(rwidth=0.5,figsize=(10,8),bins=24)
plt.xticks(np.arange(0,24))
plt.show()

In [47]:
fraud['trans_date_trans_time']=pd.to_datetime(fraud['trans_date_trans_time'])
fraud['hour']=fraud['trans_date_trans_time'].dt.hour

In [48]:
fraud['hour'].plot.hist(rwidth=0.5,figsize=(10,8),bins=24)
plt.xticks(np.arange(0,24))
plt.show()

In [49]:
#Plotting the frauds by transaction hour shows that the fraud rate is high during the night and early-morning hours - so we observe a time-bound activity pattern

In [50]:
X_train['day']=X_train['trans_date_trans_time'].dt.day_name()

In [51]:
X_train['day'].value_counts()

In [52]:
fraud['day']=fraud['trans_date_trans_time'].dt.day_name()
fraud['day'].value_counts()

In [53]:
fraud['day'].value_counts().plot.bar()
plt.show()

In [54]:
#So the fraud rate varies based on day of the week

In [55]:
#Monthly rate check
X_train['month']=X_train['trans_date_trans_time'].dt.to_period('M')
X_train['month'].value_counts()

In [56]:
fraud['month']=fraud['trans_date_trans_time'].dt.to_period('M')
fraud['month'].value_counts()

In [57]:
fraud['month'].value_counts().plot.bar()
plt.show()

In [58]:
X_train.info()

In [59]:
#X_train=X_train.drop(['dob','name','merchant','merch_lat','merch_long'],axis=1)
#'dob','name','merchant','trans_date_trans_time','merch_lat','merch_long'],axis=1)

In [60]:
#Checking the distribution of column amount over the fraud instances
plt.hist(fraud[fraud['amt']<=1500]['amt'],bins=50)
plt.show()

In [61]:
#Checking the distribution of amount column over all the instances
plt.hist(X_train[X_train['amt']<=1500]['amt'],bins=50)
plt.show()

In [62]:
X_train.head()

In [63]:
#Lets look out how the data is spread out over US
import plotly.express as px
#df = px.data.gapminder()
fig = px.scatter_mapbox(fraud, lat='lat',lon='long',zoom=3, height=500,color='is_fraud')

fig.update_layout(mapbox_style="open-street-map")
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()
#X_train.plot(kind='scatter',x='lat', y='long',figsize=(6,4))
#plt.show()

In [64]:
#From the map above we see that more frauds are reported from the East and parts of the Central US

In [65]:
#Using lat and long column to identify the location of the fraud, so dropping state, city and other location related fields
#X_train=X_train.drop(['street','city','state','zip'],axis=1)

In [66]:
X_train.head()

In [67]:
#Set the index for the dataset to be trans_date_trans_time
#X_train.reset_index()
#X_train.set_index('trans_date_trans_time')
#X_train=X_train.drop('Unnamed: 0',axis=1)
#X_train

In [68]:
#Make sure the timestamp column holds datetimes (it leaves the values unchanged when they already are)
X_train['trans_date_trans_time']=pd.to_datetime(X_train['trans_date_trans_time'])
#Calendar date of the transaction; used further down as a merge key for the per-day history aggregates
X_train['trans_date']=X_train['trans_date_trans_time'].dt.date
X_train.info()

In [69]:
#One-hot encode the category, day-of-week and gender columns.
#drop_first=True leaves out the first level of each column so it acts as the baseline.
cat_onehot,day_onehot,gender_one=(
    pd.get_dummies(X_train[column],prefix=prefix,drop_first=True)
    for column,prefix in [('category','category'),('day','week'),('gender','gender')]
)

In [70]:
X_train=pd.concat([X_train, cat_onehot,gender_one,day_onehot], axis=1)

In [71]:
#Index the frame by transaction timestamp; the offset-based rolling windows ('60D', '24H') in the following cells operate on this index
X_train.index = pd.to_datetime(X_train['trans_date_trans_time'])
#Rename the index so it no longer carries the same name as the 'trans_date_trans_time' column
X_train = X_train.rename_axis(index={'trans_date_trans_time': 'time_index'})
#Chronological order
X_train = X_train.sort_index()
X_train.head()

In [72]:
import warnings
warnings.filterwarnings('ignore')
#Helper column of ones so that a rolling count gives "number of transactions"
X_train['val_for_agg'] = 1
#Per card: number of transactions in the 60-day window ending at the card's previous transaction.
#The shift is applied per card (groupby(level=0)): a plain .shift() on the combined result moves the
#last value of one card into the first row of the next card. With the per-card shift the first
#transaction of every card gets NaN, which fillna turns into 0.
X_train_hist_for_60d=(X_train.groupby(['cc_num'])['val_for_agg'].rolling('60D').count()
                      .groupby(level=0).shift().reset_index().fillna(0))

X_train_hist_for_60d.columns=['cc_num','trans_date','hist_trans_60d']

In [73]:
X_train_hist_for_60d

In [74]:
X_train_hist_for_60d['trans_date']=X_train_hist_for_60d['trans_date'].dt.date

In [75]:
X_train_hist_for_60d=X_train_hist_for_60d.groupby(['cc_num','trans_date'])['hist_trans_60d'].min().reset_index()

In [76]:
X_train_hist_for_60d.head()

In [77]:
#Checking history transaction with a rolling window of 24hrs
#Per card: number of transactions in the 24-hour window ending at the card's previous transaction.
#The shift is applied per card (groupby(level=0)) so that a value never crosses from one card to the next,
#which a plain .shift() on the combined result would do.
#'24h' (lower case) is used because the upper-case 'H' alias is deprecated in recent pandas.
X_train_hist_for_24h=(X_train.groupby(['cc_num'])['val_for_agg'].rolling('24h').count()
                      .groupby(level=0).shift().reset_index().fillna(0))
X_train_hist_for_24h.columns=['cc_num','trans_date_trans_time','hist_trans_24h']

In [78]:
X_train_hist_for_24h

In [79]:
fraud.index = pd.to_datetime(fraud['trans_date_trans_time'])
fraud = fraud.rename_axis(index={'trans_date_trans_time': 'time_index'})
fraud = fraud.sort_index()
fraud.head()

In [80]:
#Fraud reported in 24hours
#Per card: number of transactions labelled as fraud in the 24-hour window ending at the card's previous
#transaction. It is computed over ALL transactions in X_train, not only over the fraud rows: a table
#built from fraud rows alone can only match fraud rows in the later left-merge, so the feature would be
#non-zero almost exclusively where is_fraud==1 and give the label away.
#The per-card shift (groupby(level=0)) keeps the current row's own label out of its feature and stops
#values crossing from one card to the next.
#NOTE(review): assumes the fraud label of earlier transactions is known when a new one is scored - confirm.
X_train_fraud_24h=(X_train.groupby(['cc_num'])['is_fraud'].rolling('24h').sum()
                   .groupby(level=0).shift().reset_index().fillna(0))
X_train_fraud_24h.columns=['cc_num','trans_date_trans_time','hist_fraud_24h']
#Keep one row per (card, timestamp) so the left-merge on these two keys cannot multiply rows
X_train_fraud_24h=X_train_fraud_24h.drop_duplicates(subset=['cc_num','trans_date_trans_time'])
X_train_fraud_24h

In [81]:
#Based on the RFM analysis, Recency, Frequency and Monetary Value, categorized the mean of the amount of each credit card for the rolling window of 60days.
#Per card: mean amount over the 60-day window ending at the card's previous transaction.
#The shift is applied per card (groupby(level=0)) so the first transaction of a card does not inherit
#the last mean of the preceding card, which a plain .shift(1) on the combined result would cause.
X_trans_amt_for_60d=(X_train.groupby(['cc_num'])['amt'].rolling('60D').mean()
                     .groupby(level=0).shift(1).reset_index().fillna(0))

X_trans_amt_for_60d.columns=['cc_num','trans_date','hist_tran_amt_60d']
#Collapse to one value per card and calendar day (the smallest of that day) so it can be merged on (cc_num, trans_date)
X_trans_amt_for_60d['trans_date']=X_trans_amt_for_60d['trans_date'].dt.date
X_trans_amt_for_60d=X_trans_amt_for_60d.groupby(['cc_num','trans_date'])['hist_tran_amt_60d'].min().reset_index()
X_trans_amt_for_60d

In [82]:
#Measure distance between lat,long and merchant_lat, merchant_long
#To accomplish this, we use the haversine distance -It determines the great-circle distance between two points on a sphere given their longitudes and latitudes.
def haversine_dist(lat1,long1,lat2,long2):
    """Great-circle (haversine) distance in kilometres between two points.

    Parameters
    ----------
    lat1, long1 : float or array-like
        Latitude and longitude of the first point, in degrees.
    lat2, long2 : float or array-like
        Latitude and longitude of the second point, in degrees.
        Array-like arguments must all have the same length; the distance is
        then computed element-wise.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Distance(s) in km on a sphere of radius 6371 km.
    """
    lat1,long1,lat2,long2=np.radians([lat1,long1,lat2,long2])        #Converts angle to radian
    earth_radius=6371                                             #in kms

    # Haversine term: the longitude part is weighted by the cosines of the two LATITUDES
    # (weighting it by the cosines of the longitudes gives wrong, even negative, values).
    h = np.sin((lat2-lat1)/2.0)**2 + np.cos(lat1)*np.cos(lat2)*np.sin((long2-long1)/2.0)**2
    # Rounding can push h marginally outside [0, 1], where sqrt/arcsin would return NaN
    h = np.clip(h, 0.0, 1.0)
    return earth_radius * 2 * np.arcsin(np.sqrt(h))                                                           #arcsine - Inverse of a sine function

In [83]:
X_train['dist']=haversine_dist(X_train['lat'],X_train['long'],X_train['merch_lat'],X_train['merch_long'])

In [84]:
#Attach every history aggregate to the transactions with left joins:
#the per-day aggregates join on (cc_num, trans_date), the per-transaction ones on (cc_num, trans_date_trans_time)
X_train_sampled=(
    X_train
    .merge(X_train_hist_for_60d,on=['cc_num','trans_date'],how='left')
    .merge(X_train_hist_for_24h,on=['cc_num','trans_date_trans_time'],how='left')
    .merge(X_train_fraud_24h,on=['cc_num','trans_date_trans_time'],how='left')
    .merge(X_trans_amt_for_60d,on=['cc_num','trans_date'],how='left')
)

In [85]:
X_train_sampled['dist']=X_train_sampled['dist'].fillna(0)

In [86]:
day_onehot=day_onehot.reset_index()
day_onehot

In [120]:
#Features to be used for Modelling
features = ['amt','city_pop','age','hour','category_food_dining', 'category_gas_transport',
    'category_grocery_net', 'category_grocery_pos','category_health_fitness', 'category_home', 'category_kids_pets',
    'category_misc_net', 'category_misc_pos', 'category_personal_care','category_shopping_net', 'category_shopping_pos', 'category_travel',
    'week_Monday', 'week_Saturday', 'week_Sunday', 'week_Thursday','week_Tuesday', 'week_Wednesday','gender_M','dist','hist_trans_60d','hist_trans_24h',
    'hist_fraud_24h','hist_tran_amt_60d']

In [121]:
X_train_sampled[['hist_trans_60d','hist_trans_24h','hist_fraud_24h','hist_tran_amt_60d']]=X_train_sampled[['hist_trans_60d','hist_trans_24h','hist_fraud_24h','hist_tran_amt_60d']].fillna(0)

In [122]:
X_train_sampled.info()

In [123]:
non_fraud=X_train_sampled[X_train_sampled['is_fraud']==0]
non_fraud.head()

In [124]:
fraud=X_train_sampled[X_train_sampled['is_fraud']==1]

In [125]:
#As the fraud data percentage is too low, we need to resample the data - 2 approaches are there either to oversample or undersample the data
#Here the fraud class is oversampled: fraud rows are drawn with replacement until there are as many as non-fraud rows.
#random_state fixes the draw so that re-running the notebook reproduces the same sample.
#NOTE(review): this frame contains many exact copies of each fraud row; if it is split into train/test
#afterwards, copies of the same row can end up on both sides of the split.
count_of_non_fraud=len(non_fraud)
X_train_fraud_oversample=fraud.sample(count_of_non_fraud,replace=True,random_state=42)
X_train_oversampled=pd.concat([non_fraud,X_train_fraud_oversample],axis=0)

In [126]:
#Percent of frauds in total transaction data -
fraud_percent=len(fraud)/len(X_train_bk)*100
fraud_percent

In [127]:
print('Random Sampling : ',X_train_oversampled['is_fraud'].value_counts())

In [128]:
#X_train_oversampled.index()
final_df=X_train_oversampled[features]
final_df
#                'week_Monday', 'week_Saturday', 'week_Sunday', 'week_Thursday', 'week_Tuesday', 'week_Wednesday', 'gender_M'])
    
#['category_food_dining', 'category_gas_transport', 'category_grocery_net', 'category_grocery_pos', 'category_health_fitness', 'category_home', 'category_kids_pets', 'category_misc_net', 'category_misc_pos', 'category_personal_care', 'category_shopping_net', 'category_shopping_pos', 'category_travel', 
#                'week_Monday', 'week_Saturday', 'week_Sunday', 'week_Thursday', 'week_Tuesday', 'week_Wednesday', 'gender_M']]=
#np.int64(X_train_oversampled[['category_food_dining', 'category_gas_transport', 'category_grocery_net', 'category_grocery_pos', 'category_health_fitness', 'category_home', 'category_kids_pets', 'category_misc_net', 'category_misc_pos', 'category_personal_care', 'category_shopping_net', 'category_shopping_pos', 'category_travel', 'week_Monday', 'week_Saturday', 'week_Sunday', 'week_Thursday', 'week_Tuesday', 'week_Wednesday', 'gender_M']])
#X_train_oversampled[features].head()

Hence we see that both classes now have an equal number of records.

In [129]:
#Splitting the dataset into train and test
from sklearn.model_selection import train_test_split 

#Split the original (not yet oversampled) rows first, stratified on the label. Splitting an already
#oversampled frame puts copies of the same fraud row into both partitions, so the test scores would
#be measured on rows the model has already seen.
X_train_raw,X_test_os,y_train_raw,y_test_os = train_test_split(X_train_sampled[features],X_train_sampled['is_fraud'],train_size=0.7,
                                                            test_size=0.3,random_state=42,stratify=X_train_sampled['is_fraud'])

#Oversample the fraud class inside the training partition only, up to the number of non-fraud rows.
#The test partition keeps its real class balance.
train_part=X_train_raw.assign(is_fraud=y_train_raw)
train_fraud=train_part[train_part['is_fraud']==1]
train_non_fraud=train_part[train_part['is_fraud']==0]
train_balanced=pd.concat([train_non_fraud,train_fraud.sample(len(train_non_fraud),replace=True,random_state=42)],axis=0)
X_train_os=train_balanced[features]
y_train_os=train_balanced['is_fraud']

In [130]:
#Exploring the train dataset -
X_train_os.shape

In [131]:
X_test_os.shape

In [132]:
#Applying Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

#The feature list mixes raw magnitudes (amt, city_pop, dist) with 0/1 dummy columns. On unscaled input the
#solver may stop at the default iteration limit before converging, and the ConvergenceWarning is hidden by
#the warnings filter set earlier in the notebook. Standardising inside a pipeline (scaler fitted on the
#training data only) and raising max_iter addresses both; log_reg still exposes fit/predict.
log_reg=make_pipeline(StandardScaler(),LogisticRegression(max_iter=1000,random_state=42))
log_reg.fit(X_train_os,y_train_os)

In [133]:
y_train_pred=log_reg.predict(X_train_os)
y_test_pred=log_reg.predict(X_test_os)

In [135]:
#Evaluate model performance
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report
print(confusion_matrix(y_train_os,y_train_pred))
print(classification_report(y_train_os,y_train_pred))

In [143]:
print(confusion_matrix(y_test_os,y_test_pred))
print(classification_report(y_test_os,y_test_pred))

In [142]:
#Applying Decision Tree Model
from sklearn.tree import DecisionTreeClassifier

tree_clf=DecisionTreeClassifier(max_depth=10,random_state=42)
tree_clf.fit(X_train_os,y_train_os)

In [145]:
y_train_pred_tree=tree_clf.predict(X_train_os)
y_test_pred_tree=tree_clf.predict(X_test_os)

In [147]:
#Evaluation of tree model
print(confusion_matrix(y_train_os,y_train_pred_tree))
print(classification_report(y_train_os,y_train_pred_tree))

In [148]:
print(confusion_matrix(y_test_os,y_test_pred_tree))
print(classification_report(y_test_os,y_test_pred_tree))

In [152]:
#Applying RandomForest
from sklearn.ensemble import RandomForestClassifier

forest_clf=RandomForestClassifier(n_estimators=30,max_depth=10,random_state=42)
forest_clf.fit(X_train_os,y_train_os)

In [154]:
y_train_pred_forest=forest_clf.predict(X_train_os)
y_test_pred_forest=forest_clf.predict(X_test_os)

In [155]:
print(confusion_matrix(y_train_os,y_train_pred_forest))
print(classification_report(y_train_os,y_train_pred_forest))

In [156]:
print(confusion_matrix(y_test_os,y_test_pred_forest))
print(classification_report(y_test_os,y_test_pred_forest))