In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
warnings.filterwarnings('ignore')
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#importing libraries required 
# Display settings: show up to 100 columns / 900 rows and render floats in
# fixed-point notation instead of scientific notation.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 900)
pd.set_option('display.float_format', '{:f}'.format)

import matplotlib 
import matplotlib.pyplot as plt

%matplotlib inline


In [None]:
# Read the train and test CSVs and stack them into a single data frame.
# The first column of each file is dropped (presumably an exported row
# index -- confirm against the raw files).
df1_1 = pd.read_csv("../input/fraud-detection/fraudTrain.csv")
df1_1 = df1_1.drop(df1_1.columns[0], axis=1)

df1_2 = pd.read_csv("../input/fraud-detection/fraudTest.csv")
df1_2 = df1_2.drop(df1_2.columns[0], axis=1)

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# both files keep their own 0-based row labels, so the labels overlap and
# label-based lookups or alignments on `df` become ambiguous.
df = pd.concat([df1_1, df1_2], ignore_index=True)
df.head(6)

# the combined frame has 22 columns (see the table below)

In [None]:
# Inspect the data: column names, dtypes and non-null counts
df.info()

In [None]:
# Count the missing values in each column, to decide whether any rows need to be skipped
df.isnull().sum()

In [None]:
# Check the shape (rows, columns) of the data frame
df.shape

We can see that the data set `df` has n = 22 features and m = 1,852,394 rows. Because n is small and m is very large, logistic regression or an SVM without a kernel is a suitable choice.

Distribution of the dependent variable


In [None]:
# Row count and percentage share of each class of the target `is_fraud`.
class_counts = df.groupby('is_fraud')['trans_date_trans_time'].count()
df_fraud = class_counts.reset_index()
df_fraud.columns = ['is_fraud', 'count']
df_fraud['percentage'] = df_fraud['count'] / df_fraud['count'].sum() * 100
df_fraud

The classes are heavily imbalanced, so the dependent variable needs to be balanced before modelling.




In [None]:
# Number of distinct values in each column
df.nunique()

**Converting the data type of `trans_date_trans_time` to datetime**

In [None]:
# Parse the transaction timestamp column into datetime values so the .dt accessors used below work
df["trans_date_trans_time"] = pd.to_datetime(df['trans_date_trans_time'])

In [None]:
# Confirm the dtype of the timestamp column after the conversion
df.dtypes['trans_date_trans_time']

**Derive 'Transaction Hour' Feature from 'Transaction Time' Feature**

In [None]:
# Hour of day of each transaction, taken from the timestamp
df['trans_hour'] = df['trans_date_trans_time'].dt.hour
df['trans_hour']

**Derive 'Day of Week' Feature from 'Transaction Time' Feature**

In [None]:
# Weekday name (e.g. 'Monday') of each transaction, taken from the timestamp
df['day_of_week'] = df['trans_date_trans_time'].dt.day_name()
df['day_of_week']

**Looking at the first 5 rows of the data frame (the default for `head()`)**

In [None]:
# Preview the frame with the newly derived trans_hour and day_of_week columns
df.head()

deriving year month feature from 'transaction time' feature

In [None]:
# Monthly period (year-month) of each transaction, taken from the timestamp
df['year_month'] = df['trans_date_trans_time'].dt.to_period('M')
df['year_month']

In [None]:
# Preview the frame again, now including the year_month column
df.head()


**Plotting bar graphs of the number of transactions per category and per day of the week, laid out as 1 row and 2 columns (`subplot(1, 2, x)`)**

In [None]:
# Side-by-side bar charts: transaction counts per category (left) and per weekday (right).
fig, (ax_category, ax_weekday) = plt.subplots(1, 2, figsize=(20, 8))
df['category'].value_counts().plot.bar(ax=ax_category)
df['day_of_week'].value_counts().plot.bar(ax=ax_weekday);

**Sunday and Monday of the week have highest credit card transactions**

**Plotting the distribution of the number of transactions by gender and over each month**

In [None]:
# Side-by-side bar charts: transaction counts per gender (left) and per year-month (right).
fig, (ax_gender, ax_month) = plt.subplots(1, 2, figsize=(20, 8))
df['gender'].value_counts().plot.bar(ax=ax_gender)
df['year_month'].value_counts().plot.bar(ax=ax_month);

**The highest number of transactions occurs in December.**

**The lowest number of transactions occurs in February.**

Share of transactions by state (normalized counts)

In [None]:
# Fraction of all rows (transactions) that fall in each state
df.state.value_counts(normalize=True)

**Share of transactions by profession (normalized counts)**

In [None]:
# Fraction of all rows (transactions) per job, largest share first
df.job.value_counts(normalize = True , ascending= False)

**finding the age of the customer  and adding it to the dataframe**

In [None]:
# Customer age in whole years at the time of each transaction.
df['dob'] = pd.to_datetime(df['dob'])
# Divide by an explicit year length. np.timedelta64(1, 'Y') is rejected by
# pandas 2.x because 'Y' is not a fixed duration. 365.2425 days is the mean
# Gregorian year, the value NumPy uses for 'Y', so results are unchanged on
# older versions.
df['age'] = np.round((df['trans_date_trans_time'] - df['dob']) / pd.Timedelta(days=365.2425))
df['age']

Looking at the distribution of age

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the age column
df.age.describe()

The middle 50% of customers (the interquartile range) are 33-57 years old; the youngest customer is 14 years old and the oldest is 96 years old.

Distribution of the transaction amount overall, for non-fraud transactions, and for fraud transactions

In [None]:
# Summary statistics of `amt` (with the 50th / 95th / 99.9th percentiles) for
# all rows, non-fraud rows and fraud rows, placed side by side in one table.
amt_percentiles = [0.5, 0.95, 0.999]

overall_amt = (
    df['amt']
    .describe(percentiles=amt_percentiles)
    .reset_index()
    .rename(columns={'index': 'Row Type', 'amt': 'Overall Amt Distribution'})
)
non_fraud_amt = (
    df.loc[df['is_fraud'] == 0, ['amt']]
    .describe(percentiles=amt_percentiles)
    .reset_index(drop=True)
    .rename(columns={'amt': 'Non-Fraud Amt Distribution'})
)
fraud_amt = (
    df.loc[df['is_fraud'] == 1, ['amt']]
    .describe(percentiles=amt_percentiles)
    .reset_index(drop=True)
    .rename(columns={'amt': 'Fraud Amt Distribution'})
)

pd.concat([overall_amt, non_fraud_amt, fraud_amt], axis=1)

The fraud transaction mean (530.66) is very high compared to the non-fraud transaction mean (67.65).

In [None]:
# Histograms of transaction amounts up to 1500: all rows, non-fraud rows, fraud rows.
amt_capped = df[df.amt <= 1500]
panels = [
    ('Overall Amt Distribution', amt_capped.amt),
    ('Non Fraud Amt Distribution', amt_capped[amt_capped.is_fraud == 0].amt),
    ('Fraud Amt Distribution', amt_capped[amt_capped.is_fraud == 1].amt),
]

fig, ax = plt.subplots(1, 3, figsize=(20, 5))
for axis, (title, amounts) in zip(ax, panels):
    axis.hist(amounts, bins=50)
    axis.set_title(title)
    axis.set_xlabel('Transaction Amount')

# Only the left-most panel carries the y-axis label.
ax[0].set_ylabel('#.of Transactions')
plt.show()

Distribution of the transaction amount

In [None]:
# Box plot of each numeric column listed in num_cols, one panel per column.
num_cols = ['amt']
plt.figure(figsize=[10, 10])
for ind, col in enumerate(num_cols):
    # Size the grid from the column list: a fixed two-column grid leaves an
    # empty panel with one column and fails once a third column is added.
    plt.subplot(1, len(num_cols), ind + 1)
    df[col].plot.box()
    plt.title(col)
plt.show()

year month vs number of transactions 

In [None]:
# Distinct transaction ids and distinct card numbers observed in each year-month.
monthly_activity = df.groupby('year_month')[['trans_num', 'cc_num']].nunique()
df_timeline01 = monthly_activity.reset_index().rename(
    columns={'trans_num': 'num_of_transactions', 'cc_num': 'customers'}
)
df_timeline01

In [None]:
# Line chart of the monthly transaction count; one tick per month, labelled with the period.
x = np.arange(len(df_timeline01))

fig, ax = plt.subplots(figsize=(20, 5))
ax.plot(x, df_timeline01['num_of_transactions'])
ax.set_xticks(x)
ax.set_xticklabels(df_timeline01['year_month'])
ax.set(xlabel='Year Month', ylabel='Num of Transactions')
plt.show()


Year-month vs. number of fraud transactions and number of customers affected by fraud

In [None]:
# Keep only the fraud rows, then count distinct transactions and distinct cards per year-month.
df_fraud_transactions = df.loc[df['is_fraud'] == 1]

fraud_monthly = df_fraud_transactions.groupby('year_month')[['trans_num', 'cc_num']].nunique()
df_timeline02 = fraud_monthly.reset_index()
df_timeline02.columns = ['year_month', 'num_of_fraud_transactions', 'fraud_customers']
df_timeline02

In [None]:
# Average number of fraud transactions per month
print(df_timeline02['num_of_fraud_transactions'].mean())

Distribution of transactions by gender

In [None]:
# Transaction count and percentage share per gender.
df_gender = (
    df.groupby('gender')['trans_num']
    .count()
    .reset_index()
    .rename(columns={'gender': 'Gender', 'trans_num': 'gender_count'})
)
df_gender['percent'] = df_gender['gender_count'] / df_gender['gender_count'].sum() * 100

df_gender

**Female customers account for more transactions than male customers**

In [None]:
# Bar chart of the transaction count per gender.
plt.bar(
    x=df_gender['Gender'],
    height=df_gender['gender_count'],
    width=0.4,
    color=(0.2, 0.4, 1, 1),
)
plt.show()

grouping the fraud or not by gender

In [None]:
# Transaction count per (gender, is_fraud) pair, plus each pair's percentage
# of that gender's total transactions.
df_fraud_gender = df.groupby(['gender', 'is_fraud'])['trans_num'].count().reset_index()
df_fraud_gender.columns = ['Gender', 'is_fraud', 'count']

# Bring in the per-gender totals so the within-gender share can be computed.
df_fraud_gender = pd.merge(
    df_fraud_gender,
    df_gender[['Gender', 'gender_count']],
    how='inner',
    on='Gender',
)
df_fraud_gender['percent_grp'] = df_fraud_gender['count'] / df_fraud_gender['gender_count'] * 100

df_fraud_gender

In [None]:
# Transaction count and percentage share per category.
df_category = df[['category','trans_num']].groupby(['category']).count().reset_index()
df_category.columns = ['Category','category_count']

df_category['percent'] = (df_category['category_count']/df_category['category_count'].sum())*100

# sort_values returns a new frame rather than sorting in place, so the sorted
# result has to be the cell's last expression to be displayed. df_category
# itself is left untouched for the merge in a later cell.
df_category.sort_values(by=['percent'], ascending=False).reset_index(drop=True)

In [None]:
# Transaction count per (category, is_fraud) pair, plus each pair's percentage
# of that category's total transactions.
df_fraud_category = df.groupby(['category', 'is_fraud'])['trans_num'].count().reset_index()
df_fraud_category.columns = ['Category', 'is_fraud', 'count']

# Bring in the per-category totals and overall share.
df_fraud_category = pd.merge(
    df_fraud_category,
    df_category[['Category', 'category_count', 'percent']],
    how='inner',
    on='Category',
)
df_fraud_category['percent_grp'] = df_fraud_category['count'] / df_fraud_category['category_count'] * 100

df_fraud_category.sort_values(by='category_count', ascending=False)

The grocery and shopping categories have a higher number of fraud transactions than the other categories.

In [None]:
# Fraud rows only, ordered by the within-category fraud percentage (ascending).
# Note: this rebinds `df_fraud`, which earlier held the class-distribution table.
is_fraud_row = df_fraud_category['is_fraud'] == 1
df_fraud = df_fraud_category.loc[is_fraud_row].sort_values(by='percent_grp')
df_fraud

normalizing the different merchant counts 

In [None]:
# Fraction of all rows (transactions) per merchant, largest share first
df.merchant.value_counts(normalize=True, ascending=False)

Transaction count and percentage share for each merchant

In [None]:
# Transaction count and percentage share per merchant, largest share first.
df_merchant = (
    df.groupby('merchant')['trans_num']
    .count()
    .reset_index()
    .rename(columns={'merchant': 'Merchant', 'trans_num': 'merchant_count'})
)
df_merchant['percent'] = df_merchant['merchant_count'] / df_merchant['merchant_count'].sum() * 100

df_merchant.sort_values(by='percent', ascending=False)

Percentage of fraud transactions for each merchant

In [None]:
# Transaction count per (merchant, is_fraud) pair, plus each pair's percentage
# of that merchant's total transactions.
df_fraud_merchant = df.groupby(['merchant', 'is_fraud'])['trans_num'].count().reset_index()
df_fraud_merchant.columns = ['Merchant', 'is_fraud', 'count']

# Bring in the per-merchant totals and overall share.
df_fraud_merchant = pd.merge(
    df_fraud_merchant,
    df_merchant[['Merchant', 'merchant_count', 'percent']],
    how='inner',
    on='Merchant',
)
df_fraud_merchant['percent_grp'] = df_fraud_merchant['count'] / df_fraud_merchant['merchant_count'] * 100

In [None]:
# Fraud rows only, merchants with the highest within-merchant fraud percentage first
df_fraud_merchant[df_fraud_merchant['is_fraud'] == 1].sort_values(by = ['percent_grp'],ascending=False)

**one hot encoding**

In [None]:
# One-hot encode category, gender and weekday. drop_first=True drops the first
# level of each variable so the remaining indicator columns are not collinear.
category_onehot, gender_onehot, day_of_week_onehot = (
    pd.get_dummies(df[column], prefix=prefix, drop_first=True)
    for column, prefix in [
        ('category', 'category'),
        ('gender', 'gender'),
        ('day_of_week', 'week'),
    ]
)

In [None]:
# Append the one-hot indicator columns to the original columns (column-wise concat)
df1 = pd.concat([df, category_onehot,gender_onehot,day_of_week_onehot], axis=1)

df1.head()

In [None]:
# Data type of every column in the widened frame
df1.dtypes

In [None]:
# Column names of the widened frame (used to pick the model features later)
df1.columns

In [None]:
# Index the frame by transaction timestamp (index named 'time_index') and put
# it in chronological order; the time-based rolling windows computed further
# down operate on this datetime index.
df1 = (
    df1
    .set_index(pd.to_datetime(df1['trans_date_trans_time']))
    .rename_axis(index={'trans_date_trans_time': 'time_index'})
    .sort_index()
)
df1.head()

In [None]:
# Constant helper column; the rolling .count() aggregations below are computed over it
df1['val_for_agg'] = 1

60 days transactions by customer

In [None]:
# Number of transactions each card made in the 60 days BEFORE each transaction.
#
# closed='left' makes every window [t - 60D, t), so a transaction is excluded
# from its own history and each window stays inside its cc_num group. A plain
# .shift() on the grouped result would slide values along the whole stacked
# series, so the first row of each card would receive the last count of the
# previous card.
# Relies on df1 being sorted by its datetime index (done in an earlier cell).
df_hist_trans_60d = \
    df1 \
    .groupby(['cc_num'])['val_for_agg']\
    .rolling('60D', closed='left')\
    .count()\
    .reset_index()\
    .fillna(0)  # an empty window (no prior transactions) counts as 0

df_hist_trans_60d.columns = ['cc_num','trans_date','hist_trans_60d']

In [None]:
# Truncate the timestamp to a calendar date so the history can be keyed by (card, day)
df_hist_trans_60d['trans_date'] = df_hist_trans_60d['trans_date'].dt.date

In [None]:
# Collapse to one row per (card, day), keeping the smallest history count seen on that day
df_hist_trans_60d = df_hist_trans_60d.groupby(['cc_num','trans_date'])['hist_trans_60d'].min().reset_index()
df_hist_trans_60d.head()

24 hrs transactions by customer

In [None]:
# Number of transactions each card made in the 24 hours BEFORE each transaction.
#
# closed='left' makes every window [t - 24H, t), so a transaction is excluded
# from its own history and each window stays inside its cc_num group. A plain
# .shift() on the grouped result would slide values along the whole stacked
# series and leak counts from one card into the next.
# Relies on df1 being sorted by its datetime index (done in an earlier cell).
df_hist_orders_24h = \
    df1 \
    .groupby(['cc_num'])['val_for_agg']\
    .rolling('24H', closed='left')\
    .count()\
    .reset_index()\
    .fillna(0)  # an empty window (no prior transactions) counts as 0

df_hist_orders_24h.columns = ['cc_num','trans_date_trans_time','hist_trans_24h']
df_hist_orders_24h.head()

Merge the historic variables with the transactions on `cc_num` and `trans_date`

In [None]:
# Attach the 60-day history count to every transaction via (card, calendar day).
# A left join keeps every transaction row of df1.
df1['trans_date'] = df1['trans_date_trans_time'].dt.date
df2 = pd.merge(
    df1,
    df_hist_trans_60d,
    how='left',
    on=['cc_num', 'trans_date'],
)


In [None]:
# Preview the merged frame, including the hist_trans_60d column
df2.head()

In [None]:
# Transaction count and percentage share per job, largest share first.
df_job = (
    df.groupby('job')['trans_num']
    .count()
    .reset_index()
    .rename(columns={'job': 'Job', 'trans_num': 'tran_count_by_job'})
)
df_job['percent'] = df_job['tran_count_by_job'] / df_job['tran_count_by_job'].sum() * 100

df_job.sort_values(by='percent', ascending=False)

In [None]:
# Transaction count per (job, is_fraud) pair, plus each pair's percentage of
# that job's total transactions.
df_fraud_job = df.groupby(['job', 'is_fraud'])['trans_num'].count().reset_index()
df_fraud_job.columns = ['Job', 'is_fraud', 'count']

# Bring in the per-job totals and overall share.
df_fraud_job = pd.merge(
    df_fraud_job,
    df_job[['Job', 'tran_count_by_job', 'percent']],
    how='inner',
    on='Job',
)
df_fraud_job['percent_grp'] = df_fraud_job['count'] / df_fraud_job['tran_count_by_job'] * 100

In [None]:
# The ten (job, is_fraud) rows belonging to the jobs with the most transactions.
# .copy() makes this an independent frame: the next cell adds a column to it,
# and assigning to a head() slice triggers pandas' SettingWithCopyWarning.
job_plt_data = df_fraud_job.sort_values(by = ["tran_count_by_job"], ascending = False).head(10).copy()
job_plt_data

In [None]:
# Human-readable class label: 'Fraud' where is_fraud == 1, otherwise 'Not Fraud'.
job_plt_data['label'] = np.where(job_plt_data['is_fraud'] == 1, 'Fraud', 'Not Fraud')
job_plt_data

In [None]:
#Importing Library
from sklearn.model_selection import train_test_split

oversampling

In [None]:
# The data set is highly imbalanced, so balance the classes by oversampling:
# draw fraud rows with replacement until there are as many as non-fraud rows.
cnt_non_fraud = df2[df2['is_fraud'] == 0]['amt'].count()
df2_class_fraud = df2[df2['is_fraud'] == 1]
df2_class_nonfraud = df2[df2['is_fraud'] == 0]
# Fixed seed: without random_state every run draws different rows, so the
# downstream metrics cannot be reproduced.
df2_class_fraud_oversample = df2_class_fraud.sample(cnt_non_fraud, replace=True, random_state=42)
df2_oversampled = pd.concat([df2_class_nonfraud, df2_class_fraud_oversample], axis=0)
df2_oversampled.head()

In [None]:
# Model inputs: numeric features, category indicators, gender indicator,
# weekday indicators and the 60-day history count -- in this exact order.
X_cols = (
    ['amt', 'city_pop', 'trans_hour', 'age']
    + ['category_' + name for name in (
        'food_dining', 'gas_transport', 'grocery_net', 'grocery_pos',
        'health_fitness', 'home', 'kids_pets', 'misc_net', 'misc_pos',
        'personal_care', 'shopping_net', 'shopping_pos', 'travel',
    )]
    + ['gender_M']
    + ['week_' + day for day in (
        'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Saturday', 'Sunday',
    )]
    + ['hist_trans_60d']
)
# Target column.
Y_cols = ['is_fraud']

splitting the data into training and test set

In [None]:
# Split FIRST, oversample AFTERWARDS, and only on the training part.
# Splitting an already-oversampled frame puts copies of the same fraud rows
# into both train and test, so the test scores are inflated by leakage.
# stratify keeps the fraud rate the same in both parts.
df2_train, df2_test = train_test_split(
    df2,
    train_size=0.7,
    test_size=0.3,
    random_state=42,
    stratify=df2['is_fraud'],
)

# Balance the training part: draw fraud rows with replacement (fixed seed)
# until there are as many as non-fraud rows.
train_nonfraud = df2_train[df2_train['is_fraud'] == 0]
train_fraud = df2_train[df2_train['is_fraud'] == 1]
train_fraud_oversampled = train_fraud.sample(len(train_nonfraud), replace=True, random_state=42)
df2_train_balanced = pd.concat([train_nonfraud, train_fraud_oversampled], axis=0)

X_train, y_train = df2_train_balanced[X_cols], df2_train_balanced[Y_cols]
# The test part keeps the original class imbalance, so its metrics reflect
# real-world conditions.
# NOTE(review): result figures written in the notebook's prose need to be
# re-recorded after this change.
X_test, y_test = df2_test[X_cols], df2_test[Y_cols]
X_train.shape

In [None]:
# Shape (rows, feature columns) of the test inputs
X_test.shape

**Logistic regression**

In [None]:
#Importing Library
from sklearn.linear_model import LogisticRegression

In [None]:
# Build the logistic regression model (fixed random_state for repeatable fits)
# NOTE(review): no feature scaling is applied to X_cols and max_iter is left at
# its default; any convergence warning would be hidden by the global
# warnings.filterwarnings('ignore') at the top of the notebook -- confirm the
# solver actually converges.
logreg = LogisticRegression(random_state=42)


In [None]:
# Fit on the training data. y_train is a one-column DataFrame; scikit-learn
# expects a 1-D target, so flatten it instead of relying on the implicit
# conversion (which emits a DataConversionWarning).
logreg.fit(X_train, y_train.values.ravel())


In [None]:
# Predicted class labels for the training and the test inputs
y_train_pred = logreg.predict(X_train)
y_test_pred = logreg.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
# Training-set evaluation: confusion matrix, then per-class precision / recall / F1
print(confusion_matrix(y_train, y_train_pred))
print(classification_report(y_train, y_train_pred))

In [None]:
# Test-set evaluation: confusion matrix, then per-class precision / recall / F1
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

Logistic Regression model results:

# Training data:

Accuracy - 84%

recall - 76%

# Testing data:

Accuracy - 84%

recall - 76%