In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the Kaggle credit-card fraud CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer='../input/creditcardfraud/creditcard.csv')
# Bare trailing expression so the notebook displays (n_rows, n_columns).
df.shape

In [None]:
# Peek at the first five rows to sanity-check the load.
df.head(n=5)

In [None]:
# Summarise the frame's columns (dtypes and non-null counts) to spot missing data.
df.info()

In [None]:
# Feature matrix: every column except the label.
X = df.drop(columns='Class')
# Target vector: the 'Class' label column (later cells treat 1 as fraud, 0 as legitimate).
Y = df['Class']

In [None]:
# Class distribution of the original (imbalanced) labels.
# The Series must be passed as `x=`: seaborn deprecated positional vectors in 0.11,
# and from 0.12 the first positional argument is `data`, which does not yield
# the per-class count plot intended here.
sns.countplot(x=Y)

In [None]:
# Shapes of the feature matrix and the label vector before resampling.
print(f"{X.shape} {Y.shape}")

In [None]:

from imblearn.combine import SMOTETomek

from sklearn.utils import shuffle

import matplotlib.pyplot as plt
%matplotlib inline

<h3>SMOTETomek -- a method that combines over-sampling and under-sampling techniques to convert imbalanced data into balanced data</h3>

In a study on personality recognition from text, the SMOTETomek method was reported to be efficient on a small dataset, improving recognition accuracy by up to around 10%, with results better than those of previous similar studies. The reported average accuracies were 75.34% for the plain-text dataset and 78.78% for the non-plain-text dataset, and 75.34% for the short-text dataset and 64.25% for the long-text dataset. (These figures describe that study, not this credit-card dataset.)


In [None]:
# Combined over-/under-sampler; 'minority' resamples only the minority class.
# The fixed seed keeps the synthetic samples reproducible across runs.
smote = SMOTETomek(
    sampling_strategy='minority',
    random_state=101,
)

**Using the sampler's resampling method (`fit_resample`, named `fit_sample` in older imbalanced-learn releases) to convert the imbalanced data into balanced data**

In [None]:
# Resample the full dataset so both classes are balanced.
# `fit_sample` was a deprecated alias that has been removed from
# imbalanced-learn; `fit_resample` is the supported method with the same
# (X, y) -> (X_resampled, y_resampled) contract.
X_new, Y_new = smote.fit_resample(X, Y)

<h3>Printing the shape of the balanced data, X_new and Y_new</h3>

In [None]:
# Shapes of the resampled feature matrix and label vector.
print(f"{X_new.shape} {Y_new.shape}")

Shuffling the data by using the shuffle method which is present in sklearn.utils

In [None]:
# Shuffle features and labels together (same permutation for both) with a fixed seed.
# NOTE(review): this rebinds X_new/Y_new, so re-running the cell shuffles the
# already-shuffled data again.
X_new, Y_new = shuffle(
    X_new,
    Y_new,
    random_state=101,
)

<h2>Separating the fraud and correct (non-fraud) card samples and storing them in two different variables.</h2>

In [None]:
# Boolean-mask the resampled features by label: 0 -> legitimate, 1 -> fraud.
correct_card = X_new.loc[Y_new == 0]
fraud_card = X_new.loc[Y_new == 1]

Printing the shape of fraud and correct card data

In [None]:
# After resampling, the two subsets are expected to have (nearly) equal row counts.
print(f"{fraud_card.shape} {correct_card.shape}")

<h3>Doing the Visualisation by using count plot to know count of fraud and non fraud samples present in imbalanced data</h3>

In [None]:
# Class counts in the original, imbalanced labels.
# Keyword `x=` is required: positional vectors were deprecated in seaborn 0.11,
# and from 0.12 the first positional argument is interpreted as `data`.
sns.countplot(x=Y)

<h3>Doing the Visualisation by using count plot to know count of fraud and non fraud samples present in the balanced (resampled) data</h3>



In [None]:
# Class counts in the resampled, balanced labels.
# Keyword `x=` is required: positional vectors were deprecated in seaborn 0.11,
# and from 0.12 the first positional argument is interpreted as `data`.
sns.countplot(x=Y_new)

Visualizing the correlation matrix of imbalanced data

In [None]:
# Pairwise feature correlations of the original (imbalanced) data.
fig, ax = plt.subplots(figsize=(10, 10))
# Draw on the axes created above explicitly instead of relying on pyplot's current axes.
sns.heatmap(X.corr(), cmap='plasma', ax=ax)

Visualizing the correlation matrix of balanced data

In [None]:
# Pairwise feature correlations of the resampled (balanced) data.
fig, ax = plt.subplots(figsize=(10, 10))
# Draw on the axes created above explicitly instead of relying on pyplot's current axes.
sns.heatmap(X_new.corr(), cmap='magma', ax=ax)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix

Splitting the balanced data into training and testing set by using train_test_split method from sklearn.model_selection

In [None]:
# Hold out 30% of the balanced data for evaluation; fixed seed for a repeatable split.
# NOTE(review): the data was resampled with SMOTETomek *before* this split, so the
# test set contains synthetic samples and the training set contains samples
# interpolated from points that end up in the test set. Scores measured on this
# split are therefore optimistic; consider splitting the raw X/Y first and
# resampling only the training portion.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_new,
    Y_new,
    test_size=0.3,
    random_state=101,
)

Printing the shapes of our new training and testing data

In [None]:
# Row/column counts of the training and testing partitions.
print(f"{X_train.shape} {Y_train.shape}")
print(f"{X_test.shape} {Y_test.shape}")

<strong><h1>Modelling</h1></strong>

<h2>Now Let's Start Building the Credit Card Fraud Detection Model Using Logistic Regression</h2>
    
<p>Importing LogisticRegression from sklearn.linear_model and then creating an instance of it named model</p>

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,accuracy_score
# Baseline classifier with scikit-learn's default hyperparameters.
# NOTE(review): no feature scaling is applied anywhere in this notebook, so the
# default solver may emit a ConvergenceWarning — consider scaling the features
# or raising max_iter.
model = LogisticRegression()

Now fit the model on the training data using the fit method, which takes X (features) and Y (labels) as input

In [None]:
# Train the baseline logistic-regression model on the training partition.
model.fit(X=X_train, y=Y_train)

Now Make Prediction by passing a testing data to the predict method

In [None]:
# Predicted class labels for the held-out test partition.
pred = model.predict(X=X_test)

Now Calculate and Print the Accuracy Score By passing actual label with the predict label to accuracy_score function that you can import from sklearn.metrics.

In [None]:
# Fraction of test samples whose predicted label matches the true label.
print(accuracy_score(y_true=Y_test, y_pred=pred))

<h2>Now we are going to print the classification report and plot the confusion matrix to check our model's performance</h2>

In [None]:
from sklearn.metrics import confusion_matrix,classification_report

In [None]:
# Per-class precision / recall / F1 for the baseline model.
print(classification_report(Y_test, pred))

# Confusion matrix as an annotated heatmap. Without `annot` the cells carry no
# numbers and without axis labels the orientation is ambiguous; scikit-learn
# puts the true labels on rows and the predicted labels on columns.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(confusion_matrix(Y_test, pred), annot=True, fmt='d', ax=ax)
ax.set(xlabel='Predicted class', ylabel='Actual class', title='Logistic regression confusion matrix')

<h1>As We Will See We Have Got A Much Better Result But We Are Trying To Improve More By Testing Out Various Classification models on it to improve our accuracy</h1>

In [None]:
from sklearn.ensemble import AdaBoostClassifier,BaggingClassifier,ExtraTreesClassifier,GradientBoostingClassifier,RandomForestClassifier
from sklearn.linear_model import RidgeClassifier,SGDClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from sklearn.metrics import mean_squared_error

<h3>Now we are going to write a function whose parameters are the training and testing data</h3>
<h3>First we form a list of tuples in which each tuple consists of two elements: the first is the classifier object and the second is the model name</h3>
<h3>After that we build a list of tuples in which the first element represents the accuracy of the model, the second element represents the classifier (model) name, and the third element represents the mean_squared_error of that model</h3>
<h3>Then we iterate through the models list one by one, store the results in accuracy_list, and at the end return accuracy_list</h3>

In [None]:
def check_various_classifiers(X_train, Y_train, X_test, Y_test, random_state=101):
    """Fit a fixed set of classifiers and score each one on the test data.

    Every classifier is trained on ``(X_train, Y_train)`` with otherwise default
    hyperparameters and evaluated on ``(X_test, Y_test)``. One line per model,
    ``"<accuracy> <mean_squared_error>"``, is printed as progress output.

    Parameters
    ----------
    X_train, Y_train : training features and labels.
    X_test, Y_test : held-out features and labels used for scoring.
    random_state : seed handed to every estimator so that repeated runs give
        the same scores (defaults to 101, the seed used elsewhere in this
        notebook). Pass ``None`` for unseeded behaviour.

    Returns
    -------
    list of tuple
        ``(accuracy, model_name, mean_squared_error)`` per classifier, in the
        order the classifiers are listed below.
    """
    # (estimator, display name) pairs. The stochastic estimators are seeded;
    # otherwise the reported scores change from run to run.
    models = [
        (ExtraTreesClassifier(random_state=random_state), 'ExtraTreesClassifier'),
        (GradientBoostingClassifier(random_state=random_state), 'GradientBoostingClassifier'),
        (RandomForestClassifier(random_state=random_state), 'RandomForestClassifier'),
        (RidgeClassifier(random_state=random_state), 'RidgeClassifier'),
        (SGDClassifier(random_state=random_state), 'SGDClassifier'),
        (LogisticRegression(random_state=random_state), 'LogisticRegression'),
        (AdaBoostClassifier(random_state=random_state), 'Ada Boost Classifier'),
        (xgb.XGBClassifier(random_state=random_state), 'XGBClassifier'),
    ]

    accuracy_list = []
    for classifier, model_name in models:
        classifier.fit(X_train, Y_train)
        predictions = classifier.predict(X_test)
        acc = accuracy_score(Y_test, predictions)
        # For 0/1 labels this equals the misclassification rate (1 - accuracy).
        err = mean_squared_error(Y_test, predictions)
        accuracy_list.append((acc, model_name, err))
        print(str(acc) + ' ' + str(err))
    return accuracy_list
    

<h2>Now we are going to use our check_various_classifiers function by giving parameters as (X_train,Y_train) -> Training Data , (X_test,Y_test) -> Testing Data</h2>

In [None]:
# Benchmark every candidate classifier on the same train/test partition.
accuracy_list = check_various_classifiers(
    X_train=X_train,
    Y_train=Y_train,
    X_test=X_test,
    Y_test=Y_test,
)

<h2>Now we are sorting our returned list on the basis of accuracy</h2>

In [None]:
# Order the (accuracy, name, error) tuples in place, highest accuracy first.
accuracy_list.sort(key=lambda result: result[0], reverse=True)

<h3>Now we are going to display the final result, printing each model name along with its accuracy and error.</h3>

In [None]:
# Rank results best-first BEFORE deriving the plotting lists, so the lists and
# the printed report share one ordering even when this cell is run on its own.
accuracy_list.sort(key=lambda result: result[0], reverse=True)

# Parallel lists consumed by the bar plots in the following cells.
model_names = [name for (_, name, _) in accuracy_list]
accuracies = [acc for (acc, _, _) in accuracy_list]
error = [err for (_, _, err) in accuracy_list]

# The loop variable is deliberately NOT called `model`: at notebook top level
# that would overwrite the fitted LogisticRegression stored in `model` with a
# string and break any later re-run of the fit/predict cells.
for (acc, model_name, loss) in accuracy_list:
    print('Accuracy of ' + str(model_name) + ' ' + str(acc) + ' with loss of ' + str(loss))
    print('')

<h2>Now We are going to display the accuracies along with its models using barplots and doing the visualization</h2>

In [None]:
# Accuracy per model as a bar chart.
plt.figure(figsize=(10, 10))
# x/y must be keywords: seaborn >= 0.12 accepts only `data` positionally, so
# `sns.barplot(model_names, accuracies)` raises a TypeError there.
ax = sns.barplot(x=model_names, y=accuracies)
ax.set(xlabel='Model', ylabel='Accuracy')
# Long classifier names overlap when horizontal; tilt and right-align them.
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
plt.tight_layout()
plt.show()

<h2>Now We are going to display the mean_squared_error along with its models using barplots in seaborn</h2>

In [None]:
# Mean squared error per model as a bar chart.
plt.figure(figsize=(10, 10))
# x/y must be keywords: seaborn >= 0.12 accepts only `data` positionally, so
# `sns.barplot(model_names, error)` raises a TypeError there.
ax = sns.barplot(x=model_names, y=error)
ax.set(xlabel='Model', ylabel='Mean squared error')
# Long classifier names overlap when horizontal; tilt and right-align them.
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
plt.tight_layout()
plt.show()

<h2>From the visualization we can see that the classifier with the highest accuracy is ExtraTreesClassifier, with an accuracy of 99.986% and a loss of 0.0135%</h2>