In [28]:
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import numpy as np 
%matplotlib inline

from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.svm import SVC  
from sklearn.naive_bayes import GaussianNB

from imblearn import under_sampling, over_sampling
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,recall_score,classification_report, accuracy_score, precision_score
import warnings
warnings.filterwarnings('ignore')

# Read the credit-card transactions CSV into `data` and preview the first rows.
# NOTE(review): the path is an absolute local path; it has to be adjusted on
# any other machine.
csv_path = 'E:/creditcardfraud/creditcard.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [None]:
# Class balance: number and share of rows with Class == 0 (normal) and
# Class == 1 (fraud). `countf` is reused by later cells.
countnf = int((data["Class"] == 0).sum())
countf = int((data["Class"] == 1).sum())

percentnf = countnf / (countnf + countf)
percentf = countf / (countnf + countf)

print("Normal transacations: ", percentnf * 100, "%")
print("Fraud transacations", percentf * 100, "%")

In [30]:
def data_split(data):
    """Split the notebook-level ``X`` / ``y`` into train and test partitions.

    The split keeps 30 % of the rows for testing and uses ``random_state=1``.
    The shapes of the two feature partitions are printed.

    Note: the ``data`` argument is accepted but not used; the function reads
    the global feature frame ``X`` and target ``y``, which must already exist.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)``.
    """
    parts = train_test_split(X, y, test_size=0.3, random_state=1)
    train_features, test_features = parts[0], parts[1]
    print('Training data: ', train_features.shape)
    print('Testing data: ', test_features.shape)
    return tuple(parts)

In [31]:
def undersample(normal_indices, fraud_indices, times, frame=None):
    """Build an undersampled frame: all fraud rows plus a random subset of normal rows.

    Args:
        normal_indices: array-like of positional row indices of normal rows.
        fraud_indices: array-like of positional row indices of fraud rows.
        times: number of normal rows to draw per fraud row
            (``times * len(fraud_indices)`` normal rows are sampled).
        frame: DataFrame to take the rows from. Defaults to the notebook-level
            ``data`` frame when omitted.

    Returns:
        DataFrame with the fraud rows followed by the sampled normal rows.

    Raises:
        ValueError: raised by ``np.random.choice`` when more normal rows are
            requested than ``normal_indices`` contains (sampling is without
            replacement).
    """
    if frame is None:
        frame = data  # fall back to the notebook-level frame

    # Size the sample from the fraud indices that were actually passed in,
    # rather than from a global fraud count.
    n_normal = times * len(fraud_indices)
    sampled_normal = np.random.choice(normal_indices, n_normal, replace=False)

    selected = np.concatenate([fraud_indices, sampled_normal])
    # The original computed this frame and discarded it; return it.
    return frame.iloc[selected, :]

In [67]:
def train(model,X_train,X_test,y_train,y_test):
    """Fit ``model``, then plot its confusion matrix and print test metrics.

    Args:
        model: estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: training features.
        X_test: test features.
        y_train: training labels (Series, single-column DataFrame or array);
            flattened to 1-D before fitting.
        y_test: test labels, with 0 = normal and 1 = fraud.

    Side effects:
        Prints the model, shows a confusion-matrix heatmap, and prints recall
        and precision (both as percentages) followed by the classification
        report. Nothing is returned.
    """
    print('\n\n\n------------------------------------------------------------------------------')
    print(model)

    # np.ravel accepts Series, single-column DataFrames and plain arrays alike.
    model.fit(X_train, np.ravel(y_train))
    pred = model.predict(X_test)

    class_names = ['Normal','Fraud']
    # Fix the label order so the matrix is always 2x2, even when one class is
    # missing from both y_test and the predictions.
    cm = confusion_matrix(y_test, pred, labels=[0, 1])
    df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)

    fig, ax = plt.subplots(figsize=(6,3))
    heatmap = sns.heatmap(df_cm, annot=True, fmt="d", ax=ax)
    heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right')
    heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=45, ha='right')
    plt.show()

    # Both metrics are scaled to percentages; precision was previously printed
    # with a '%' sign while still being a 0-1 fraction.
    rec = float(recall_score(y_test, pred))*100
    pre = float(precision_score(y_test, pred))*100
    print("Recall: ", rec, '%')
    print("Precision: ", pre, '%')

    print("\n----------Classification Report------------------------------------")
    print(classification_report(y_test,pred))

In [33]:
# Feature matrix: every column except the target, with `Time` and `Amount`
# replaced by RobustScaler-transformed copies appended as the last two columns.
# NOTE(review): both scalers are fitted on the full dataset, i.e. before any
# train/test split is made.
X = (
    data.drop(columns='Class')
    .assign(**{
        "Normalized Amount": RobustScaler().fit_transform(data['Amount'].values.reshape(-1, 1)),
        "Normalized Time": RobustScaler().fit_transform(data['Time'].values.reshape(-1, 1)),
    })
    .drop(columns=["Time", "Amount"])
)

# Target vector: the 0/1 class label.
y = data['Class']

Unbalanced data
--

In [None]:
# Baseline: fit and evaluate two classifiers on the original (unbalanced) split.
baseline_split = data_split(data)
X_train, X_test, y_train, y_test = baseline_split

xtt = [LogisticRegression(), GaussianNB()]
for model in xtt:
    train(model, *baseline_split)

Random Undersampling
--

In [None]:
# Row labels of each class over the full dataset (kept for later reference).
fraud = np.array(data[data.Class==1].index)
normal = np.array(data[data.Class==0].index)

X_train,X_test,y_train,y_test = data_split(data)
ratio = 1  # normal rows kept per fraud row

# Undersample inside the training partition only, so that no row of X_test can
# end up in the data the models are fitted on.
train_fraud = np.array(y_train.index[y_train == 1])
train_normal = np.array(y_train.index[y_train == 0])

# Seeded generator so the drawn subset is the same on every run.
us_rng = np.random.RandomState(0)
USnon_fraud = us_rng.choice(train_normal, ratio*len(train_fraud), replace=False)
USidx = np.concatenate([train_fraud, USnon_fraud])

# Take the rows from X_train (not from raw `data`) so the undersampled features
# have exactly the same columns as X_test, which the models are evaluated on.
USdata = X_train.loc[USidx].assign(Class=y_train.loc[USidx])

USX = USdata.drop('Class', axis=1)
USy = USdata['Class']
print("\nAfter Undersampling")
# Second split: hold out 30 % of the balanced sample.
USX_train, USX_test, USy_train, USy_test = train_test_split(USX, USy, test_size=0.3, random_state=0)
print('Training data: ',USX_train.shape)
print('Testing data: ',USX_test.shape)

In [None]:
# Classifiers fitted on the undersampled training data and evaluated on the
# original test split.
# An assignment is not allowed inside a list literal and `i` was never defined,
# so the k-NN model is created first, with an explicit neighbour count.
knn = KNeighborsClassifier(n_neighbors=5)
xtt = [
    RandomForestClassifier(),
    LogisticRegression(),
    knn,
    SVC(kernel='linear', gamma='auto'),
    SVC(kernel='poly', degree=8, gamma='auto'),
    SVC(kernel='rbf', gamma='auto'),
]
for model in xtt:
    train(model,USX_train,X_test,USy_train,y_test)

SMOTE
--

In [None]:
# Oversample the minority class of the training split with SMOTE until both
# classes have the same number of rows (sampling_strategy=1.0).
# NOTE(review): the name `os` would shadow the standard-library module if that
# is ever imported in this notebook.
os = SMOTE(sampling_strategy=1.0, random_state=0)

# `fit_resample` is the supported API; `fit_sample` was removed from
# imbalanced-learn.
osX, osy = os.fit_resample(X_train, y_train)

# Normalise both outputs to DataFrames with known column names, whether the
# sampler returned arrays or pandas objects.
osX = pd.DataFrame(data=osX, columns=X_train.columns)
osy = pd.DataFrame(data=osy, columns=["Class"])

print("length of oversampled data is ",len(osX))
print("Number of normal transcation in oversampled data",len(osy[osy["Class"]==0]))
print("No.of fraud transcation",len(osy[osy["Class"]==1]))

In [None]:
# Classifiers fitted on the SMOTE-oversampled training data and evaluated on
# the original test split.
# An assignment is not allowed inside a list literal and `i` was never defined,
# so the k-NN model is created first, with an explicit neighbour count.
knn = KNeighborsClassifier(n_neighbors=5)
xtt = [
    RandomForestClassifier(),
    LogisticRegression(),
    knn,
    SVC(kernel='linear', gamma='auto'),
    SVC(kernel='poly', degree=8, gamma='auto'),
]
for model in xtt:
    train(model,osX,X_test,osy,y_test)