In [1]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, Normalizer, scale
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as  np

In [2]:
creditcard = pd.read_csv('cleanData.csv')
creditcard.columns = [x.lower() for x in creditcard.columns]
creditcard.rename(columns = {'class' : 'fraud'}, inplace = True)
creditcard

Unnamed: 0,unnamed: 0,time,v1,v2,v3,v4,v5,v6,v7,v9,...,v10_,v11_,v12_,v14_,v16_,v17_,v18_,v19_,v21_,normal
0,0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.363787,...,0,0,0,0,0,0,0,0,0,1.0
1,1,0.0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,-0.255425,...,0,0,0,0,0,0,0,0,0,1.0
2,2,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,-1.514654,...,0,0,0,0,1,0,0,0,0,1.0
3,3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,-1.387024,...,0,0,0,0,0,0,0,0,0,1.0
4,4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,0.817739,...,0,0,0,0,0,0,0,0,0,1.0
5,5,2.0,-0.425966,0.960523,1.141109,-0.168252,0.420987,-0.029728,0.476201,-0.568671,...,0,0,0,0,0,0,0,0,0,1.0
6,6,4.0,1.229658,0.141004,0.045371,1.202613,0.191881,0.272708,-0.005159,0.464960,...,0,0,0,0,0,0,0,0,0,1.0
7,7,7.0,-0.644269,1.417964,1.074380,-0.492199,0.948934,0.428118,1.120631,0.615375,...,0,0,0,0,0,0,0,0,1,1.0
8,8,7.0,-0.894286,0.286157,-0.113192,-0.271526,2.669599,3.721818,0.370145,-0.392048,...,0,0,0,0,0,0,0,0,0,1.0
9,9,9.0,-0.338262,1.119593,1.044367,-0.222187,0.499361,-0.246761,0.651583,-0.736727,...,0,0,0,0,0,0,0,0,0,1.0


In [3]:
creditcard.fraud.value_counts(dropna = False)

0    284315
1       492
Name: fraud, dtype: int64

In [4]:
creditcard.groupby('fraud').amount.mean()

fraud
0     88.291022
1    122.211321
Name: amount, dtype: float64

In [5]:
creditcard.drop('time' , axis = 1, inplace = True)

In [6]:
scaler = StandardScaler()
creditcard['amount'] = scaler.fit_transform(creditcard['amount'].values.reshape(-1,1))

In [9]:
X = creditcard.iloc[:,:-1]
y = creditcard.iloc[:, -1]
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, y, test_size = .33, stratify = y, random_state = 1)

In [10]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth = 6, random_state=0)
clf.fit(Xtrain, Ytrain)
clf.score(Xtrain,Ytrain),clf.score(Xtest,Ytest)

(0.99977989728539984, 0.99970208645876557)

In [11]:
y_train_pred = clf.predict(Xtrain)
y_test_pred = clf.predict(Xtest)

In [12]:
from sklearn.metrics import confusion_matrix

In [13]:
confusion_matrix(ytrain, y_train_pred)

array([[   290,     40],
       [     2, 190488]], dtype=int64)

In [14]:
confusion_matrix(ytest, y_test_pred)

array([[  138,    24],
       [    4, 93821]], dtype=int64)

In [15]:
from sklearn.metrics import classification_report
from  sklearn.metrics import precision_recall_fscore_support


def pandas_classification_report(y_true, y_pred):
    metrics_summary = precision_recall_fscore_support(
            y_true=y_true, 
            y_pred=y_pred)

    avg = list(precision_recall_fscore_support(
            y_true=y_true, 
            y_pred=y_pred,
            average='weighted'))

    metrics_sum_index = ['precision', 'recall', 'f1-score', 'support']
    class_report_df = pd.DataFrame(
        list(metrics_summary),
        index=metrics_sum_index)

    support = class_report_df.loc['support']
    total = support.sum() 
    avg[-1] = total

    class_report_df['avg / total'] = avg

    return class_report_df.T

In [16]:
print(classification_report(y_true=ytest, y_pred=y_test_pred, digits=6))

             precision    recall  f1-score   support

        0.0   0.971831  0.851852  0.907895       162
        1.0   0.999744  0.999957  0.999851     93825

avg / total   0.999696  0.999702  0.999692     93987



In [17]:
clf.score(Xtest,ytest)

0.99970208645876557

In [18]:
clf.predict_proba(Xtest)

array([[  1.46725084e-04,   9.99853275e-01],
       [  1.50440976e-04,   9.99849559e-01],
       [  1.50440976e-04,   9.99849559e-01],
       ..., 
       [  1.17746401e-04,   9.99882254e-01],
       [  1.32130089e-04,   9.99867870e-01],
       [  1.32130089e-04,   9.99867870e-01]])

In [19]:
df_class_report = pandas_classification_report(y_true=ytest, y_pred=y_test_pred)
print(df_class_report)
df_class_report.to_csv('classDT.csv',  sep=',')

             precision    recall  f1-score  support
0             0.971831  0.851852  0.907895    162.0
1             0.999744  0.999957  0.999851  93825.0
avg / total   0.999696  0.999702  0.999692  93987.0


In [20]:
df = pd.DataFrame({'Class':y_test_pred, 'Feature' : Xtest['amount'] })
df.head()

Unnamed: 0,Class,Feature
206218,1.0,-0.317726
21380,1.0,-0.08136
145812,1.0,-0.326202
164815,1.0,-0.325283
268205,1.0,-0.350071


In [21]:
df.to_csv("Predict_RF.csv")