In [26]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, Normalizer, scale
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as  np

In [27]:
creditcard = pd.read_csv('creditcard.csv')

In [28]:
creditcard.columns = [x.lower() for x in creditcard.columns]
creditcard.rename(columns = {'class' : 'fraud'}, inplace = True)

In [29]:
creditcard.fraud.value_counts(dropna = False)

0    284315
1       492
Name: fraud, dtype: int64

In [30]:
creditcard.groupby('fraud').amount.mean()

fraud
0     88.291022
1    122.211321
Name: amount, dtype: float64

In [31]:
creditcard.drop('time' , axis = 1, inplace = True)

In [32]:
scaler = StandardScaler()
creditcard['amount'] = scaler.fit_transform(creditcard['amount'].values.reshape(-1,1))

In [33]:
X = creditcard.iloc[:,:-1]
y = creditcard.iloc[:, -1]
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size = .33, stratify = y, random_state = 1)

In [35]:
clf = DecisionTreeClassifier()
clf.fit(Xtrain, ytrain)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [37]:
import pydotplus
dot_data = export_graphviz(clf, out_file = None)
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_pdf("credit2.pdf")

True

In [39]:
y_train_pred = clf.predict(Xtrain)
y_test_pred = clf.predict(Xtest)

In [40]:
from sklearn.metrics import confusion_matrix

In [42]:
confusion_matrix(ytrain, y_train_pred)

array([[190490,      0],
       [     0,    330]], dtype=int64)

In [43]:
confusion_matrix(ytest, y_test_pred)

array([[93778,    47],
       [   39,   123]], dtype=int64)

In [44]:
from sklearn.metrics import classification_report
from  sklearn.metrics import precision_recall_fscore_support


def pandas_classification_report(y_true, y_pred):
    metrics_summary = precision_recall_fscore_support(
            y_true=y_true, 
            y_pred=y_pred)

    avg = list(precision_recall_fscore_support(
            y_true=y_true, 
            y_pred=y_pred,
            average='weighted'))

    metrics_sum_index = ['precision', 'recall', 'f1-score', 'support']
    class_report_df = pd.DataFrame(
        list(metrics_summary),
        index=metrics_sum_index)

    support = class_report_df.loc['support']
    total = support.sum() 
    avg[-1] = total

    class_report_df['avg / total'] = avg

    return class_report_df.T

In [46]:
print(classification_report(y_true=ytest, y_pred=y_test_pred, digits=6))

             precision    recall  f1-score   support

          0   0.999584  0.999499  0.999542     93825
          1   0.723529  0.759259  0.740964       162

avg / total   0.999108  0.999085  0.999096     93987



In [48]:
clf.score(Xtest,ytest)

0.99908497983763711

In [51]:
clf.predict_proba(Xtest)

array([[ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       ..., 
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.]])

In [57]:
df_class_report = pandas_classification_report(y_true=ytest, y_pred=y_test_pred)
print(df_class_report)
df_class_report.to_csv('classDT.csv',  sep=',')

             precision    recall  f1-score  support
0             0.999584  0.999499  0.999542  93825.0
1             0.723529  0.759259  0.740964    162.0
avg / total   0.999108  0.999085  0.999096  93987.0


In [56]:
df = pd.DataFrame({'Class':y_test_pred, 'Feature' : Xtest['amount'] })
df.head()

Unnamed: 0,Class,Feature
180970,0,-0.093354
88377,0,0.031706
268802,0,-0.253277
109057,0,-0.349231
213415,0,-0.055892


In [59]:
df.to_csv("Predict_DT.csv")