In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, classification_report, precision_score, recall_score, f1_score, confusion_matrix, precision_recall_curve, accuracy_score
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
%matplotlib inline

In [2]:
# Load the credit-card transactions CSV into a DataFrame; the path is
# relative to the notebook's working directory.
train = pd.read_csv("../input/creditcardfraud/creditcard.csv")

# Data info

In [3]:
# Numeric summary of the columns, followed by dtypes and non-null counts.
train_summary = train.describe()
print(train_summary)
train.info()

Fortunately, there are no null values in our data.

In [4]:
# Keep only the two unscaled features and the label for a closer look.
df1 = train[['Time', 'Amount', 'Class']]
df1.head(5)

In [5]:
# Summary statistics for the 'Time', 'Amount' and 'Class' columns.
df1.describe()

In [6]:
# 'Class' labels each transaction as either 'normal' or 'fraudulent';
# count how many rows fall into each label.
print(df1['Class'].value_counts())

Genuine transactions far outnumber fraudulent ones.\
The raw numerical values of the 'Time' and 'Amount' variables are also not in the best form for classification models.\
Let's apply a scaling technique to the "Amount" and "Time" features to transform their range of values. We drop the original columns and add new columns with the scaled values.

In [7]:
# Replace 'Time' and 'Amount' with RobustScaler-scaled versions.
#
# `train1` is built as a NEW frame. Writing `train1 = train` would only alias
# the raw data, so adding the scaled columns and dropping the originals in
# place would also strip 'Time'/'Amount' from `train` and make this cell fail
# with a KeyError when it is run a second time.
#
# NOTE(review): the scaler is fitted on the full dataset before the
# train/test split, so test-set statistics influence the scaling. Consider
# fitting on the training rows only.
rs = RobustScaler()
train1 = train.drop(columns=['Time', 'Amount']).assign(
    # Keyword arguments are evaluated in order, so `rs` ends up fitted on
    # 'Amount', exactly as before.
    scaled_time=rs.fit_transform(train['Time'].values.reshape(-1, 1)).ravel(),
    scaled_amount=rs.fit_transform(train['Amount'].values.reshape(-1, 1)).ravel(),
)

In [8]:
# Summary statistics of the frame that holds the scaled time/amount columns.
train1.describe()

# Models without SMOTE

In [9]:
# Separate the target label from the feature matrix.
y = train1["Class"]
x = train1.drop(columns=["Class"])

In [10]:
# 70% for training and 30% for testing.
# Stratify on the label: fraudulent rows are heavily outnumbered, so a purely
# random split can give the train and test sets different fraud rates.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42, stratify=y
)
print("Shape of train_X: ", x_train.shape)
print("Shape of test_X: ", x_test.shape)

In [11]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

In [12]:
# Baseline decision tree trained on the original (non-resampled) training set.
# A fixed random_state makes the fitted tree, and therefore the report and
# confusion matrix below, reproducible across runs.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(x_train, y_train)
y_pred = dt.predict(x_test)

# Rows of the confusion matrix are actual labels, columns are predictions.
cnf_matrix = confusion_matrix(y_test, y_pred)
print(classification_report(y_test, y_pred))
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="YlGnBu", fmt='g')
plt.ylabel('Actual')
plt.xlabel('Predicted');  # trailing ';' hides the Text(...) repr

The data is imbalanced, and the results are poor and lack accuracy. \
Thus, we will apply SMOTE to balance the training data.

In [13]:
from imblearn.over_sampling import SMOTE

# Oversample the training split only, so the test set keeps its original
# class distribution.
sm = SMOTE(random_state=2)
x_train_s, y_train_s = sm.fit_resample(x_train, y_train)

# Class counts after resampling (fraud first, then genuine). The vectorised
# .sum() avoids iterating over every element in Python.
print((y_train_s == 1).sum(), (y_train_s == 0).sum())


In [14]:
# Decision tree trained on the SMOTE-resampled training set and evaluated on
# the untouched test set. A fixed random_state keeps the result reproducible.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(x_train_s, y_train_s)
dt_pred = dt.predict(x_test)

# Rows of the confusion matrix are actual labels, columns are predictions.
cnf_matrix = confusion_matrix(y_test, dt_pred)
print(classification_report(y_test, dt_pred))
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="YlGnBu", fmt='g')
plt.ylabel('Actual Label')
plt.xlabel('Predicted Label');  # trailing ';' hides the Text(...) repr

# Random Forest Classifier

In [15]:
# Random forest trained on the SMOTE-resampled training set.
# random_state fixes the bootstrap/feature sampling so the report is
# reproducible; n_jobs=-1 builds the trees on all available cores and does
# not change the fitted model when the seed is fixed.
rfc = RandomForestClassifier(random_state=42, n_jobs=-1)
rfc.fit(x_train_s, y_train_s)
rfc_pred = rfc.predict(x_test)
print(classification_report(y_test, rfc_pred))

In [16]:
# Confusion matrix for the random forest: rows are the actual labels,
# columns are the predicted labels.
rfc_cnf_matrix = confusion_matrix(y_test, rfc_pred)
heatmap_ax = sns.heatmap(
    pd.DataFrame(rfc_cnf_matrix),
    annot=True,
    cmap="YlGnBu",
    fmt='g',
)
heatmap_ax.set_ylabel('Actual Label')
heatmap_ax.set_xlabel('Predicted Label')

# Neural Network

In [17]:
# Multi-layer perceptron with a single hidden layer of 200 units, trained on
# the SMOTE-resampled training set.
# hidden_layer_sizes is written in its documented tuple form, and
# random_state fixes the weight initialisation and batch shuffling so the
# result is reproducible.
mlpc = MLPClassifier(hidden_layer_sizes=(200,), max_iter=500, random_state=42)
mlpc.fit(x_train_s, y_train_s)
mlpc_pred = mlpc.predict(x_test)

In [18]:
# Evaluate the neural network on the untouched test set.
print(classification_report(y_test, mlpc_pred))

# Stored under its own name so the random-forest confusion matrix
# (`rfc_cnf_matrix`) is not overwritten with the MLP's results.
mlpc_cnf_matrix = confusion_matrix(y_test, mlpc_pred)
sns.heatmap(pd.DataFrame(mlpc_cnf_matrix), annot=True, cmap="YlGnBu", fmt='g')
plt.ylabel('Actual Label')
plt.xlabel('Predicted Label');  # trailing ';' hides the Text(...) repr