In [None]:
!pip install xgboost

In [None]:
import numpy as np
import pandas as pd
pd.set_option("display.max_columns", 100)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import xgboost as xgb

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer

In [None]:
import warnings

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

warnings.filterwarnings("ignore")

In [None]:
# Load the transactions CSV into a DataFrame and preview the first rows.
# The path is a raw string so the Windows backslashes are never interpreted as
# escape sequences (the plain literal relied on "\C" / "\c" being unrecognised).
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
file_path = r'E:\Capstone DS\Credit Card fraud\creditcard1.csv'
df = pd.read_csv(file_path)
df.head()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Summary statistics produced by DataFrame.describe().
df.describe()

In [None]:
# Column dtypes, followed by the DataFrame.info() summary.
print(df.dtypes)
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" to the output.
df.info()

In [None]:
# Missing values per column: absolute count ("Total") and share of rows in
# percent ("Percent"), each sorted in descending order, shown as a 2-row table.
null_mask = df.isnull()
total = null_mask.sum().sort_values(ascending=False)
percent = (null_mask.sum() / null_mask.count() * 100).sort_values(ascending=False)
missing_summary = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_summary.transpose()

In [None]:
# Percentage of rows labelled 0 and labelled 1 in the "Class" column,
# each rounded to two decimals.
classes = df['Class'].value_counts()
n_labelled = df['Class'].count()
normal_share = round(classes[0] / n_labelled * 100, 2)
fraud_share = round(classes[1] / n_labelled * 100, 2)
normal_share, fraud_share

In [None]:
# Two side-by-side panels describing the class imbalance.
plt.figure(figsize=(15,7.5))

# Left panel: number of transactions per class.
plt.subplot(1,2,1)
# The column is passed by keyword: recent seaborn releases treat the first
# positional argument as `data`, so a bare positional Series is not reliably
# interpreted as the x variable.
sns.countplot(x="Class", data=df)
plt.ylabel("Number of transaction")
plt.xlabel("Class")
plt.title("Credit Card Fraud Class - data unbalance")

# Right panel: the same split expressed as percentages (computed earlier as
# normal_share / fraud_share).
plt.subplot(1,2,2)
fraud_percentage = {'Class':['Non-Fraudulent', 'Fraudulent'], 'Percentage':[normal_share, fraud_share]}
df_fraud_percentage = pd.DataFrame(fraud_percentage)
sns.barplot(x='Class',y='Percentage', data=df_fraud_percentage)
plt.title('Percentage of fraudulent vs non-fraudulent transcations')

plt.show()

In [None]:
# Scatter of transaction time against the class label.
# x and y are passed by keyword: recent seaborn releases accept only `data`
# positionally, so two positional vectors raise a TypeError there.
sns.scatterplot(x=df["Time"], y=df["Class"])
plt.title("Time vs Class scatter plot")
plt.show()

In [None]:
# Scatter of transaction amount against the class label.
# x and y are passed by keyword: recent seaborn releases accept only `data`
# positionally, so two positional vectors raise a TypeError there.
sns.scatterplot(x=df["Amount"], y=df["Class"])
plt.title("Amount vs Class scatter plot")
plt.show()

In [None]:
# Remove the "Time" column from the working frame.
# errors="ignore" keeps the cell re-runnable: once the column is gone, a second
# execution is a no-op instead of raising KeyError.
df = df.drop(columns="Time", errors="ignore")

In [None]:
# Per-feature density comparison between the two classes: one panel per
# feature, with the class-0 and class-1 curves overlaid.
var = list(df.columns.values)
var.remove("Class")
t0 = df.loc[df['Class'] == 0]
t1 = df.loc[df['Class'] == 1]

# A single figure is created (the earlier extra plt.figure() only produced an
# empty figure). The grid is sized from the number of features; with 4 columns
# and 3.5 inches per row this reproduces the former 8x4 / (16, 28) layout for
# 29-32 features.
n_cols = 4
n_rows = max(1, -(-len(var) // n_cols))  # ceiling division
fig, ax = plt.subplots(n_rows, n_cols, figsize=(16, 3.5 * n_rows), squeeze=False)
axes = ax.ravel()

for axis, feature in zip(axes, var):
    # `bw_method` is the replacement seaborn documents for the deprecated `bw`.
    sns.kdeplot(t0[feature], bw_method=0.5, label="0", ax=axis)
    sns.kdeplot(t1[feature], bw_method=0.5, label="1", ax=axis)
    axis.set_xlabel(feature, fontsize=12)
    axis.tick_params(axis='both', which='major', labelsize=12)
    # The labels above are only shown if a legend is requested explicitly.
    axis.legend()

# Hide grid cells that have no feature assigned to them.
for axis in axes[len(var):]:
    axis.set_visible(False)
plt.show();

In [None]:
# Separate the feature matrix from the "Class" target column.
X = df.drop(columns="Class")
y = df["Class"]

In [None]:
# 80/20 train/test split with a fixed seed.
# stratify=y keeps the class proportions of `y` in both splits; with a rare
# positive class an unstratified split can leave the test set with a
# noticeably different (or very small) number of positives.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Sum of the target over the full data, the training split and the test split
# (with 0/1 labels this is the number of class-1 rows in each).
for labels in (y, y_train, y_test):
    print(np.sum(labels))

In [None]:
# Standardise "Amount": the scaler is fitted on the training split only and the
# same fitted scaler is then applied to both splits.
amount_col = ["Amount"]
scaler = StandardScaler().fit(X_train[amount_col])
X_train[amount_col] = scaler.transform(X_train[amount_col])
X_test[amount_col] = scaler.transform(X_test[amount_col])

In [None]:
# First rows of the training features after scaling "Amount".
X_train.head()

In [None]:
# First rows of the test features after applying the same scaler.
X_test.head()

In [None]:
# Distribution (histogram + KDE) of every training feature, one panel each.
var = X_train.columns
# Grid rows are derived from the number of features (6 panels per row) instead
# of a fixed 5x6 layout, so additional columns do not break the subplot index.
n_cols = 6
n_rows = max(1, -(-len(var) // n_cols))  # ceiling division
plt.figure(figsize=(20,15))
for i, col in enumerate(var, start=1):
    plt.subplot(n_rows, n_cols, i)
    # histplot(kde=True, stat="density") is the replacement seaborn documents
    # for the deprecated distplot (histogram normalised to density plus KDE).
    sns.histplot(X_train[col], kde=True, stat="density")

plt.show()

In [None]:
# Skewness of each training feature, collected in a one-column frame
# ("Skewness") indexed by feature name ("Features") and displayed transposed.
var = X_train.columns
skew_list = [X_train[col].skew() for col in var]

tmp = pd.DataFrame({"Skewness": skew_list}, index=pd.Index(var, name="Features"))
tmp.T

In [None]:
# Features whose skewness is greater than 1 in absolute value.
is_skewed = tmp["Skewness"].abs() > 1
skewed = tmp.loc[is_skewed].index
print(skewed)

In [None]:
# Power-transform the skewed columns: the transformer is fitted on the training
# split and the fitted transformer is reused for the test split.
pt = PowerTransformer(copy=False)
train_skewed_transformed = pt.fit_transform(X_train[skewed])
test_skewed_transformed = pt.transform(X_test[skewed])
X_train[skewed] = train_skewed_transformed
X_test[skewed] = test_skewed_transformed

In [None]:
# Distribution (histogram + KDE) of every training feature, one panel each;
# this cell re-plots X_train in its current state.
var = X_train.columns

# Grid rows are derived from the number of features (6 panels per row) instead
# of a fixed 5x6 layout, so additional columns do not break the subplot index.
n_cols = 6
n_rows = max(1, -(-len(var) // n_cols))  # ceiling division
plt.figure(figsize=(20,15))
for i, col in enumerate(var, start=1):
    plt.subplot(n_rows, n_cols, i)
    # histplot(kde=True, stat="density") is the replacement seaborn documents
    # for the deprecated distplot (histogram normalised to density plus KDE).
    sns.histplot(X_train[col], kde=True, stat="density")

plt.show()

In [None]:
# Fraction of training rows per class (count divided by the number of rows).
y_train.value_counts() / len(y_train)

In [None]:
# Grid search over the inverse regularisation strength C of a logistic
# regression, scored by ROC AUC with 5-fold cross-validation.
params = {"C": [0.01, 0.1, 1, 10, 100, 1000]}
# Stratified folds keep the class proportions in every fold. With a rare
# positive class, plain KFold can produce folds with very few positives, which
# makes the per-fold ROC AUC noisy. This also matches the decision-tree search
# further down, whose integer `cv` is stratified for classifiers.
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)
model_cv = GridSearchCV(estimator = LogisticRegression(),
                        param_grid = params,
                        scoring= 'roc_auc',
                        cv = folds,
                        n_jobs=-1,
                        verbose = 1,
                        return_train_score=True)
model_cv.fit(X_train, y_train)
print('Best ROC AUC score: ', model_cv.best_score_)
print('Best hyperparameters: ', model_cv.best_params_)

In [None]:
# Tabulate the per-candidate results of the logistic-regression grid search.
cv_results = pd.DataFrame.from_dict(model_cv.cv_results_)
cv_results

In [None]:
# Mean cross-validated test and train score for each value of C (log x-axis).
plt.figure(figsize=(10, 8))
plt.plot(cv_results['param_C'], cv_results['mean_test_score'])
plt.plot(cv_results['param_C'], cv_results['mean_train_score'])
plt.xlabel('C')
# The grid search is scored with 'roc_auc', so the plotted values are ROC AUC,
# not sensitivity.
plt.ylabel('ROC AUC')
plt.legend(['test result', 'train result'], loc='upper left')
plt.xscale('log')
plt.show()

In [None]:
# Fit a logistic regression on the training split.
# NOTE(review): C=0.01 is hard-coded; presumably the value picked by the grid
# search - confirm against model_cv.best_params_.
log_reg_imb_model = LogisticRegression(C=0.01).fit(X_train, y_train)
log_reg_imb_model

In [None]:
def display_scores(y_test, y_pred):
    """Print the F1 score and the classification report for a set of predictions.

    Parameters
    ----------
    y_test : true labels (despite the name, any split's labels can be passed).
    y_pred : predicted labels for the same rows.

    The F1 score is printed as a percentage rounded to two decimals, followed
    by the text report from ``classification_report``. Nothing is returned.
    """
    f1_percent = round(f1_score(y_test, y_pred) * 100, 2)
    print(f"F1 Score: {f1_percent}%")
    report = classification_report(y_test, y_pred)
    print(f"Classification Report: \n {report}")

In [None]:
# Predicted class labels for the training split.
y_train_pred = log_reg_imb_model.predict(X_train)

In [None]:
# F1 score and classification report on the training split.
display_scores(y_train, y_train_pred)

In [None]:
def draw_roc( actual, probs ):
    """Plot the ROC curve for the given labels and scores.

    Parameters
    ----------
    actual : true binary labels.
    probs : scores for the positive class (e.g. predicted probabilities).

    Draws the curve, annotated with its AUC in the legend, together with the
    diagonal reference line, on a new 5x5 inch figure and shows it.
    Returns None.
    """
    # Thresholds are returned by roc_curve but not needed for the plot.
    fpr, tpr, _ = metrics.roc_curve(actual, probs, drop_intermediate=False)
    auc_score = metrics.roc_auc_score(actual, probs)

    fig, roc_ax = plt.subplots(figsize=(5, 5))
    roc_ax.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % auc_score)
    # Diagonal reference line from (0, 0) to (1, 1).
    roc_ax.plot([0, 1], [0, 1], 'k--')
    roc_ax.set_xlim([0.0, 1.0])
    roc_ax.set_ylim([0.0, 1.05])
    roc_ax.set_xlabel('False Positive Rate or [1 - True Negative Rate]')
    roc_ax.set_ylabel('True Positive Rate')
    roc_ax.set_title('Receiver operating characteristic example')
    roc_ax.legend(loc="lower right")
    plt.show()

In [None]:
# Predicted probability of the second class (column index 1) on the training split.
y_train_pred_proba = log_reg_imb_model.predict_proba(X_train)[:,1]

In [None]:
# ROC curve on the training split.
draw_roc(y_train, y_train_pred_proba)

In [None]:
# Predicted labels, F1 score and classification report on the test split.
y_test_pred = log_reg_imb_model.predict(X_test)
display_scores(y_test, y_test_pred)

In [None]:
# Predicted probability of the second class (column index 1) on the test split.
y_test_pred_proba = log_reg_imb_model.predict_proba(X_test)[:,1]

In [None]:
# ROC curve on the test split.
draw_roc(y_test, y_test_pred_proba)

In [None]:
# Grid search over decision-tree size limits, scored by ROC AUC with 3-fold
# cross-validation.
# The ranges expand to max_depth in {5, 10} and
# min_samples_leaf / min_samples_split in {50, 100}.
param_grid = {
    'max_depth': range(5, 15, 5),
    'min_samples_leaf': range(50, 150, 50),
    'min_samples_split': range(50, 150, 50),}
# A fixed random_state makes the fitted trees, and therefore the search
# results, reproducible across runs (42 matches the seed of the train/test split).
dtree = DecisionTreeClassifier(random_state=42)

grid_search = GridSearchCV(estimator = dtree,
                           param_grid = param_grid,
                           scoring= 'roc_auc',
                           cv = 3,
                           n_jobs=-1,
                           verbose = 1)
grid_search.fit(X_train,y_train)

In [None]:
# Per-candidate results of the decision-tree grid search.
# Note: this rebinds `cv_results`, which earlier held the logistic-regression results.
cv_results = pd.DataFrame(grid_search.cv_results_)
cv_results

In [None]:
# Best mean cross-validated score and the estimator with the best-scoring parameters.
print("Best roc auc score : ", grid_search.best_score_)
print(grid_search.best_estimator_)
40