Train several models and evaluate how effectively they predict instances of fraud using data based on [this dataset from Kaggle](https://www.kaggle.com/dalpozz/creditcardfraud).
 
Each row in `fraud_data.csv` corresponds to a credit card transaction. Features include confidential variables `V1` through `V28` as well as `Amount` which is the amount of the transaction. 
 
The target is stored in the `Class` column, where a value of 1 corresponds to a fraudulent transaction and 0 corresponds to a legitimate (non-fraudulent) one.

In [1]:
import numpy as np
import pandas as pd

In [2]:
def percentage_fraud(path='fraud_data.csv'):
    """Return the fraction of observations that are fraud.

    Args:
        path: CSV file to read. It must contain a ``Class`` column in which
            the value 1 marks a fraudulent transaction.

    Returns:
        The number of rows with ``Class == 1`` divided by the total number
        of rows, as a float in [0, 1] (NaN if the file has no rows).
    """
    df = pd.read_csv(path)
    # The mean of a boolean mask is (fraud rows) / (all rows).  Dividing by
    # the non-fraud count instead would yield the odds, not the proportion.
    return float((df['Class'] == 1).mean())


In [15]:
# Build the module-level X_train, X_test, y_train, y_test used by the cells below.
from sklearn.model_selection import train_test_split

df = pd.read_csv('fraud_data.csv')

# Every column except the last is used as a feature; the last column is the target.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# random_state is fixed so the split is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)


In [41]:
def dummy_performance():
    """Score a majority-class baseline on the held-out split.

    Fits a ``DummyClassifier`` with the ``most_frequent`` strategy on the
    module-level ``X_train``/``y_train`` and evaluates it on
    ``X_test``/``y_test``.

    Returns:
        Tuple ``(accuracy, recall)``.
    """
    from sklearn.dummy import DummyClassifier
    from sklearn.metrics import recall_score

    baseline = DummyClassifier(strategy='most_frequent')
    baseline.fit(X_train, y_train)
    y_pred = baseline.predict(X_test)

    accuracy = baseline.score(X_test, y_test)
    recall = recall_score(y_test, y_pred, average='binary')
    return (accuracy, recall)

In [47]:
def SVC_performance():
    """Evaluate an SVC with default parameters on the held-out split.

    Trains on the module-level ``X_train``/``y_train`` and scores the
    predictions for ``X_test`` against ``y_test``.

    Returns:
        Tuple ``(accuracy, recall, precision)``.
    """
    from sklearn.metrics import recall_score, precision_score
    from sklearn.svm import SVC

    model = SVC()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    return (
        model.score(X_test, y_test),
        recall_score(y_test, y_pred, average='binary'),
        precision_score(y_test, y_pred, average='binary'),
    )


In [62]:
def SVC_perfromance_confusion_matrix():
    """Return the confusion matrix of a thresholded SVC on the test split.

    An SVC with ``C=1e9`` and ``gamma=1e-07`` is trained on the module-level
    ``X_train``/``y_train``.  Instead of calling ``predict``, a test sample
    is flagged as positive when its decision-function value exceeds -220.

    Returns:
        The result of ``confusion_matrix(y_test, flagged)``.
    """
    from sklearn.metrics import confusion_matrix
    from sklearn.svm import SVC

    classifier = SVC(C=1e9, gamma=1e-07).fit(X_train, y_train)
    scores = classifier.decision_function(X_test)
    # Boolean mask: True where the score clears the custom -220 threshold.
    flagged = scores > -220
    return confusion_matrix(y_test, flagged)

In [86]:
# Train a logistic regression, then draw a precision-recall curve and a ROC
# curve from y_test and the predicted probability of fraud for X_test.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_curve, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn


# Fit once and reuse the same model for probabilities and hard labels.
clf = LogisticRegression().fit(X_train, y_train)
y_prob_score = clf.predict_proba(X_test)
pred = clf.predict(X_test)

# predict_proba orders its columns like clf.classes_; with 0/1 labels,
# column 1 is the probability of class 1 (fraud).  The curves need these
# continuous scores: hard 0/1 predictions collapse each curve to a single
# operating point.
fraud_prob = y_prob_score[:, 1]

fpr, tpr, _ = roc_curve(y_test, fraud_prob)
precision, recall, threshold = precision_recall_curve(y_test, fraud_prob)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(precision, recall, label='precision-recall curve')
plt.xlabel('precision')
plt.ylabel('recall')
plt.legend()

plt.figure()
plt.plot(fpr, tpr, 'g', label='ROC curve (AUC = %0.3f)' % roc_auc)
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')
plt.legend()
plt.show()

In [220]:
def grid_search():
    """Grid-search a logistic regression over penalty and C, scored by recall.

    Runs a 3-fold cross-validated ``GridSearchCV`` on the module-level
    ``X_train``/``y_train`` over ``penalty`` in ``['l1', 'l2']`` and ``C`` in
    ``[0.01, 0.1, 1, 10, 100]``.

    Returns:
        numpy array of shape (5, 2) holding the mean cross-validated recall.
        Row ``i`` corresponds to the i-th value of ``C``; column 0 is the
        ``l1`` penalty and column 1 is the ``l2`` penalty.
    """
    from sklearn.model_selection import GridSearchCV
    from sklearn.linear_model import LogisticRegression

    penalties = ['l1', 'l2']
    c_values = [0.01, 0.1, 1, 10, 100]
    param = {'penalty': penalties, 'C': c_values}

    # liblinear handles both l1 and l2 penalties; naming it explicitly keeps
    # the search working on releases whose default solver rejects l1.
    # GridSearchCV clones and fits the estimator itself, so no pre-fit here.
    log = LogisticRegression(solver='liblinear')
    grid = GridSearchCV(log, param, cv=3, scoring='recall')
    grid.fit(X_train, y_train)

    # cv_results_ replaces the removed grid_scores_ attribute.  Each score is
    # placed by its own parameter values, so the layout does not depend on
    # the order in which candidates were evaluated.
    ans = np.empty((len(c_values), len(penalties)))
    results = grid.cv_results_
    for candidate, score in zip(results['params'], results['mean_test_score']):
        row = c_values.index(candidate['C'])
        col = penalties.index(candidate['penalty'])
        ans[row, col] = score

    return ans
grid_search()



array([[ 0.66666667,  0.76086957],
       [ 0.80072464,  0.80434783],
       [ 0.8115942 ,  0.8115942 ],
       [ 0.80797101,  0.8115942 ],
       [ 0.80797101,  0.80797101]])