# Logistic Regression

content

1. theory

    - model
    - when to use
    - assumptions
    - goodness of fit
    - assymptotics
    - pros and cons
    - alternatives

2. computational example "creditcard_fraud"

    - describe Dataset
    - load and prepare data
    - define functions
    - computation
    - goodness of fit

3. business cases

## 2. computational example "creditcard_fraud"

The dataset provides infromation of creditcard holders with a dummy variable for fraud (Class). The original information has been mapped to principal components by PCA for privacy reasons.

Can we predict the case of fraud out of these principal components?

In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.linear_model import LogisticRegression

import matplotlib.pyplot as plt
% matplotlib inline
from matplotlib import mlab

In [2]:
df = pd.read_csv("../data/creditcard_fraud_subsample.csv", sep = ';')
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,129095.0,-1.83694,-1.646764,-3.381168,0.473354,0.074243,-0.446751,3.791907,-1.351045,0.095186,...,0.010663,1.786681,-0.151178,-0.582098,-0.956062,-0.334369,0.7156,0.37045,720.8,1
1,69394.0,1.140431,1.134243,-1.429455,2.012226,0.6228,-1.152923,0.221159,0.037372,0.034486,...,-0.367136,-0.891627,-0.160578,-0.108326,0.668374,-0.352393,0.071993,0.113684,1.0,1
2,148476.0,-1.125092,3.682876,-6.556168,4.016731,-0.425571,-2.03121,-2.650137,1.131249,-2.94689,...,1.18558,1.348156,-0.053686,0.284122,-1.174469,-0.087832,0.71879,0.676216,0.76,1
3,48533.0,1.243848,0.524526,-0.538884,1.209196,0.479538,-0.197429,0.049166,0.037792,0.128119,...,-0.05166,-0.084089,-0.192846,-0.917392,0.681953,-0.194419,0.045917,0.040136,1.0,1
4,154493.0,-7.381547,-7.449015,-4.696287,3.728439,6.198304,-6.406267,-5.831452,1.457175,-0.646203,...,1.176575,-0.978692,-0.27833,-0.635874,0.123539,0.404729,0.704915,-1.229992,35.0,1


Remove the feature 'Time' from the dataset and standardize the feature 'Amount' by substracting the mean and scaling to unit variance.

In [3]:
df = df.drop(['Time'], axis=1)
df['Amount_norm'] = (df.Amount-df.Amount.mean())/df.Amount.std()
df.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V22,V23,V24,V25,V26,V27,V28,Amount,Class,Amount_norm
0,-1.83694,-1.646764,-3.381168,0.473354,0.074243,-0.446751,3.791907,-1.351045,0.095186,-0.0845,...,1.786681,-0.151178,-0.582098,-0.956062,-0.334369,0.7156,0.37045,720.8,1,2.625216
1,1.140431,1.134243,-1.429455,2.012226,0.6228,-1.152923,0.221159,0.037372,0.034486,-1.879644,...,-0.891627,-0.160578,-0.108326,0.668374,-0.352393,0.071993,0.113684,1.0,1,-0.36063
2,-1.125092,3.682876,-6.556168,4.016731,-0.425571,-2.03121,-2.650137,1.131249,-2.94689,-4.816401,...,1.348156,-0.053686,0.284122,-1.174469,-0.087832,0.71879,0.676216,0.76,1,-0.361625
3,1.243848,0.524526,-0.538884,1.209196,0.479538,-0.197429,0.049166,0.037792,0.128119,-0.552903,...,-0.084089,-0.192846,-0.917392,0.681953,-0.194419,0.045917,0.040136,1.0,1,-0.36063
4,-7.381547,-7.449015,-4.696287,3.728439,6.198304,-6.406267,-5.831452,1.457175,-0.646203,-4.029129,...,-0.978692,-0.27833,-0.635874,0.123539,0.404729,0.704915,-1.229992,35.0,1,-0.219592


In [4]:
def train_test_fn(df , proportion):

    msk = np.random.rand(len(df)) < proportion

    train = df[msk]

    test = df[~msk]
    
    return train, test

def frange(start, stop, step):
     i = start
     while i < stop:
         yield i
         i += step
        
def confusion_matrix_fn(truth,result):
    c11 = sum(truth*result)
    c10 = sum(truth*(1-result))
    c01 = sum((1-truth)*result)
    c00 = sum((1-truth)*(1-result))
    tab = np.matrix([[c11, c10], [c01, c00]])
    return tab

def precision_fn(truth,result):
    c11 = sum(truth*result)
    #c10 = sum(truth*(1-result))
    c01 = sum((1-truth)*result)
    #c00 = sum((1-truth)*(1-result))
    prec = c11/(c11+c01)
    return prec

def recall_fn(truth,result):
    c11 = sum(truth*result)
    c10 = sum(truth*(1-result))
    #c01 = sum((1-truth)*result)
    #c00 = sum((1-truth)*(1-result))
    rec = c11/(c11+c10)
    return rec

def F_score_fn(truth,result):
    c11 = sum(truth*result)
    c10 = sum(truth*(1-result))
    c01 = sum((1-truth)*result)
    prec = c11/(c11+c01)
    rec = c11/(c11+c10)
    F=2/(1/prec+1/rec)
    
    return F

In [None]:
cols = []
for i in range(29):
    cols.append('beta' + str(i+1))

In [5]:
accuracies = []
precisions = []
recalls = []
F_scores = []

proportion = 0.6
iterations = 100

for i in range(iterations-1):
    train, test = train_test_fn(df, proportion) 
    X = train.drop(['Amount','Class'], axis=1)
    y = train['Class']
    
    logisticRegr = LogisticRegression(penalty = 'l2' ,solver = 'liblinear')
    logistic_model = logisticRegr.fit(X, y) 
    
    y_predicted = logistic_model.predict(X_test)
    accuracies.append(np.mean(np.equal(y_predicted,y_test))) 
    precisopns.append(precision_fn(y_predicted,y_test))
    recalls.append(recall_fn(y_predicted,y_test))
    F_scores.append(F_score_fn(y_predicted,y_test))
    

NameError: name 'X_test' is not defined

In [None]:
coeffs

In [None]:

plt.figure(1,figsize=(15,5))
plt.hist(accuracies , normed = 1)
mu = np.mean(accuracies)
sigma = np.std(accuracies)
x = np.linspace(mu - 3*sigma, mu + 3*sigma, 100)
plt.plot(x,mlab.normpdf(x, mu, sigma) , color="black")
plt.title("accuracies histogram and normal distribution")
plt.xlabel("accuracy")
plt.ylabel("frequency")
plt.show()

In [None]:
plt.figure(1,figsize=(15,5))
plt.scatter(accuracies, coeffs.beta1)
plt.title("beta1 estimates against model accuracy")
plt.xlabel("accuracy")
plt.ylabel("beta1 estimaes")
plt.show()

In [None]:
#coeffs = pd.DataFrame([])
accuracies = []
proportions = []

proportion = 0.1
iterations = 100

for proportion in frange(0.05,0.9,0.05):

    for i in range(iterations-1):
        train, test = train_test_fn(df, proportion) 
        X = train.drop(['Amount','Class'], axis=1)
        y = train['Class']
    
        logisticRegr = LogisticRegression(penalty = 'l2' ,solver = 'liblinear')
        logistic_model = logisticRegr.fit(X, y) 
    
        #coeffs = coeffs.append(pd.DataFrame(logistic_model.coef_, columns=cols))
        accuracies.append(np.mean(np.equal(logistic_model.predict(X_test),y_test))) 
        proportions.append(proportion)

qualities = pd.DataFrame(list(zip(proportions,accuracies)), columns=['proportions','accuracies'])

In [None]:
qualities

In [None]:
plt.figure(1,figsize=(15,5))
plt.scatter(qualities.proportions, qualities.accuracies)
plt.title("accuracies against proportions")
plt.xlabel("proportion")
plt.ylabel("accuracy")
plt.show()

In [None]:
matrix_logistic = confusion_matrix_fn(logistic_model.predict(X_test),y_test)
matrix_logistic

In [None]:
precision_fn(logistic_model.predict(X_test),y_test)
recall_fn(logistic_model.predict(X_test),y_test)
F_score_fn(logistic_model.predict(X_test),y_test)

In [None]:
recall_fn(logistic_model.predict(X_test),y_test) # logistische Regression mit Formel beurteilen

In [None]:
F_score_fn(logistic_model.predict(X_test),y_test) # logistische Regression mit Formel beurteilen

In [None]:
import matplotlib.pyplot as plt
from scipy import integrate

def capcurve(y_values, y_preds_proba):
    num_pos_obs = np.sum(y_values)
    num_count = len(y_values)
    rate_pos_obs = float(num_pos_obs) / float(num_count)
    ideal = pd.DataFrame({'x':[0,rate_pos_obs,1],'y':[0,1,1]})
    xx = np.arange(num_count) / float(num_count - 1)

    y_cap = np.c_[y_values,y_preds_proba]
    y_cap_df_s = pd.DataFrame(data=y_cap)
    y_cap_df_s = y_cap_df_s.sort_values([1], ascending=False).reset_index('index', drop=True)

    print(y_cap_df_s.head(20))

    yy = np.cumsum(y_cap_df_s[0]) / float(num_pos_obs)
    yy = np.append([0], yy[0:num_count-1]) #add the first curve point (0,0) : for xx=0 we have yy=0

    percent = 0.5
    row_index = np.trunc(num_count * percent)

    val_y1 = yy[row_index]
    val_y2 = yy[row_index+1]
    if val_y1 == val_y2:
        val = val_y1*1.0
    else:
        val_x1 = xx[row_index]
        val_x2 = xx[row_index+1]
        val = val_y1 + ((val_x2 - percent)/(val_x2 - val_x1))*(val_y2 - val_y1)

    sigma_ideal = 1 * xx[num_pos_obs - 1 ] / 2 + (xx[num_count - 1] - xx[num_pos_obs]) * 1
    sigma_model = integrate.simps(yy,xx)
    sigma_random = integrate.simps(xx,xx)

    ar_value = (sigma_model - sigma_random) / (sigma_ideal - sigma_random)
    #ar_label = 'ar value = %s' % ar_value

    fig, ax = plt.subplots(nrows = 1, ncols = 1)

    ax.plot(ideal['x'],ideal['y'], color='grey', label='Perfect Model')
    ax.plot(xx,yy, color='red', label='User Model')
    #ax.scatter(xx,yy, color='red')
    ax.plot(xx,xx, color='blue', label='Random Model')
    ax.plot([percent, percent], [0.0, val], color='green', linestyle='--', linewidth=1)
    ax.plot([0, percent], [val, val], color='green', linestyle='--', linewidth=1, label=str(val*100)+'% of positive obs at '+str(percent*100)+'%')

    plt.xlim(0, 1.02)
    plt.ylim(0, 1.25)
    plt.figure(1,figsize=(15,5))
    plt.title("CAP Curve - a_r value ="+str(ar_value))
    plt.xlabel('% of the data')
    plt.ylabel('% of positive obs')
    plt.legend()
    plt.show()

In [None]:
#fitting the classifier to the training set
from sklearn.ensemble import RandomForestClassifier
classifier = LogisticRegression(penalty = 'l2' ,solver = 'liblinear')
classifier.fit(X, y) 


In [None]:
#Then we create the CAP Curve with the following code :

y_pred_proba = classifier.predict_proba(X=X_test)
capcurve(y_values=y_test, y_preds_proba=y_pred_proba[:,1])

In [None]:
sklearn.metrics.roc_curve(y_true, y_score, pos_label=None, sample_weight=None, drop_intermediate=True)

In [None]:
sklearn.metrics.roc_curve(y_test, logistic_model.predict(X_test), pos_label=None, sample_weight=None, drop_intermediate=True)

In [None]:
# calculate the fpr and tpr for all thresholds of the classification
probs = logistic_model.predict(X_test)
preds = probs[:,1]
fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)

# method I: plt
import matplotlib.pyplot as plt
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
