<h1>Logistic Modeling</h1>

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import sklearn.linear_model

# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides any warnings scikit-learn may
# emit while fitting or scoring the models below (for example non-convergence
# or undefined-metric warnings) -- consider narrowing it to specific categories.
import warnings
warnings.filterwarnings("ignore")

import wrangle as wr
import explore as ex
import modeling as ml

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the raw dataset from the working directory and preview the first rows.
# NOTE(review): the preview shows an `Unnamed: 0` column, which usually means
# the CSV was written together with its index -- confirm that the wrangle
# helpers drop it so it is not picked up as a model feature.
df = pd.read_csv('Base.csv')
df.head()

Unnamed: 0,fraud_bool,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,payment_type,zip_count_4w,...,has_other_cards,proposed_credit_limit,foreign_request,source,session_length_in_minutes,device_os,keep_alive_session,device_distinct_emails_8w,device_fraud_count,month
0,0,0.3,0.986506,-1,25,40,0.006735,102.453711,AA,1059,...,0,1500.0,0,INTERNET,16.224843,linux,1,1,0,0
1,0,0.8,0.617426,-1,89,20,0.010095,-0.849551,AD,1658,...,0,1500.0,0,INTERNET,3.363854,other,1,1,0,0
2,0,0.8,0.996707,9,14,40,0.012316,-1.490386,AB,1095,...,0,200.0,0,INTERNET,22.730559,windows,0,1,0,0
3,0,0.6,0.4751,11,14,30,0.006991,-1.863101,AB,3483,...,0,200.0,0,INTERNET,15.215816,linux,1,1,0,0
4,0,0.9,0.842307,-1,29,40,5.742626,47.152498,AA,2339,...,0,200.0,0,INTERNET,3.743048,other,0,1,0,0


In [3]:
# Remove outliers with the project's wrangle helper.
# NOTE(review): this cell rebinds `df`, so re-running it feeds the already
# filtered frame back into the helper. The meaning of the second argument (4)
# is defined inside wr.remove_outliers and is not visible here -- TODO document.
df = wr.remove_outliers(df, 4)

In [4]:
# Split the data into train / validate / test frames.
# NOTE(review): the following cell rebinds train, validate and test from
# wr.prep_fraud_data(df) before these frames are used anywhere -- confirm
# whether this cell is still needed.
train, validate, test = wr.split_data(df)


    train -> (399820, 32)
    validate -> (171352, 32)
    test -> (142794, 32)


In [5]:
# Build the modeling splits from the cleaned frame, then scale each split.
train, validate, test = wr.prep_fraud_data(df)
train_scaled, validate_scaled, test_scaled = wr.scale_fraud_data(
    train, validate, test
)

# The scaled frames are treated as the final preprocessed inputs.
preprocessed_train = train_scaled
preprocessed_validate = validate_scaled
preprocessed_test = test_scaled

# Break the preprocessed splits into X / y pieces
# (presumably features and target -- see ml.break_em_out).
X_train, y_train, X_val, y_val, X_test, y_test = ml.break_em_out(
    preprocessed_train,
    preprocessed_validate,
    preprocessed_test,
)


    train -> (399820, 51)
    validate -> (171352, 51)
    test -> (142794, 51)


In [6]:
# Separate each split into its predictors (X) and its target column (y).
# NOTE(review): this rebinds X_train, y_train, X_test and y_test, which the
# previous cell obtained from ml.break_em_out on the *_scaled frames; from here
# on they are derived from the frames returned by wr.prep_fraud_data instead.
target_col = "fraud_bool"

X_train, y_train = train.drop(columns=[target_col]), train[target_col]
X_validate, y_validate = validate.drop(columns=[target_col]), validate[target_col]
X_test, y_test = test.drop(columns=[target_col]), test[target_col]

<b>Establish a Baseline</b>

In [7]:
# Class proportions of the target in the training split; the most frequent
# class is the one a naive baseline would always predict.
train["fraud_bool"].value_counts(normalize=True)

0    0.988577
1    0.011423
Name: fraud_bool, dtype: float64

In [8]:
# Baseline accuracy: the share of training rows whose fraud_bool is 0, i.e.
# the accuracy obtained by always predicting class 0.
baseline_accuracy = train["fraud_bool"].eq(0).mean()
round(baseline_accuracy, 4)

0.9886

<h3><b>Model 1 - Top 5 Ranked Features</b></h3>

In [16]:
# Model 1: logistic regression with default hyperparameters on five features.
logit = LogisticRegression(random_state=123)

# Columns used by this model
features = [
    'customer_age',
    'credit_risk_score',
    'proposed_credit_limit',
    'device_os_encoded',
    'device_os_windows',
]

# Fit a model using only these specified features
logit.fit(X_train[features], y_train)

# Since we .fit on a subset, we .predict on that same subset of features
y_pred = logit.predict(X_train[features])
y_pred1 = logit.predict(X_validate[features])

# Show the baseline with the same 4-decimal precision as the model accuracy.
# Rounded to 2 places it printed 0.99, which made the model's 0.9886 look
# worse than a baseline it only ties.
print("Baseline is {:.4f}".format(baseline_accuracy))
print("Logistic Regression using Customer Age, Credit Risk Score, Proposed Credit Limit, Device OS Encoded and Device OS Windows")
print('Accuracy of Logistic Regression classifier on training set: {:.4f}'
     .format(logit.score(X_train[features], y_train)))

# Per-class precision / recall / f1 on the validation split
print('Logit1 model using Customer Age, Credit Risk Score, Proposed Credit Limit, Device OS Encoded and Device OS Windows')
print(classification_report(y_validate, y_pred1))

Baseline is 0.99
Logistic Regression using Customer Age, Credit Risk Score, Proposed Credit Limit, Deviced OS Encoded and Device OS Windows
Accuracy of Logistic Regression classifier on training set: 0.9886
Logit1 model using Customer Age, Credit Risk Score, Proposed Credit Limit, Deviced OS Encoded and Device OS Windows
              precision    recall  f1-score   support

           0       0.99      1.00      0.99    169359
           1       0.00      0.00      0.00      1993

    accuracy                           0.99    171352
   macro avg       0.49      0.50      0.50    171352
weighted avg       0.98      0.99      0.98    171352



<h3><b>Model 2 - Top 10 Ranked Features</b></h3>

In [17]:
# Model 2: logistic regression with default hyperparameters on ten features.
logit1 = LogisticRegression(random_state=123)

# Columns used by this model
features = [
    'income',
    'name_email_similarity',
    'prev_address_months_count',
    'current_address_months_count',
    'customer_age',
    'date_of_birth_distinct_emails_4w',
    'credit_risk_score',
    'proposed_credit_limit',
    'keep_alive_session',
    'device_distinct_emails_8w',
]

# Fit a model using only these specified features
logit1.fit(X_train[features], y_train)

# Predict on the same feature subset for the training and validation splits
y_pred = logit1.predict(X_train[features])
y_pred1 = logit1.predict(X_validate[features])

print("Logistic Regression using the Top 10 features")
print('Accuracy of Logistic Regression classifier on training set: {:.4f}'
     .format(logit1.score(X_train[features], y_train)))

# Per-class precision / recall / f1 on the validation split
# (label previously read "using using")
print('Logit1 model using the Top 10 features')
print(classification_report(y_validate, y_pred1))

Logistic Regression using the Top 10 features
Accuracy of Logistic Regression classifier on training set: 0.9886
Logit1 model using using the Top 10 features
              precision    recall  f1-score   support

           0       0.99      1.00      0.99    169359
           1       0.00      0.00      0.00      1993

    accuracy                           0.99    171352
   macro avg       0.49      0.50      0.50    171352
weighted avg       0.98      0.99      0.98    171352



<h3><b>Model 3 - All Features</b></h3>

In [18]:
# Model 3: every column of X_train, all hyperparameters left at their defaults
logit2 = LogisticRegression(random_state=123)
logit2.fit(X_train, y_train)

# Predictions for the training and validation splits
y_pred = logit2.predict(X_train)
y_pred1 = logit2.predict(X_validate)

# Training accuracy, followed by the per-class report on the validation split
print("Model trained on all features")
print(f"Accuracy of Logistic Regression classifier on training set: {logit2.score(X_train, y_train):.4f}")
print("Logit2 model using all features and all model defaults")
print(classification_report(y_validate, y_pred1))

Model trained on all features
Accuracy of Logistic Regression classifier on training set: 0.9886
Logit2 model using all features and all model defaults
              precision    recall  f1-score   support

           0       0.99      1.00      0.99    169359
           1       0.00      0.00      0.00      1993

    accuracy                           0.99    171352
   macro avg       0.49      0.50      0.50    171352
weighted avg       0.98      0.99      0.98    171352



In [19]:
# Evaluate the logistic model on the held-out test split via the project helper.
# X_validate / y_validate are passed instead of X_val / y_val: the latter came
# from ml.break_em_out on the scaled frames, while X_train and X_test were
# rebound from the wr.prep_fraud_data frames, so the original call mixed splits
# from two different preprocessing paths.
ml.eval_logit_on_Test(train, X_train, y_train, validate, X_validate, y_validate, test, X_test, y_test, baseline_accuracy)

Logit1 model using Customer Age, Credit Risk Score, Proposed Credit Limit, Deviced OS Encoded and Device OS Windows
              precision    recall  f1-score   support

           0       0.99      1.00      0.99    141133
           1       0.00      0.00      0.00      1661

    accuracy                           0.99    142794
   macro avg       0.49      0.50      0.50    142794
weighted avg       0.98      0.99      0.98    142794

