# Logistic Regression

Logistic regression is a machine learning model used to make predictions according to linear function of the input features. Can be used for both regression and classification but in this case we willbe using it as a classifier

In [1]:
# Import Data
# MUTATE DATAFRAMES ACCORDING TO THE EXPLORATORY DATA ANALYSIS CODE

#For data Manipulation
import numpy as np
import pandas as pd
#In order to show all columns available
pd.set_option('display.max_columns', 200)

#Sklearn imports
from sklearn.preprocessing import LabelEncoder, Imputer

#Graphing libs
import matplotlib.pyplot as plt
import seaborn as sns

apptrain = pd.read_csv('../Dataset/application_train.csv')
apptest = pd.read_csv('../Dataset/application_test.csv')

# Code that modifies dataframes
apptrain['DAYS_EMPLOYED'].replace({365243: np.nan}, inplace = True)
apptest['DAYS_EMPLOYED'].replace({365243: np.nan}, inplace = True)
apptrain['DAYS_BIRTH'] = abs(apptrain['DAYS_BIRTH'])
apptest['DAYS_BIRTH'] = abs(apptrain['DAYS_BIRTH'])

FileNotFoundError: File b'../Dataset/application_train.csv' does not exist

# Prepare the Data

In [2]:
# One-hot encoding and dataframe alignment
le = LabelEncoder()
le_count = 0

# Iterate through columns
for col in apptrain:
    if apptrain[col].dtype == "object":
        if len(list(apptrain[col].unique())) <= 2:
            #train on the training data
            le.fit(apptrain[col])
            #transform both training and testing data
            apptrain[col] = le.transform(apptrain[col])
            apptest[col] = le.transform(apptest[col])
            
            le_count += 1
            

            
#One-Hot encoding
apptrain = pd.get_dummies(apptrain)
apptest = pd.get_dummies(apptest)



print('Training features shape: {}'.format(apptrain.shape))
print('Training features shape: {}'.format(apptest.shape))
print('{} columns were label encoded'.format(le_count))

Training features shape: (307511, 243)
Training features shape: (48744, 239)
3 columns were label encoded


In [3]:
# Take the labels out of the training dataset as an inner merge will erase them since the test dataset does not have the targets
train_labels = apptrain['TARGET']


#aligning the training and testing data, keep only columns present in both df's
apptrain, apptest = apptrain.align(apptest, join = 'inner', axis = 1)
apptrain['TARGET'] = train_labels

print('Training Features shape: ', apptrain.shape)
print('Testing features shape: ', apptest.shape)
print("We're back on track, remember the training dataset will have one column more since it DOES have the targets")

Training Features shape:  (307511, 240)
Testing features shape:  (48744, 239)
We're back on track, remember the training dataset will have one column more since it DOES have the targets


In [4]:
# Extra Dependencies
from sklearn.preprocessing import MinMaxScaler

# Creating base df's for machine learning model
training_data = apptrain.drop(columns = ['TARGET'])
testing_data = apptest.copy()

# In the dataframes we still have missing values, WE USE IMPUTATION HERE
imputer = Imputer(strategy = 'median')
imputer.fit(training_data)
imputer.fit(testing_data)
training_data = imputer.transform(training_data)
testing_data = imputer.transform(testing_data)


# We scale our data
scaler = MinMaxScaler(feature_range = (0,1))
scaler.fit(training_data)
training_data_scaled = scaler.transform(training_data)
testing_data_scaled = scaler.transform(testing_data)


print('training data shape', training_data_scaled.shape)
print('testing data shape', testing_data_scaled.shape)

training data shape (307511, 239)
testing data shape (48744, 239)


# LOGISTIC REGRESSION MODEL (BASE FEATURES)
The data is ready to be sent into the logistic regression model

In [5]:
from sklearn.linear_model import LogisticRegression

# Develop model with parameters
logistic_regression = LogisticRegression(C=0.0001, verbose=2, n_jobs=-1)

#train the model by giving it the data
logistic_regression.fit(training_data_scaled, train_labels)

  " = {}.".format(self.n_jobs))


[LibLinear]

LogisticRegression(C=0.0001, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=-1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=2, warm_start=False)

In [6]:
logistic_regression_predictions = logistic_regression.predict_proba(testing_data_scaled)[:, 1]

In [9]:
#Let's make a function that'll format and save our predictions for submissions to the Kaggle competition
def format_and_submit(predictions, desired_file_name):
    submit = apptest[['SK_ID_CURR']]
    submit['TARGET'] = predictions
    submit.to_csv('../Model_Predictions/{}.csv'.format(desired_file_name))

In [10]:
format_and_submit(logistic_regression_predictions, "Logistic_Regression_Untuned")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


# Using manually engineered features.

In [9]:
# Import our dataframe with the most useful features
fe_training_data = pd.read_csv('rf_important_features.csv')
fe_testing_data = pd.read_csv('rftest_important_features.csv')

train_labels =  fe_training_data['TARGET']
fe_training_data = fe_training_data.drop(columns = 'TARGET')

In [5]:
from sklearn.preprocessing import Imputer, MinMaxScaler
imputer = Imputer(strategy = 'median')

fe_training_data = imputer.fit_transform(fe_training_data)
fe_testing_data = imputer.transform(fe_testing_data)

scaler = MinMaxScaler(feature_range = (0,1))
scaler.fit(fe_training_data)
fe_training_data = scaler.transform(fe_training_data)
fe_testing_data = scaler.transform(fe_testing_data)

In [6]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(C=0.0001, verbose = 1, n_jobs = -1)
log_reg.fit(fe_training_data, train_labels)

  " = {}.".format(self.n_jobs))


[LibLinear]

LogisticRegression(C=0.0001, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=-1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=1, warm_start=False)

In [7]:
log_reg_predictions = log_reg.predict_proba(fe_testing_data)[:,1]

In [11]:
submit = fe_testing_data[['SK_ID_CURR']]
submit['TARGET'] = log_reg_predictions

submit.to_csv('../Model_Predictions/fe_logisticregression.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
