# DSX Example: Build Classifier using Binary Logistic Regression
### Environment
```
Python 2.7
Spark 2.0
scikit-learn 0.17
```

### History
Notebook created 10 Feb 17

In [None]:
# Use this magik to get a list of all installed libraries and their versions
!pip list --isolated

## STEP 1:  Load Data
In the Data Preparation notebook, we built Training and Testing data sets base on the data from the UC Irvine Machine Learning Repository [ http://archive.ics.uci.edu/ml ] and saved them to Bluemix Object Store.
<p>  Here, we load the prepared data and then use the scikit-learn Logistic Regression algorithm to build a binary classifier.  Then we do some additional work to validate and improve the model.

In [18]:
import pandas as pd
import numpy as np

In [3]:
# The code was removed by DSX for sharing.

In [5]:
from io import StringIO
import requests
import json

# This function accesses a file in your Object Storage. The definition contains your credentials.
# You might want to remove those credentials before you share your notebook.
def get_object_storage_file(container, filename, credentials):
    """This functions returns a StringIO object containing
    the file content from Bluemix Object Storage."""

    url1 = ''.join(['https://identity.open.softlayer.com', '/v3/auth/tokens'])
    data = {'auth': {'identity': {'methods': ['password'],
            'password': {'user': {'name': credentials['username'],'domain': {'id': credentials['domain_id']},
            'password': credentials['password']}}}}}
    headers1 = {'Content-Type': 'application/json'}
    resp1 = requests.post(url=url1, data=json.dumps(data), headers=headers1)
    resp1_body = resp1.json()
    for e1 in resp1_body['token']['catalog']:
        if(e1['type']=='object-store'):
            for e2 in e1['endpoints']:
                        if(e2['interface']=='public'and e2['region']=='dallas'):
                            url2 = ''.join([e2['url'],'/', container, '/', filename])
    s_subject_token = resp1.headers['x-subject-token']
    headers2 = {'X-Auth-Token': s_subject_token, 'accept': 'application/json'}
    resp2 = requests.get(url=url2, headers=headers2)
    return StringIO(resp2.text)

### Load the Training set

In [5]:
df_TrainStd = pd.read_csv(get_object_storage_file('ExampleBinaryLR', 'TrainStd.csv', credentials))
df_TrainStd.head()

Unnamed: 0,age,cap_gain,cap_loss,education_num,hrs_per_week,sample_weight,work_class_unknown,work_class_Federal_gov,work_class_Local_gov,work_class_Never_worked,...,native_country_Portugal,native_country_Puerto_Rico,native_country_Scotland,native_country_South,native_country_Taiwan,native_country_Thailand,native_country_Trinadad&Tobago,native_country_United_States,native_country_Vietnam,native_country_Yugoslavia
0,0.005731,0.010964,-0.020042,0.194621,-0.004464,-0.076243,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0.156416,-0.010777,-0.020042,0.194621,-0.279974,-0.072308,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,-0.007968,-0.010777,-0.020042,-0.072045,-0.004464,0.017568,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0.197512,-0.010777,-0.020042,-0.205379,-0.004464,0.030523,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,-0.144954,-0.010777,-0.020042,0.194621,-0.004464,0.100943,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Load the Test set

In [19]:
df_TestStd = pd.read_csv(get_object_storage_file('ExampleBinaryLR', 'TestStd.csv', credentials))
df_TestStd.head()

Unnamed: 0,age,cap_gain,cap_loss,education_num,hrs_per_week,sample_weight,work_class_unknown,work_class_Federal_gov,work_class_Local_gov,work_class_Never_worked,...,native_country_Puerto_Rico,native_country_Scotland,native_country_South,native_country_Taiwan,native_country_Thailand,native_country_Trinadad&Tobago,native_country_United_States,native_country_Vietnam,native_country_Yugoslavia,native_country_Holand_Netherlands
0,-0.188595,-0.010819,-0.023315,-0.20486,-0.004002,0.0253,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,-0.010513,-0.010819,-0.023315,-0.071527,0.098038,-0.067453,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,-0.147499,-0.010819,-0.023315,0.128473,-0.004002,0.099881,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
3,0.071679,0.066062,-0.023315,-0.00486,-0.004002,-0.019712,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,-0.284486,-0.010819,-0.023315,-0.00486,-0.106043,-0.058188,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0


### Load the Training labels

In [7]:
df_TrainingLabels = pd.read_csv(get_object_storage_file('ExampleBinaryLR', 'TrainIncomeClassLabels.csv', credentials))
df_TrainingLabels.columns = ["gt_50K"]
df_TrainingLabels.head()

Unnamed: 0,gt_50K
0,0
1,0
2,0
3,0
4,0


### Load the Test labels

In [8]:
df_TestLabels = pd.read_csv(get_object_storage_file('ExampleBinaryLR', 'TestIncomeClassLabels.csv', credentials))
df_TestLabels.columns = ["gt_50K"]
df_TestLabels.head()

Unnamed: 0,gt_50K
0,0
1,0
2,0
3,0
4,0


## STEP 2:  Build a Logistic Regression Classifier


In [9]:
# Algorithm options:
#    sag : stochastic average gradient
#    liblinear : linear, good for smaller datasets
#    lbfgs :
#    newton-cg :

from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=1000.0, solver='liblinear', class_weight='balanced', random_state=0, verbose=1 )
lr.fit(df_TrainStd, df_TrainingLabels.gt_50K )

[LibLinear]

LogisticRegression(C=1000.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=0,
          solver='liblinear', tol=0.0001, verbose=1, warm_start=False)

In [10]:
# get probabilities of prediction
df_probs = pd.DataFrame( lr.predict_proba( df_TrainStd ))

In [11]:
df_probs[0:5]

Unnamed: 0,0,1
0,0.702431,0.297569
1,0.331099,0.668901
2,0.920077,0.079923
3,0.788137,0.211863
4,0.136517,0.863483


In [12]:
df_income_class = pd.DataFrame({"p_income_class" : lr.predict( df_TrainStd )})

In [13]:
df_income_class[0:5]

Unnamed: 0,p_income_class
0,0
1,1
2,0
3,0
4,1


In [14]:
df_TrainingLabels.count()

gt_50K    32561
dtype: int64

In [15]:
df_test_results = pd.concat([df_TrainingLabels, df_income_class.p_income_class, df_probs, ], axis=1)
df_test_results.columns = ['gt_50K', 'p_gt_50K', 'prob_0', 'prob_1']

In [16]:
df_test_results[0:10]

Unnamed: 0,gt_50K,p_gt_50K,prob_0,prob_1
0,0,0,0.702431,0.297569
1,0,1,0.331099,0.668901
2,0,0,0.920077,0.079923
3,0,0,0.788137,0.211863
4,0,1,0.136517,0.863483
5,0,1,0.059384,0.940616
6,0,0,0.994555,0.005445
7,1,1,0.307821,0.692179
8,1,1,0.030934,0.969066
9,1,1,0.028506,0.971494


In [None]:
#df_test_results[(df_test_results.gt_50K == df_test_results.p_gt_50K) & (df_test_results.gt_50K == True)]

## STEP 3:  Construct a Confusion Matrix to see how well our classifier is working


In [81]:
from sklearn import metrics as skm
cm = skm.confusion_matrix(df_test_results.gt_50K, df_test_results.p_gt_50K)
df_cm = pd.DataFrame(cm)
df_cm.columns = ["Less","More"]
df_cm

Unnamed: 0,Less,More
0,19813,4907
1,1178,6663


In [60]:
(23021+4735) / 32561.

0.8524308221491969

In [82]:
df_cums = pd.DataFrame(df_cm["Less"] + df_cm["More"])
df_cums.columns = ["Total"]
df_cums

Unnamed: 0,Total
0,24720
1,7841


In [83]:
#Convert absolute counts to percentages
df_cm_pct = df_cm.div(df_cums["Total"], axis='rows')

###  Understanding the Confusion Matrix 
The Matrix has the Actual values on the vertical axis, and the Predicted values on the horizontal.  What this tells us, then, is that our classifier got 93.1% of the less than 50K cases right, and miss-classified 6.9% of them as >50K. It classified 60.4% of the >50K cases correctly and mis-classfied 39.6% of them as <50K.

In [84]:
df_cm_pct

Unnamed: 0,Less,More
0,0.801497,0.198503
1,0.150236,0.849764


In [None]:
df_cums.sum(axis=0)

In [None]:
df_cm_pct.sum(axis=1)

## Persist the Trained Model
First we serialize it using pickle. Then we write the serialized model to Bluemix Object Store

In [17]:
import pickle
s = pickle.dumps(lr)

In [19]:
s

'ccopy_reg\n_reconstructor\np0\n(csklearn.linear_model.logistic\nLogisticRegression\np1\nc__builtin__\nobject\np2\nNtp3\nRp4\n(dp5\nS\'warm_start\'\np6\nI00\nsS\'C\'\np7\nF1000.0\nsS\'max_iter\'\np8\nI100\nsS\'verbose\'\np9\nI1\nsS\'intercept_scaling\'\np10\nI1\nsS\'dual\'\np11\nI00\nsS\'fit_intercept\'\np12\nI01\nsS\'classes_\'\np13\ncnumpy.core.multiarray\n_reconstruct\np14\n(cnumpy\nndarray\np15\n(I0\ntp16\nS\'b\'\np17\ntp18\nRp19\n(I1\n(I2\ntp20\ncnumpy\ndtype\np21\n(S\'i8\'\np22\nI0\nI1\ntp23\nRp24\n(I3\nS\'<\'\np25\nNNNI-1\nI-1\nI0\ntp26\nbI00\nS\'\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x01\\x00\\x00\\x00\\x00\\x00\\x00\\x00\'\np27\ntp28\nbsS\'solver\'\np29\nS\'liblinear\'\np30\nsS\'n_iter_\'\np31\ng14\n(g15\n(I0\ntp32\ng17\ntp33\nRp34\n(I1\n(I1\ntp35\ng21\n(S\'i4\'\np36\nI0\nI1\ntp37\nRp38\n(I3\nS\'<\'\np39\nNNNI-1\nI-1\nI0\ntp40\nbI00\nS\'\\x0c\\x00\\x00\\x00\'\np41\ntp42\nbsS\'penalty\'\np43\nS\'l2\'\np44\nsS\'multi_class\'\np45\nS\'ovr\'\np46\nsS\'random_state\'\np47\nI0\ns

In [22]:
lr2 = pickle.loads(s)
lr2

LogisticRegression(C=1000.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=0,
          solver='liblinear', tol=0.0001, verbose=1, warm_start=False)

In [27]:
import requests  
import json

def put_pickle(credentials, pickle_string, file_name):  
    """This function writes a pickle to Object Storage V3 """
    
    url1 = ''.join(['https://identity.open.softlayer.com', '/v3/auth/tokens'])
    data = {'auth': {'identity': {'methods': ['password'],
            'password': {'user': {'name': credentials['username'],'domain': {'id': credentials['domain_id']},
            'password': credentials['password']}}}}}
    headers1 = {'Content-Type': 'application/json'}
    resp1 = requests.post(url=url1, data=json.dumps(data), headers=headers1)
    resp1_body = resp1.json()
    
    for e1 in resp1_body['token']['catalog']:
        if(e1['type']=='object-store'):
            for e2 in e1['endpoints']:
                        if(e2['interface']=='public'and e2['region']=='dallas'):
                            url2 = ''.join([e2['url'],'/', credentials['container'], '/', file_name])
    s_subject_token = resp1.headers['x-subject-token']
    headers2 = {'X-Auth-Token': s_subject_token, 'accept': 'text/plain'}
    resp2 = requests.put(url=url2, headers=headers2, data = pickle_string )
    return resp2

In [28]:
put_pickle(credentials, s, "logistic_regression_01.p")

<Response [201]>

## Use the Stored Classifier
Here we fetch the pickle from Bluemix Object Store, reinstantiate it as a scikit-learn classifier object, and then use it to score some data

In [9]:
def get_object_pickle_file(container, filename, credentials):
    """This functions returns a string containing
    the file content from Bluemix Object Storage."""

    url1 = ''.join(['https://identity.open.softlayer.com', '/v3/auth/tokens'])
    data = {'auth': {'identity': {'methods': ['password'],
            'password': {'user': {'name': credentials['username'],'domain': {'id': credentials['domain_id']},
            'password': credentials['password']}}}}}
    headers1 = {'Content-Type': 'application/json'}
    resp1 = requests.post(url=url1, data=json.dumps(data), headers=headers1)
    resp1_body = resp1.json()
    for e1 in resp1_body['token']['catalog']:
        if(e1['type']=='object-store'):
            for e2 in e1['endpoints']:
                        if(e2['interface']=='public'and e2['region']=='dallas'):
                            url2 = ''.join([e2['url'],'/', container, '/', filename])
    s_subject_token = resp1.headers['x-subject-token']
    headers2 = {'X-Auth-Token': s_subject_token, 'accept': 'text/plain'}
    resp2 = requests.get(url=url2, headers=headers2)
    return resp2.text

In [10]:
s2 = get_object_pickle_file('ExampleBinaryLR', 'logistic_regression_01.p', credentials)

In [15]:
import pickle
lr3 = pickle.loads(s2)
lr3

LogisticRegression(C=1000.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=0,
          solver='liblinear', tol=0.0001, verbose=1, warm_start=False)

In [23]:
print lr3.predict_proba(df_TestStd[1:2])

[[ 0.79356138  0.20643862]]
