### Using regularized logistic regression to classify email

In [1]:
import scipy.io
import utils
import numpy as np
from sklearn import linear_model

# No modifications in this script
# complete the functions in utils.py; then run the script

# Load the spam data set, already split into training and test parts.
Xtrain, Xtest, ytrain, ytest = utils.load_spam_data()

# Build three alternative representations of the features. Each training
# matrix is paired with a test matrix that went through the same transform.

# 1) Standardized features. The test set is scaled with the mu/sigma that
#    std_features returned for the TRAINING set, not with its own statistics.
#    (mu and sigma are presumably per-feature mean and standard deviation --
#    confirm against utils.std_features.)
Xtrain_std, mu, sigma = utils.std_features(Xtrain)
Xtest_std = (Xtest - mu) / sigma

# 2) Log-transformed features (see utils.log_features for the exact formula).
Xtrain_logt = utils.log_features(Xtrain)
Xtest_logt = utils.log_features(Xtest)

# 3) Binarized features (see utils.bin_features for the exact threshold).
Xtrain_bin = utils.bin_features(Xtrain)
Xtest_bin = utils.bin_features(Xtest)

# find good lambda by cross validation for these three sets

def run_dataset(X,ytrain,Xt,ytest,type,penalty):
    """Pick lambda by cross-validation, fit a regularized logistic
    regression on the training data and print its test-set accuracy.

    Parameters
    ----------
    X : training feature matrix; used for lambda selection and for fitting.
    ytrain : training labels.
    Xt : test feature matrix the fitted classifier predicts on.
    ytest : test labels, compared elementwise with the predictions.
    type : label naming the feature representation; used only in the
        printed accuracy line.  (The name shadows the builtin ``type``; it
        is kept so the signature stays unchanged for existing callers.)
    penalty : regularization penalty passed to both the cross-validation
        helper and ``LogisticRegression``.  "l2" uses the 'lbfgs' solver;
        any other value uses 'liblinear'.

    Returns
    -------
    None.  Results are reported on stdout.
    """
    # The constants 0.1, 5.1, 0.5 are presumably the (min, max, step) of the
    # lambda grid searched -- confirm against utils.select_lambda_crossval.
    best_lambda = utils.select_lambda_crossval(X, ytrain, 0.1, 5.1, 0.5, penalty)

    # Each print() receives ONE pre-formatted string, so the output is
    # byte-identical to the former Python 2 print statements while also
    # being valid syntax on Python 3.
    print("best_lambda =  %s" % (best_lambda,))

    # Train a classifier on best_lambda and run it.  Only the solver depends
    # on the penalty; scikit-learn's C is the inverse of the regularization
    # strength, hence 1.0 / best_lambda.
    solver = 'lbfgs' if penalty == "l2" else 'liblinear'
    lreg = linear_model.LogisticRegression(
        penalty=penalty,
        C=1.0 / best_lambda,
        solver=solver,
        fit_intercept=True,
    )
    lreg.fit(X, ytrain)
    print("Coefficients =  %s %s" % (lreg.intercept_, lreg.coef_))

    predy = lreg.predict(Xt)
    # Accuracy is the fraction of test predictions that match the labels.
    print("Accuracy on set aside test set for  %s  =  %s"
          % (type, np.mean(predy == ytest)))

# Run every feature representation under each penalty, L2 first and then L1.
# Each entry is (training matrix, test matrix, label printed in the report).
feature_sets = [
    (Xtrain_std, Xtest_std, "std"),
    (Xtrain_logt, Xtest_logt, "logt"),
    (Xtrain_bin, Xtest_bin, "bin"),
]

# Each entry is (banner printed before the runs, penalty passed to run_dataset).
experiments = [
    ("L2 Penalty experiments -----------", "l2"),
    ("L1 Penalty experiments -----------", "l1"),
]

for banner, penalty_name in experiments:
    print(banner)
    for train_X, test_X, label in feature_sets:
        run_dataset(train_X, ytrain, test_X, ytest, label, penalty_name)

L2 Penalty experiments -----------
best_lambda =  5.1
Coefficients =  [-1.5157442] [[-0.01840684 -0.21266707  0.13182708  0.44223938  0.26062597  0.18525779
   0.89365549  0.3144773   0.14226525  0.06250421 -0.04832104 -0.15022897
  -0.05030445  0.02958276  0.23882469  0.75866615  0.46075246  0.08858727
   0.25983544  0.21891496  0.26201495  0.40558311  0.74337909  0.26638332
  -1.68322246 -0.61771416 -1.60710912 -0.11831285 -0.62573686 -0.17975929
  -0.28377359 -0.20404886 -0.41382315 -0.38475413 -0.32583552  0.30817705
   0.00445086 -0.14093724 -0.37210461 -0.0946275  -0.57102237 -0.89163658
  -0.31089036 -0.67414262 -0.76689376 -1.09864526 -0.1307871  -0.63674802
  -0.32161549 -0.156286   -0.11719231  0.22978928  1.43584138  0.44903122
  -0.09875908  0.77282526  0.38089937]]
Accuracy on set aside test set for  std  =  0.921223958333
best_lambda =  0.6
Coefficients =  [-4.60944128] [[-0.45145759 -0.28466462 -0.06326929  0.68295889  1.21053267  0.91505182
   2.83046514  1.43679096  0.