In [14]:
import numpy as np
import scipy.io
import random
from sklearn import linear_model, cross_validation
from sklearn.model_selection import cross_val_predict
from sklearn.ensemble import RandomForestRegressor

NUM_TRAINING_EXAMPLES = 5172
NUM_TEST_EXAMPLES = 5857

In [3]:
def load_dataset():
    data = scipy.io.loadmat('./data/spam_data.mat')

    X_raw = data['training_data']
    X_kaggle = data['test_data']
    y_raw = data['training_labels'].reshape(X_raw.shape[0],1)

    n = X_raw.shape[0]
    indices = np.arange(n)
    random.seed(42)
    random.shuffle(indices)
    X_raw = X_raw[indices]
    y_raw = y_raw[indices]

    #====Divide training data====
    m = int(np.floor(0.7*n))
    X = X_raw[:m]
    X_test = X_raw[m:n]
    y = y_raw[:m]
    y_test = y_raw[m:n]

    return X, y, X_test, y_test, X_kaggle

In [4]:
def accuracy(result,label):
    return (1-(abs(result - label)).sum()/np.shape(label)[0])*100

In [5]:
#==== Read data ====
X, y, X_test, y_test, X_kaggle = load_dataset()

In [6]:
#==== Preprocess data ====

# Logistic regression

In [7]:
#==== Train ====
lm = linear_model.LogisticRegression()
lm.fit(X,y.ravel())

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [8]:
#==== Predict on validation set ====
result = np.asmatrix(lm.predict(X_test)).T

In [9]:
#==== Print accuracy ====
print(accuracy(result, y_test))

81.3144329897


In [10]:
#==== Predict on testing set and generate CSV file====
p = X_kaggle.shape[0]
result = np.asarray(lm.predict(X_kaggle)).reshape(p,1)
csv = np.hstack((np.arange(1,p+1).reshape(p,1),result))
np.savetxt("result.csv", csv, fmt='%1.1d', delimiter=",", header='Id,Category', comments='')

In [11]:
result

array([[0],
       [1],
       [0],
       ..., 
       [0],
       [0],
       [0]])

In [12]:
np.arange(1,p+1)

array([   1,    2,    3, ..., 5855, 5856, 5857])

In [13]:
np.hstack((np.arange(p).reshape(p,1),np.ones(p).reshape(p,1)))

array([[  0.00000000e+00,   1.00000000e+00],
       [  1.00000000e+00,   1.00000000e+00],
       [  2.00000000e+00,   1.00000000e+00],
       ..., 
       [  5.85400000e+03,   1.00000000e+00],
       [  5.85500000e+03,   1.00000000e+00],
       [  5.85600000e+03,   1.00000000e+00]])

# Random forest

In [16]:
#==== Train ====
kf_total = cross_validation.KFold(len(X), n_folds=5, shuffle=True, random_state=4)

rf_model = RandomForestRegressor(n_estimators= 1000)
predicted = cross_val_predict(rf_model, X, y, cv=10)

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


In [182]:
#==== Predict on validation set ====
result = np.asmatrix(rf_model.predict(X_test)).T

In [183]:
#==== Print accuracy ====
print(accuracy(result, y_test))

77.3161946368


In [155]:
#==== Predict on testing set and generate CSV file====
p = X_kaggle.shape[0]
result = np.asarray(lm.predict(X_kaggle)).reshape(p,1)
csv = np.hstack((np.arange(1,p+1).reshape(p,1),result))
np.savetxt("result.csv", csv, fmt='%1.1d', delimiter=",", header='Id,Category', comments='')

In [143]:
result

array([[0],
       [1],
       [0],
       ..., 
       [0],
       [0],
       [0]])

In [154]:
np.arange(1,p+1)

array([   1,    2,    3, ..., 5855, 5856, 5857])

In [151]:
np.hstack((np.arange(p).reshape(p,1),np.ones(p).reshape(p,1)))

array([[  0.00000000e+00,   1.00000000e+00],
       [  1.00000000e+00,   1.00000000e+00],
       [  2.00000000e+00,   1.00000000e+00],
       ..., 
       [  5.85400000e+03,   1.00000000e+00],
       [  5.85500000e+03,   1.00000000e+00],
       [  5.85600000e+03,   1.00000000e+00]])