# Lab 7: Spam Classification with Logistic Regression From Scratch
## by Tiffany Nguyen
### Part 1: Read Data 

In [28]:
import numpy as np

In [29]:
# Read all training data
trainDataX = np.empty((0, 57))
trainDataY = np.empty((0))

#read train data
with open("spambase 2/spam-train", "r") as filestream:
  for line in filestream:
    currentline = line.strip().split(",")
    trainDataX = np.append(trainDataX, [np.array(currentline[:-1], dtype=float)], axis=0)
    trainDataY = np.append(trainDataY, float(currentline[-1])) # last column is label

# print output
print("trainDataX shape:", trainDataX.shape)
print("trainDataY shape:", trainDataY.shape)
print("trainDataX: \n", trainDataX)
print("trainDataY: \n", trainDataY)

trainDataX shape: (3065, 57)
trainDataY shape: (3065,)
trainDataX: 
 [[  0.      0.      0.    ...   6.718  33.    215.   ]
 [  0.      0.      0.    ...   2.044  22.     92.   ]
 [  0.      0.53    0.    ...   4.555  51.    123.   ]
 ...
 [  0.      0.      0.    ...   2.307   8.     30.   ]
 [  0.33    0.      0.    ...   1.271   5.     75.   ]
 [  0.31    0.      0.62  ...   1.142   3.     88.   ]]
trainDataY: 
 [1. 1. 1. ... 0. 0. 0.]


In [30]:
# Read all testing data
testDataX = np.empty((0, 57))
testDataY = np.empty((0))

#read test data
with open("spambase 2/spam-test", "r") as filestream:
  for line in filestream:
    currentline = line.strip().split(",")
    testDataX = np.append(testDataX, [np.array(currentline[:-1], dtype=float)], axis=0)
    testDataY = np.append(testDataY, float(currentline[-1])) # last column is label

# print output
print("testDataX shape:", testDataX.shape)
print("testDataY shape:", testDataY.shape)
print("testDataX: \n", testDataX)
print("testDataY: \n", testDataY)

testDataX shape: (1536, 57)
testDataY shape: (1536,)
testDataX: 
 [[0.000e+00 1.428e+01 0.000e+00 ... 1.800e+00 5.000e+00 9.000e+00]
 [5.100e-01 4.300e-01 2.900e-01 ... 6.590e+00 7.390e+02 2.333e+03]
 [7.900e-01 1.900e-01 9.000e-02 ... 2.324e+00 1.900e+01 3.650e+02]
 ...
 [0.000e+00 0.000e+00 7.600e-01 ... 2.441e+00 1.900e+01 2.490e+02]
 [0.000e+00 0.000e+00 8.700e-01 ... 1.601e+00 1.100e+01 2.770e+02]
 [0.000e+00 0.000e+00 0.000e+00 ... 1.103e+00 3.000e+00 3.200e+01]]
testDataY: 
 [0. 1. 1. ... 1. 0. 1.]


### Part 2: Perform Z-score Normalization
Normalization is important so that features with larger scales don't have a larger influence on the model.

Z-score normalization equation: $$Z=\frac{X-\mu}{\sigma}$$

In [31]:
# get mean and standard deviation from training data
mean = np.mean(trainDataX, axis=0)
std = np.std(trainDataX, axis=0)
print("mean:", mean)
print("standard deviation:", std)

mean: [1.07367047e-01 2.08107667e-01 2.88600326e-01 6.29396411e-02
 3.15771615e-01 9.48123980e-02 1.15670473e-01 1.02619902e-01
 8.88026101e-02 2.44913540e-01 5.70048940e-02 5.53249592e-01
 9.29885808e-02 6.11941272e-02 4.60391517e-02 2.42646003e-01
 1.41954323e-01 1.86929853e-01 1.65356607e+00 8.11680261e-02
 7.91569331e-01 1.24639478e-01 1.00597064e-01 8.94681892e-02
 5.21073409e-01 2.56143556e-01 7.60407830e-01 1.20991843e-01
 9.69168026e-02 1.00707993e-01 6.22871126e-02 4.35986949e-02
 9.31223491e-02 4.47373573e-02 1.00893964e-01 9.76835237e-02
 1.31781403e-01 1.04567700e-02 7.94388254e-02 6.31908646e-02
 4.73376835e-02 1.42339315e-01 4.48091354e-02 7.67993475e-02
 2.99207178e-01 1.79725938e-01 4.91027732e-03 3.11060359e-02
 3.88548124e-02 1.36330506e-01 1.69128874e-02 2.55486460e-01
 8.03073409e-02 4.19836868e-02 4.97611909e+00 4.96247961e+01
 2.88137684e+02]
standard deviation: [3.14626401e-01 1.23933314e+00 5.15694908e-01 1.33776681e+00
 7.00478932e-01 2.64696272e-01 4.13395224e

In [32]:
# normalize training data
normTrainDataX = (trainDataX - mean)/std
print(normTrainDataX)

# normalize testing data
normTestDataX = (testDataX - mean)/std

[[-0.3412525  -0.16791907 -0.55963385 ...  0.0540969  -0.12957863
  -0.11426565]
 [-0.3412525  -0.16791907 -0.55963385 ... -0.09106165 -0.21531591
  -0.30643301]
 [-0.3412525   0.25973027 -0.55963385 ... -0.01307853  0.01071875
  -0.25800059]
 ...
 [-0.3412525  -0.16791907 -0.55963385 ... -0.08289376 -0.32443609
  -0.40329786]
 [ 0.70761052 -0.16791907 -0.55963385 ... -0.1150684  -0.34781899
  -0.33299273]
 [ 0.64404307 -0.16791907  0.64262739 ... -0.1190747  -0.36340759
  -0.31268235]]


### Part 3: Add Bias Feature

In [33]:
# add dummy feature to train data (column of ones for bias) 
dummyCol = np.ones((normTrainDataX.shape[0], 1))
normTrainDataX = np.concatenate((normTrainDataX, dummyCol), axis=1)
print(normTrainDataX)

[[-0.3412525  -0.16791907 -0.55963385 ... -0.12957863 -0.11426565
   1.        ]
 [-0.3412525  -0.16791907 -0.55963385 ... -0.21531591 -0.30643301
   1.        ]
 [-0.3412525   0.25973027 -0.55963385 ...  0.01071875 -0.25800059
   1.        ]
 ...
 [-0.3412525  -0.16791907 -0.55963385 ... -0.32443609 -0.40329786
   1.        ]
 [ 0.70761052 -0.16791907 -0.55963385 ... -0.34781899 -0.33299273
   1.        ]
 [ 0.64404307 -0.16791907  0.64262739 ... -0.36340759 -0.31268235
   1.        ]]


In [34]:
# add dummy feature to test data (column of ones for bias) 
dummyCol = np.ones((normTestDataX.shape[0], 1))
normTestDataX = np.concatenate((normTestDataX, dummyCol), axis=1)

### Part 4: Build a Logistic Regresion Model with Gradient Descent
Use logistic regression to perform binary classification

Sigmoid function used: $$S(r)=\frac{1}{1+e^{-r}}$$
Logistic Regression Weight Update Equation: $$w_{t+1}=w_t + \alpha[X^T(y-S(Xw_t))]$$

In [35]:
# declare variables
alpha = 0.0001
convergeMax = 10**-3

# Perform gradient descent with logistic regression
def problem2(samplesX, samplesY):
  np.random.seed(22)
  curWeight = np.random.normal(0, 1, samplesX[0].shape)
  prevWeight = -1 * np.ones(samplesX[0].shape)

  while(max(np.subtract(curWeight, prevWeight)) > convergeMax):
    prevWeight = curWeight
    sigmoid = 1/(1+np.exp(-1 * np.dot(samplesX, prevWeight)))
    update = alpha * (np.dot(samplesX.T, np.subtract(samplesY, sigmoid)))
    curWeight = prevWeight + update

    # get and print current RMSE
    predTrain = 1/(1+np.exp(-1 *np.dot(curWeight, np.transpose(samplesX))))
    curRMSE =  np.sqrt(np.sum(np.square(predTrain - samplesY))/len(predTrain))
    print("curRMSE", curRMSE, end="\r")
  return(curWeight)

gradDescWeights = problem2(normTrainDataX, trainDataY)
print("\nweights", gradDescWeights)

curRMSE 0.24338316077347366
weights [-0.06579796 -0.33618968  0.0318817   0.4706584   0.39684911  0.21873301
  1.04933305  0.20536465  0.13813264  0.07674643  0.02043958 -0.10012022
 -0.05541302  0.0071      0.47112823  0.68383264  0.38888239  0.10084451
  0.12777687  0.27918425  0.21825479  0.370822    1.14822769  0.83288873
 -1.53258254 -0.90819946 -1.05280648  0.28917305 -0.95683862 -0.19882206
 -0.11006165 -2.13318186 -0.39211884  0.07270701 -1.30845069  0.2222756
  0.07423266  0.05218067 -0.32100014 -0.26928943 -0.47556772 -1.12525713
 -0.39885262 -0.59308406 -0.58353188 -0.75232584 -0.46067251 -0.55090784
 -0.34294318 -0.02327669 -0.09147705  0.53980336  1.18986075  0.63355707
  0.8652439   0.96517597  0.28292326 -1.36175323]


In [36]:
# Classify output
def classify(samplesX, samplesY):
  numCorrect = 0
  numIncorrect = 0

  # make prediction
  sigmoidPred = 1/(1+np.exp(-1 * np.dot(samplesX, gradDescWeights)))
  pred = [1 if curPred >= 0.5 else 0 for curPred in sigmoidPred]

  # get and print accuracy
  for i in range(len(pred)):
    if(pred[i] == samplesY[i]):
      numCorrect+=1
    else:
      numIncorrect+=1
  print(" Num correct:", numCorrect)
  print(" Num incorrect:", numIncorrect)
  print(" Accuracy:", round(numCorrect/(numCorrect+numIncorrect), 5))

print("Classifying Training Data:")
classify(normTrainDataX, trainDataY)
print("Classifying Testing Data:")
classify(normTestDataX, testDataY)

Classifying Training Data:
 Num correct: 2841
 Num incorrect: 224
 Accuracy: 0.92692
Classifying Testing Data:
 Num correct: 1424
 Num incorrect: 112
 Accuracy: 0.92708
