In [10]:
import argparse
import numpy as np
import pandas as pd
import time

class Perceptron(object):
    mEpoch = 1000  # maximum epoch size
    w = None       # weights of the perceptron

    def __init__(self, epoch):
        self.mEpoch = epoch

    def train(self, xFeat, y):
        """
        Train the perceptron using the data

        Parameters
        ----------
        xFeat : nd-array with shape n x d
            Training data 
        y : 1d array with shape n
            Array of responses associated with training data.

        Returns
        -------
        stats : object
            Keys represent the epochs and values the number of mistakes
        """
        stats = {}
        # TODO implement this
        self.w = np.zeros(1+xFeat.shape[1])

        for epoch in range(self.mEpoch):
            mistakes = 0
            for i in range(len(xFeat)):
                delta_w = 0.01 * (y[i] - self.predict(xFeat[i]))
                self.w[1:] += delta_w * xFeat[i]
                self.w[0] += delta_w
                mistakes += int(delta_w != 0.0)
            if mistakes == 0:
                break
            stats[epoch] = mistakes
        return stats

    def predict(self, xFeat):
        """
        Given the feature set xFeat, predict 
        what class the values will have.

        Parameters
        ----------
        xFeat : nd-array with shape m x d
            The data to predict.  

        Returns
        -------
        yHat : 1d array or list with shape m
            Predicted response per sample
        """
        yHat = []
        yHat = np.dot(xFeat, self.w[1:]) + self.w[0]
        yHat = np.where(yHat >= 0.0, 1, 0)
        return yHat

    def pos_neg_words(self, data):
        weights = self.w
        words = list(data.columns.values)
        np_words = np.array(words)
        
        positive_index = np.argsort(-weights)[:15]
        pos_list = np_words[positive_index]
        negative_index = np.argsort(weights)[:15]
        neg_list = np_words[negative_index]
        return pos_list, neg_list

def calc_mistakes(yHat, yTrue):
    """
    Calculate the number of mistakes
    that the algorithm makes based on the prediction.

    Parameters
    ----------
    yHat : 1-d array or list with shape n
        The predicted label.
    yTrue : 1-d array or list with shape n
        The true label.      

    Returns
    -------
    err : int
        The number of mistakes that are made
    """
    mistake = 0
    for i in range(len(yTrue)):
        if yHat[i] != yTrue[i]:
            mistake += 1
    return mistake


def file_to_numpy(filename):
    """
    Read an input file and convert it to numpy
    """
    df = pd.read_csv(filename)
    return df.to_numpy()

In [1]:
from sklearn.model_selection import KFold
import numpy as np
import pandas as pd
from perceptron import Perceptron

In [3]:
b_train = file_to_numpy('binary_train.csv')
b_test = file_to_numpy('binary_test.csv')
c_train = file_to_numpy('count_train.csv')
c_test = file_to_numpy('count_test.csv')
y_train = file_to_numpy('y_train.csv')
y_test = file_to_numpy('y_test.csv')

In [4]:
def calc_mistakes(yHat, yTrue):
    mistake = 0
    for i in range(len(yTrue)):
        if yHat[i] != yTrue[i]:
            mistake += 1
    return mistake

In [17]:
def kfold_epoch(x, y, mEpoch_list):
    epoch_mistake = {}
    k = len(mEpoch_list)
    kf = KFold(n_splits=k)
    for mEpoch in mEpoch_list:
        mistakes_list = []
        for train_i, test_i in kf.split(x):
            xTrain, xTest = x[train_i], x[test_i]
            yTrain, yTest = y[train_i], y[test_i]
            p = Perceptron(mEpoch)
            p.train(xTrain, yTrain)
            yHat = p.predict(xTest)
            mistakes = calc_mistakes(yHat, yTest)
            mistakes_list.append(mistakes)
        avg_mistake = np.mean(mistakes_list)
        epoch_mistake[mEpoch] = avg_mistake
    print(epoch_mistake)

In [34]:
mEpoch_list = [5, 10, 15, 20, 30, 60]
b_epoch = kfold_epoch(b_train, y_train, mEpoch_list)
type(b_epoch)
# b_best_epoch = min(b_epoch, key=b_epoch.get)
# print(b_epoch)
# print(b_best_epoch)

{5: 20.666666666666668, 10: 14.5, 15: 14.166666666666666, 20: 13.833333333333334, 30: 13.833333333333334, 60: 13.833333333333334}


NoneType

In [22]:
print(kfold_epoch(c_train, y_train, mEpoch_list))

{5: 34.5, 10: 137.125, 15: 30.875, 20: 21.125, 30: 20.75, 60: 16.625, 100: 14.0, 200: 14.0}


In [28]:
# 15 is the optimal number of epochs for binary dataset
b_model = Perceptron(15)
b_model.train(b_train, y_train)
yHat = b_model.predict(b_test)
# print out the number of mistakes
print("Number of mistakes on the binary test dataset:")
print(calc_mistakes(yHat, y_test))

Number of mistakes on the binary test dataset:
35


In [30]:
# 100 is the optimal number of epochs for binary dataset
c_model = Perceptron(100)
c_model.train(c_train, y_train)
yHat = c_model.predict(c_test)
# print out the number of mistakes
print("Number of mistakes on the count test dataset:")
print(calc_mistakes(yHat, y_test))

Number of mistakes on the count test dataset:
42


In [25]:
def pos_neg_words(data):
    weights = self.w
    words = list(data.columns.values)
    np_words = np.array(words)
    positive_index = np.argsort(-weights)[:15]
    pos_list = np_words[positive_index]
    negative_index = np.argsort(weights)[:15]
    neg_list = np_words[negative_index]
    return pos_list, neg_list

In [29]:
df = pd.read_csv('binary_train.csv')
pos_list, neg_list = b_model.pos_neg_words(df)
print("Binary dataset: 15 words with most positive weights: ")
print(pos_list)
print("Binary dataset: 15 words with most negative weights: ")
print(neg_list)

Binary dataset: 15 words with most positive weights: 
['renam' 'guess' 'client' 'martin' 'sign' 'amaz' 'pleasur' 'advanc'
 'mirror' 'freedom' 'william' 'thi' 'mail' 'offic' 'oppos']
Binary dataset: 15 words with most negative weights: 
['www' 'us' 'rss' 'core' 'premium' 'item' 'channel' 'settl' 'annual'
 'never' 'onc' 'servic' 'spamassassin' 'resid' 'sport']


In [31]:
df = pd.read_csv('count_train.csv')
pos_list, neg_list = c_model.pos_neg_words(df)
print("Count dataset: 15 words with most positive weights: ")
print(pos_list)
print("Count dataset: 15 words with most negative weights: ")
print(neg_list)

Count dataset: 15 words with most positive weights: 
['renam' 'monitor' 'org' 'nation' 'isn' 'blame' 'numberbit' 'martin'
 'repositori' 'season' 'manner' 'guess' 'sender' 'compar' 'monthli']
Count dataset: 15 words with most negative weights: 
['button' 'spamassassin' 'met' 'www' 'dave' 'newslett' 'filenam' 'us'
 'cnumber' 'reach' 'servic' 'the' 'numberpm' 'settl' 'hand']


In [None]:
x = pd.read_csv("xTrain_count.csv").to_numpy()
y = pd.read_csv("yTrain.csv").to_numpy()

k = 6
kf = KFold(n_splits=k)
mEpoch_list = [5, 10, 20, 50, 70, 100]

k_epoch = {}

for mEpoch in mEpoch_list:
    mistakes = []
    for train_index, test_index in kf.split(x):
        xTrain, xTest = x[train_index], x[test_index]
        yTrain, yTest = y[train_index], y[test_index]
        # in each split, create a new perceptron
        perceptron = Perceptron(mEpoch)
        # train
        perceptron.train(xTrain, yTrain)
        # predict
        yHat = perceptron.predict(xTest)
        # calculate mistakes
        mistakes = calc_mistakes(yHat, yTest)
        mistakes.append(mistakes)
    # average mistake of the cross validation for this mEpoch value
    avg_mistake = np.mean(mistakes)
    # append the performance to the dict
    dict_performance[mEpoch] = avg_mistake

print(dict_performance)
# mEpoch = 70 would be the best choice.
perceptron = Perceptron(70)