## Importación de librerías

In [1]:
import csv
import math
import numpy as np
import time

## Constantes

In [2]:
nIter = 20
learningRate = 0.5
lambda_reg = 0.1
numberOfFeatures = 11
path = "botnet.csv"

## Funciones del sistema
### readFile

Dentro de la función readFile utilizamos una función axuiliar llamada row2Tuple, la cual se encarga de tomar el 

In [3]:
def readFile(filename): 
    with open(filename) as csvfile:
        file = csv.reader(csvfile, delimiter=',')
        dataset = []
        for i, row in enumerate(file):
            floatArray=[float(number) for number in row]
            dataset.append(tuple((np.array(floatArray[:-1]),floatArray[-1])))
            #if i == 100: 
            #    break
    return np.array(dataset)

Tenemos que conseguir un array de numpy que dentro de cada elemento tiene una tupla (x,y) que en x tiene un array y en y un numero

### Normalize

In [4]:
def normalize (RDD_Xy):
    """
    This function normalizes RDD for each column to N(0,1)
    :param RDD_Xy: is an RDD containing data examples. Each record of the RDD is a tuple (X,y). 
        “X” is an array containing the 11 features (float number) of an example 
        “y” is the label of the example (integer 0/1) 
    :return: An RDD rescaled to N(0,1) in each column (mean=0, standard deviation=1)
    """
    for index in range(11):
        column = np.array([register[0][index] for register in RDD_Xy]) #takes all w_i elements
        mean=np.mean(column)
        std=np.std(column)
        #print("Columna ", index, "mean", mean, "std",std)
        columnNormalized = (column-mean)/std #normalize
        for rddElement, columnElement in zip(RDD_Xy, columnNormalized):
            rddElement[0][index] = columnElement
        #for position, element in enumerate(columnNormalized):
        #    RDD_Xy[position][0][index] = element #set element to original rdd
    return RDD_Xy

In [5]:
def modelSigmoid(X,W,b):
    #esta bien
    y = np.dot(X,W)+b
    #np.reciprocal computes 1/x
    return 1/(1+np.exp(-y))

In [6]:
def doDb(dataset,w,b,m):
    sumatory = 0
    for row in dataset:
        y_prob=modelSigmoid(row[0],w,b)
        y=row[1]
        sumatory += (y_prob-y)
    return sumatory/m

def doDw(dataset,w,b,m,k):
    sumatory = 0
    for row in dataset:
        y_prob=modelSigmoid(row[0],w,b)
        y=row[1]
        sumatory += ((y_prob-y)*row[0][k])
    reg = lambda_reg*w[k]
    return (sumatory + reg) /m

In [7]:
def train (RDD_Xy, iterations, learning_rate, lambda_reg):
    """
    This function computes accuracy of the model
    :param RDD_Xy: RDD containing data examples. Each record of the RDD is a tuple (X,y). “X” is an array containing 
        the 11 features (float number) of an example “y” is the label of the example (integer 0/1) 
    :param iterations: number of iterations of the optimization loop
    :param learning_rate: learning rate of the gradient descent
    :param lambda_reg: regularization rate
    :return: A list or array containing the weights “w” and bias “b” at the end of the training process
    """
    
    w = np.random.rand(11)
    b = np.random.random_sample()
    dw = np.empty(11)
    db = 0
    m = len(RDD_Xy)
    listOfAcc = []
    
    for iteration in range(iterations):
        start = time.perf_counter()
        
        #compute derivates
        for k in range(11):
            dw[k] = doDw(RDD_Xy,w,b,m,k)   
        db = doDb(RDD_Xy,w,b,m)
        
        #apply derivates
        for k in range(11):
            w[k] -= (learning_rate * dw[k])
        b -= (learning_rate * db)
        
        end = time.perf_counter()
        
        acc = accuracy(w,b,data)
        listOfAcc.append(acc)
        
        endAccuracy = time.perf_counter()
        print("Iteration {:2d} acc: {:.3f}, time training: {:.3f} time accuracy: {:.3f}".format(
            iteration,acc,end-start,endAccuracy - end))
    return np.append(w,b), np.array(listOfAcc)

In [8]:
def accuracy (w, b, RDD_Xy):
    """
    This function computes accuracy of the model
    :param w: weights
    :param b: bias
    :param RDD_Xy: RDD containing examples to be predicted
    :return: The number of predictions that are correct divided by the number of records (examples) in RDD_xy
    """
    predictionsCorrect = 0
    for row in RDD_Xy:
        if row[1] == predict(w,b,row[0]):
            predictionsCorrect += 1
            
    return predictionsCorrect/len(RDD_Xy)

In [9]:
def predict (w, b, X):
    """
    Predict function can be used for predicting a single example
    :param w: weights
    :param b: bias
    :param X: Example to be predicted
    :return: A value (0/1) corresponding to the prediction of X
    """
    return np.round(modelSigmoid(X,w,b))

In [10]:
start = time.perf_counter()
data = readFile(path)
endRead = time.perf_counter()
data = normalize(data)
endNormalize = time.perf_counter()

print("Read time: {:.3f}, normalize time: {:.3f}".format(endRead-start,endNormalize - endRead))

Columna  0 mean 1281.5382729466037 std 1605.893224918472
Columna  1 mean 21282.76719488655 std 24117.47504288103
Columna  2 mean 6948.0571813618335 std 16394.945350996022
Columna  3 mean 62631.196091182705 std 134264.21801130896
Columna  4 mean 122198594.92857051 std 233920713.61978725
Columna  5 mean 15722236.101999663 std 44123779.4918666
Columna  6 mean 9.107313932263116 std 5.28172580720774
Columna  7 mean 1.8897547959135519 std 2.101345610211019
Columna  8 mean 124.27015928410006 std 90.3431222389383
Columna  9 mean 2130603014.3776083 std 713843442.8527304
Columna  10 mean 2261491802.491431 std 1301531949.9426758
Read time: 34.942, normalize time: 27.012


In [11]:
ws, listOfAcc = train(data, nIter, learningRate, lambda_reg)
w = ws[:-1]
b = ws[-1]
acc = accuracy(w,b,data)
print("acc: ", acc)

Iteration  0 acc: 0.458, time training: 152.287 time accuracy: 17.264
Iteration  1 acc: 0.516, time training: 148.728 time accuracy: 17.744
Iteration  2 acc: 0.583, time training: 149.026 time accuracy: 17.463


KeyboardInterrupt: 

# Gráficas

Instalarse con conda install matplotlib

In [12]:
%matplotlib inline

import matplotlib.pyplot as plt

x = [i for i in range(nIter)]

plt.plot(listOfAcc,marker="o")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.title("Evolution of accuraccy during training")
plt.xticks(x)
plt.show()

NameError: name 'listOfAcc' is not defined

In [None]:
b