# In [2]:
from random import randrange
from random import seed
from math import sqrt
from csv import reader

def load_csv(filename):
    """Read a CSV file and return its non-empty rows.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list of rows, each row being a list of the raw string fields
        produced by ``csv.reader``. Rows that parse to an empty list
        (blank lines) are skipped.

    Raises:
        OSError: If the file cannot be opened (e.g. FileNotFoundError).
    """
    # newline='' lets the csv module do its own line-ending handling, so
    # line breaks embedded in quoted fields are preserved unchanged.
    with open(filename, 'r', newline='') as csv_file:
        return [row for row in reader(csv_file) if row]

def str_column_to_float(dataset, column):
    """Convert one column of every row from text to float, in place.

    Args:
        dataset: List of rows (mutable sequences) holding string fields.
        column: Index of the field to convert in each row.

    Surrounding whitespace is stripped from the field before conversion.
    """
    for record in dataset:
        raw_text = record[column]
        record[column] = float(raw_text.strip())

def minmax_func(dataset):
    """Return the ``[min, max]`` pair of every column in the dataset.

    Args:
        dataset: Non-empty list of rows; the first row's length decides
            how many columns are inspected.

    Returns:
        A list with one ``[min_value, max_value]`` entry per column.
    """
    def column_values(index):
        # Gather the values found at position `index` across all rows.
        return [record[index] for record in dataset]

    n_columns = len(dataset[0])
    return [
        [min(values), max(values)]
        for values in map(column_values, range(n_columns))
    ]

def normalize(dataset, minmax):
    """Rescale every value of the dataset to the range 0..1, in place.

    Args:
        dataset: List of rows (mutable sequences of numbers).
        minmax: One ``[min, max]`` pair per column, as produced by
            ``minmax_func``.

    Each value becomes ``(value - min) / (max - min)`` for its column.
    A column whose min equals its max carries no spread, so its values
    are set to 0.0 instead of dividing by zero.
    """
    for row in dataset:
        for i, value in enumerate(row):
            low = minmax[i][0]
            span = minmax[i][1] - low
            if span == 0:
                # Constant column: avoid ZeroDivisionError.
                row[i] = 0.0
            else:
                row[i] = (value - low) / span

def cross_validation(dataset, n_folds):
    """Randomly partition the dataset into ``n_folds`` equally sized folds.

    Args:
        dataset: List of rows to distribute; it is not modified.
        n_folds: Number of folds to build.

    Returns:
        A list of ``n_folds`` lists. Each holds
        ``int(len(dataset) / n_folds)`` rows drawn without replacement
        using ``randrange``; leftover rows are not assigned to any fold.
    """
    remaining = list(dataset)
    rows_per_fold = int(len(dataset) / n_folds)
    folds = []
    for _ in range(n_folds):
        # Draw rows one at a time from the shrinking pool of unused rows.
        folds.append([
            remaining.pop(randrange(len(remaining)))
            for _ in range(rows_per_fold)
        ])
    return folds

def rmse_metric(actual, predicted):
    """Return the root mean squared error between two value sequences.

    Args:
        actual: Sequence of expected values.
        predicted: Sequence of predicted values, indexed in step with
            ``actual``.

    Returns:
        The square root of the mean of the squared differences.

    Raises:
        ValueError: If ``actual`` is empty, since the mean is undefined.
        IndexError: If ``predicted`` is shorter than ``actual``.
    """
    count = len(actual)
    if count == 0:
        raise ValueError('rmse_metric requires at least one value')
    sum_error = sum(
        ((predicted[i] - actual[i]) ** 2 for i in range(count)), 0.0
    )
    # Divide once after summing rather than on every iteration.
    return sqrt(sum_error / float(count))

def evaluate_algorithm(dataset, algorithm, n_folds, l_rate, n_epoch):
    """Score an algorithm with k-fold cross-validation.

    Args:
        dataset: List of rows; the last field of each row is the target.
        algorithm: Callable invoked as
            ``algorithm(train, test, l_rate, n_epoch)`` that returns one
            prediction per test row.
        n_folds: Number of folds passed to ``cross_validation``.
        l_rate: Learning rate forwarded to ``algorithm``.
        n_epoch: Epoch count forwarded to ``algorithm``.

    Returns:
        A list with one RMSE score per fold.
    """
    folds = cross_validation(dataset, n_folds)
    scores = []
    for held_out, fold in enumerate(folds):
        # Exclude the held-out fold by position: list.remove() matches by
        # equality and could drop a different fold with identical rows.
        # The nested comprehension also flattens in a single pass.
        train = [
            row
            for position, other in enumerate(folds)
            if position != held_out
            for row in other
        ]
        test = []
        for row in fold:
            # Copy each row and blank the target so the algorithm cannot
            # read the expected output.
            row_copy = list(row)
            row_copy[-1] = None
            test.append(row_copy)
        predicted = algorithm(train, test, l_rate, n_epoch)
        actual = [row[-1] for row in fold]
        scores.append(rmse_metric(actual, predicted))
    return scores

def predict(row, coefficients):
    """Compute the linear-model output for one row.

    Args:
        row: Sequence whose last element is the target slot; only the
            preceding elements are used as inputs.
        coefficients: ``coefficients[0]`` is the intercept and
            ``coefficients[k + 1]`` is the weight of ``row[k]``.

    Returns:
        The intercept plus the weighted sum of the input values.
    """
    total = coefficients[0]
    # Start the weight index at 1 because slot 0 holds the intercept.
    for weight_index, feature in enumerate(row[:-1], start=1):
        total += coefficients[weight_index] * feature
    return total

def coefficients_sgd(train, l_rate, n_epoch):
    """Estimate linear-regression coefficients by stochastic gradient descent.

    Args:
        train: Non-empty list of rows; the last field of each row is the
            target value.
        l_rate: Step size applied to every coefficient update.
        n_epoch: Number of full passes over ``train``.

    Returns:
        A list of coefficients, intercept first, with as many entries as
        the first training row has fields.
    """
    weights = [0.0] * len(train[0])
    for _ in range(n_epoch):
        for sample in train:
            residual = predict(sample, weights) - sample[-1]
            step = l_rate * residual
            # Intercept update uses the error alone.
            weights[0] = weights[0] - step
            # Each weight update is scaled by its matching input value.
            for position in range(len(sample) - 1):
                weights[position + 1] = (
                    weights[position + 1] - step * sample[position]
                )
    return weights

def linear_regression_sgd(train, test, l_rate, n_epoch):
    """Fit coefficients on ``train`` and predict every row of ``test``.

    Args:
        train: Rows used to fit the coefficients via ``coefficients_sgd``.
        test: Rows to predict.
        l_rate: Learning rate forwarded to ``coefficients_sgd``.
        n_epoch: Epoch count forwarded to ``coefficients_sgd``.

    Returns:
        A list with one prediction per row of ``test``, in order.
    """
    fitted = coefficients_sgd(train, l_rate, n_epoch)
    return [predict(sample, fitted) for sample in test]

# Seed the random module so the randrange-based fold assignment repeats.
seed(1)

# Load the raw rows and convert every column from text to float.
filename = 'winequality-white.csv'
dataset = load_csv(filename)
for i in range(len(dataset[0])):
    str_column_to_float(dataset, i)

# Rescale all columns using their observed minimum and maximum.
minmax_val = minmax_func(dataset)
normalize(dataset, minmax_val)

# Cross-validation and SGD settings.
n_folds = 5
l_rate = 0.01
n_epoch = 50

# Evaluate the model and report per-fold and average RMSE.
scores = evaluate_algorithm(
    dataset, linear_regression_sgd, n_folds, l_rate, n_epoch
)
mean_rmse = sum(scores) / float(len(scores))
print('Scores : %s' % (scores,))
print('Mean RMSE : %.3f' % mean_rmse)


            

# Output: FileNotFoundError: [Errno 2] No such file or directory: 'winequality-white.csv'