In [5]:
"""Input and output helpers to load in data."""

import csv
import numpy as np


def read_dataset(input_csv_file):
    """Read data into a python list.

    Args:
        input_csv_file: Path to the data csv file.

    Returns:
        dataset(dict): A python dictionary with the key value pair of
            (example_id, example_feature).

            example_feature is represented with a tuple
            (Id, BldgType, OverallQual, GrLivArea, GarageArea)

            For example, the first row will be in the train.csv is
            example_id = 1
            example_feature = (1,1Fam,7,1710,548)
    """
    dataset = {}

    # Imeplemntation here.
    with open(input_csv_file) as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            dataset[row['Id']] = (row['Id'],
                                  row['BldgType'],
                                  row['OverallQual'],
                                  row['GrLivArea'],
                                  row['GarageArea'],
                                  row['SalePrice'])

    return dataset

dst = read_dataset("./assignment2_data/train.csv")


In [60]:
"""Implements feature extraction and data processing helpers.
"""

def preprocess_data(dataset,
                    feature_columns=[
                        'Id', 'BldgType', 'OverallQual'
                        'GrLivArea', 'GarageArea'
                    ],
                    squared_features=False,
                    ):
    """Processes the dataset into vector representation.

    When converting the BldgType to a vector, use one-hot encoding, the order
    has been provided in the one_hot_bldg_type helper function. Otherwise,
    the values in the column can be directly used.

    If squared_features is true, then the feature values should be
    element-wise squared.

    Args:
        dataset(dict): Dataset extracted from io_tools.read_dataset
        feature_columns(list): List of feature names.
        squred_features(bool): Whether to square the features.

    Returns:
        processed_datas(list): List of numpy arrays x, y.
            x is a numpy array, of dimension (N,K), N is the number of example
            in the dataset, and K is the length of the feature vector.
            Note: BldgType when converted to one hot vector is of length 5.
            Each row of x contains an example.
            y is a numpy array, of dimension (N,1) containing the SalePrice.
    """
    columns_to_id = {'Id': 0, 'BldgType': 1, 'OverallQual': 2,
                     'GrLivArea': 3, 'GarageArea': 4, 'SalePrice': 5}

    x = []
    y = []
    
    for k, v in dataset.items():
        x_helper = []
#         x_helper.append(float(v[0]))
        bt_vec = one_hot_bldg_type(v[1])
        for l in bt_vec:
            x_helper.append(l)
        for rem in range(2, 5):
            x_helper.append(float(v[rem]))
        x.append(x_helper)
        y.append(float(v[5]))

    x = np.array(x)
    y = np.array(y)[np.newaxis]
    y = y.T
    if squared_features is True:
        x = np.square(x)
        y = np.square(y)
        
    processed_dataset = [x, y]
    return processed_dataset


def one_hot_bldg_type(bldg_type):
    """Builds the one-hot encoding vector.

    Args:
        bldg_type(str): String indicating the building type.

    Returns:
        ret(list): A list representing the one-hot encoding vector.
            (e.g. for 1Fam building type, the returned list should be
            [1,0,0,0,0].
    """
    type_to_id = {'1Fam': 0,
                  '2FmCon': 1,
                  'Duplx': 2,
                  'TwnhsE': 3,
                  'TwnhsI': 4,
                  }
#     for tb in type_to_id:
#         if bldg_type == tb:
#             index = int(type_to_id[tb])
    index = type_to_id[bldg_type]
    ret = [0, 0, 0, 0, 0]
    ret[index] = 1
    return ret

# result = one_hot_bldg_type("Duplx")
print(preprocess_data(dst)[0])


[[1.000e+00 0.000e+00 0.000e+00 ... 7.000e+00 1.710e+03 5.480e+02]
 [1.000e+00 0.000e+00 0.000e+00 ... 6.000e+00 1.262e+03 4.600e+02]
 [1.000e+00 0.000e+00 0.000e+00 ... 7.000e+00 1.786e+03 6.080e+02]
 ...
 [1.000e+00 0.000e+00 0.000e+00 ... 6.000e+00 1.442e+03 6.150e+02]
 [1.000e+00 0.000e+00 0.000e+00 ... 3.000e+00 1.077e+03 2.100e+02]
 [1.000e+00 0.000e+00 0.000e+00 ... 7.000e+00 1.208e+03 6.320e+02]]


In [56]:
processed_dataset = preprocess_data(dst)
sizeofds = processed_dataset[0].shape[0] 
shuf = np.arange(sizeofds)
np.random.shuffle(shuf)
processed_dataset[0] = processed_dataset[0][shuf]
processed_dataset[1] = processed_dataset[1][shuf]
batch_size = 16
i = 0
count = 0
num_steps = 10000
while i + batch_size <= sizeofds:
        count = count + 1
        if count > num_steps:
            break
        x_batch = processed_dataset[0][i:i+batch_size,:]
        y_batch = processed_dataset[1][:, i:i+batch_size]
#         update_step(x_batch, y_batch, model, learning_rate)
        i = i + batch_size
print(count)  

62


In [63]:
N = 1000
print(type(N))
 #       print("actual in forward x",N)
ndims = 8
x_cur = np.ones(N, ndims + 1)
#     x_cur[:,:-1] = x
#     f = np.matmul(w_t, x_cur)

<class 'int'>


TypeError: data type not understood

In [64]:
a = np.ones(2, 9)

TypeError: data type not understood