In [None]:
import pandas as pd
from sklearn import linear_model

In [None]:
def categorical_to_numeric(df):
    """Encode every non-numeric column of a dataframe as integer codes.

    Each distinct value observed in a non-numeric column is assigned an
    integer code in order of first appearance (0, 1, 2, ...). Numeric columns
    are left untouched.

    Args:
        df: The dataframe to encode. It is not modified; the encoding is
            applied to a copy.

    Returns:
        A ``(encoded_df, mapping)`` tuple. ``mapping[column]`` is a dict that
        holds both directions for that column: ``value -> code`` and
        ``code -> value``. If an observed value is itself equal to one of the
        integer codes (for example a ``0`` left behind by ``fillna(0)``), the
        ``value -> code`` entry is the one kept under that key.
    """
    encoded = df.copy()
    mapping = {}

    for column in df.columns:
        if pd.api.types.is_numeric_dtype(df[column]):
            continue

        # The forward lookup is kept separate from the two-way dict that is
        # returned. Sharing one dict made a value that equals an earlier code
        # look "already seen", so it never received a code of its own.
        codes_by_value = {}
        codes = []
        for value in df[column]:
            if value not in codes_by_value:
                codes_by_value[value] = len(codes_by_value)
            codes.append(codes_by_value[value])

        # Assign positionally (plain array) so duplicate index labels cannot
        # interfere, and force an integer dtype so the column is numeric.
        encoded[column] = pd.Series(codes, dtype='int64').to_numpy()

        # Reverse entries first, then forward entries, so forward wins on a
        # key collision.
        two_way = {code: value for value, code in codes_by_value.items()}
        two_way.update(codes_by_value)
        mapping[column] = two_way

    return encoded, mapping

In [None]:
def normalize(df):
    """Min-max scale every column of a dataframe into the range [0, 1].

    Each column is rescaled as ``(x - min) / (max - min)``. A column whose
    values are all equal has a zero range; it is set to 0.0 instead of being
    divided by zero (which would fill it with NaN).

    Args:
        df: Dataframe with numeric columns. It is modified in place.

    Returns:
        The same dataframe object, after scaling.
    """
    for column in df.columns:
        low = df[column].min()
        span = df[column].max() - low

        if span == 0:
            # Constant column: there is no spread to scale by.
            df[column] = 0.0
        else:
            df[column] = (df[column] - low) / span

    return df

In [None]:
def unnormalize(df, column, value):
    """Map a min-max normalized value back onto a column's original scale.

    The value is treated as if it had been normalized using the minimum and
    maximum of ``df[column]``, and that scaling is inverted.

    Args:
        df: Dataframe holding the column on its original scale.
        column: Name of the column whose range defines the scaling.
        value: The normalized value to convert back.

    Returns:
        ``value`` expressed on the scale of ``df[column]``.
    """
    series = df[column]
    highest = series.max()
    lowest = series.min()
    return value * (highest - lowest) + lowest

In [None]:
# Load the integer-encoded (not normalized) training data so that a value on
# the normalized scale can be mapped back to the original SalePrice range.
# NOTE(review): presumably this is the file a later cell writes with
# `df.to_csv('./data/train_num.csv')`; on a fresh kernel that cell would have
# to run first — confirm both paths refer to the same file.
df = pd.read_csv('/home/anthony/src/school/cs570/cs570/project3/data/train_num.csv')
# Spot check: convert the normalized value 0.1139 back to SalePrice units.
unnormalize(df, 'SalePrice', .1139)

In [None]:
# Only the training set is loaded here: the test set has no SalePrice column
df = (
    pd.read_csv('/home/anthony/src/school/cs570/cs570/project3/data/train.csv')
    # Id is an arbitrary row label, so it is excluded from the regression
    .drop(columns=['Id'])
    # NaN values cause issues downstream; use 0 as the placeholder
    .fillna(0)
)

# Encode each categorical column as integers (arbitrary but consistent codes)
df, mapping = categorical_to_numeric(df)
# Normalization is left disabled for this cell: df = normalize(df)

# Show the number of columns and the first encoded row
print(len(df.iloc[0]))
print(df.iloc[0])

In [None]:
# Name of the column the model should predict
dependent_var = 'SalePrice'

# Independent variables: every column except the target
covariates = df.loc[:, df.columns != dependent_var]
# Dependent variable: the target column on its own
dependent = df[dependent_var]

In [None]:
# Fit an ordinary least squares model on the raw feature array
# (fit() returns the estimator itself, so the call can be chained)
regression = linear_model.LinearRegression().fit(covariates.values, dependent)

# Sanity check: take the first training row without its target value...
test = df.iloc[0].drop(dependent_var)

# ...and display the model's prediction for it
regression.predict([test])

In [None]:
# Display the fitted coefficients of the linear regression
regression.coef_

In [None]:
# Build the integer-encoded training set (train.csv includes SalePrice)
df = (
    pd.read_csv('/home/anthony/src/school/cs570/cs570/project3/data/train.csv')
    # Id is an arbitrary row label, so it is excluded from the features
    .drop(columns=['Id'])
    # NaN values cause issues downstream; use 0 as the placeholder
    .fillna(0)
)

# Encode each categorical column as integers (arbitrary but consistent codes)
df, mapping = categorical_to_numeric(df)

# Save the encoded frame and read it back. to_csv also writes the index, so
# the re-read frame has one extra leading column.
train_num_path = './data/train_num.csv'
df.to_csv(train_num_path)
df = pd.read_csv(train_num_path)
print(len(df.iloc[0]))

In [None]:
# Build the integer-encoded test set (test.csv has no SalePrice column)
df = (
    pd.read_csv('/home/anthony/src/school/cs570/cs570/project3/data/test.csv')
    # Id is an arbitrary row label, so it is excluded from the features
    .drop(columns=['Id'])
    # NaN values cause issues downstream; use 0 as the placeholder
    .fillna(0)
)

# Encode each categorical column as integers.
# NOTE(review): categorical_to_numeric assigns codes by order of first
# appearance within the frame it is given, so these codes are not guaranteed
# to match the ones produced for train.csv.
df, mapping = categorical_to_numeric(df)

# Save the encoded frame and read it back. to_csv also writes the index, so
# the re-read frame has one extra leading column.
test_num_path = './data/test_num.csv'
df.to_csv(test_num_path)
df = pd.read_csv(test_num_path)
print(len(df.iloc[0]))

In [None]:
# Build the integer-encoded AND normalized training set
df = (
    pd.read_csv('/home/anthony/src/school/cs570/cs570/project3/data/train.csv')
    # Id is an arbitrary row label, so it is excluded from the features
    .drop(columns=['Id'])
    # NaN values cause issues downstream; use 0 as the placeholder
    .fillna(0)
)

# Encode each categorical column as integers (arbitrary but consistent codes),
# then min-max scale every column, SalePrice included
df, mapping = categorical_to_numeric(df)
df = normalize(df)

# Save the scaled frame and read it back. to_csv also writes the index, so
# the re-read frame has one extra leading column.
train_norm_path = './data/train_norm_num.csv'
df.to_csv(train_norm_path)
df = pd.read_csv(train_norm_path)
print(len(df.iloc[0]))

In [None]:
# Build the integer-encoded AND normalized test set (no SalePrice column)
df = (
    pd.read_csv('/home/anthony/src/school/cs570/cs570/project3/data/test.csv')
    # Id is an arbitrary row label, so it is excluded from the features
    .drop(columns=['Id'])
    # NaN values cause issues downstream; use 0 as the placeholder
    .fillna(0)
)

# Encode each categorical column as integers, then min-max scale every column.
# NOTE(review): both steps use only this frame — codes follow order of first
# appearance in test.csv and scaling uses the test columns' own min/max — so
# they are not guaranteed to match what was applied to train.csv.
df, mapping = categorical_to_numeric(df)
df = normalize(df)

# Save the scaled frame and read it back. to_csv also writes the index, so
# the re-read frame has one extra leading column.
test_norm_path = './data/test_norm_num.csv'
df.to_csv(test_norm_path)
df = pd.read_csv(test_norm_path)
print(len(df.iloc[0]))

In [None]:
import torch

import numpy as np
import pandas as pd

In [None]:
class Net(torch.nn.Module):
    """Fully connected regression network: n_inputs-(n_hidden-n_hidden)-1.

    Two tanh hidden layers followed by a single linear output unit. All
    weights use Xavier (Glorot) uniform initialization and all biases start
    at zero.
    """

    def __init__(self, n_inputs=79, n_hidden=61):
        """Create the layers and initialize their parameters.

        Args:
            n_inputs: Number of input features. Defaults to 79.
            n_hidden: Width of each of the two hidden layers. Defaults to 61.
        """
        super().__init__()
        self.hid1 = torch.nn.Linear(n_inputs, n_hidden)
        self.hid2 = torch.nn.Linear(n_hidden, n_hidden)
        self.oupt = torch.nn.Linear(n_hidden, 1)

        # Initialize in layer order (hid1, hid2, oupt) so random number
        # consumption is the same as initializing each layer by hand.
        for layer in (self.hid1, self.hid2, self.oupt):
            torch.nn.init.xavier_uniform_(layer.weight)  # glorot
            torch.nn.init.zeros_(layer.bias)

    def forward(self, x):
        """Run a forward pass.

        Args:
            x: Input tensor whose last dimension has ``n_inputs`` features.

        Returns:
            Tensor with a last dimension of size 1 holding the raw output.
        """
        hidden = torch.tanh(self.hid1(x))
        hidden = torch.tanh(self.hid2(hidden))
        # No activation on the output layer (identity), as usual for regression
        return self.oupt(hidden)

In [None]:
# Normalized, integer-encoded test data
test_fp = '/home/anthony/src/school/cs570/cs570/project3/data/test_norm_num.csv'

# Column 0 of the file.
# NOTE(review): presumably the row index that to_csv wrote as the leading
# column — confirm against the cell that produced this file.
test_x_id = np.loadtxt(
    test_fp,
    delimiter=",",
    skiprows=1,  # skip the header row
    usecols=[0],
    dtype=np.float32,
)

# Columns 1..79: the 79 values fed to the network for each row
test_x = np.loadtxt(
    test_fp,
    delimiter=",",
    skiprows=1,  # skip the header row
    usecols=range(1, 80),
    dtype=np.float32,
)

In [None]:
# Rebuild the network and restore the parameters saved in 'norm.pth'.
# NOTE: torch.load unpickles the file, so only load checkpoints that come
# from a trusted source.
loaded = Net()
loaded.load_state_dict(torch.load('norm.pth'))
# Switch to evaluation mode for inference (eval() returns the module itself)
loaded = loaded.eval()

In [None]:
out_fp = './nn_out.csv'

# Predictions are mapped back to the original SalePrice range with the min/max
# of the *unnormalized training* SalePrice column. Running the notebook top to
# bottom, `df` holds the test data at this point and has no SalePrice column,
# so the training frame is loaded explicitly instead of relying on `df`.
# NOTE(review): this assumes the checkpoint was trained on the normalized
# version of this same training data — confirm.
train_num = pd.read_csv('/home/anthony/src/school/cs570/cs570/project3/data/train_num.csv')

# Inference only: no gradients are needed
with open(out_fp, 'w') as fh, torch.no_grad():
    # NOTE(review): the header and rows contain a space after the comma; a
    # strict CSV consumer would see the column name ' SalePrice' — confirm
    # this is what the consumer of the file expects.
    fh.write('Id, SalePrice\n')
    for row_idx, features in zip(test_x_id, test_x):
        pred = loaded(torch.Tensor(features))[0].item()
        unnormalized_pred = unnormalize(train_num, 'SalePrice', pred)
        # 1461 is added to the leading column value to form the output Id
        # (presumably the first Id in test.csv — confirm)
        fh.write(f'{int(row_idx) + 1461}, {unnormalized_pred}\n')