In [1]:
import pandas as pd
from sklearn import linear_model

In [2]:
def categorical_to_numeric(df):
    """Encode every non-numeric column of ``df`` as integer codes.

    Each distinct value observed in a non-numeric column is assigned an
    integer, starting at 0, in order of first appearance within that column.
    Numeric columns are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is not modified; an encoded copy is returned.

    Returns
    -------
    tuple of (pandas.DataFrame, dict)
        The encoded copy of ``df`` and the mapping that was applied, shaped as
        ``{column: {original_value: code}}`` with one entry per non-numeric
        column.
    """
    mapping = {}
    encoded = df.copy()

    for column in df.columns:
        if pd.api.types.is_numeric_dtype(df[column]):
            continue

        codes = {}
        # setdefault inserts the next free code only the first time a value is
        # seen and always returns the value's code, so the lookup table and
        # the encoded column are built in one pass. Every cell is looked up
        # exactly once, which keeps an original value that happens to equal a
        # code (e.g. a literal 0) from being translated a second time.
        column_codes = [codes.setdefault(value, len(codes)) for value in df[column]]

        mapping[column] = codes
        # Explicit int64 so the result is numeric even for an empty column.
        encoded[column] = pd.Series(column_codes, index=df.index, dtype="int64")

    return encoded, mapping


In [3]:
# The test split has no SalePrice column, so only the training data is loaded.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where that file exists.
df = (
    pd.read_csv('/home/anthony/src/school/cs570/cs570/project3/data/train.csv')
    # Id is an arbitrary row label, so it is kept out of the regression.
    .drop('Id', axis=1)
    # NaN values cause issues downstream; substitute 0 for them.
    .fillna(0)
)

# Encode categorical columns as numbers (arbitrary, but consistent per column).
df, mapping = categorical_to_numeric(df)

In [4]:
# Name of the column the regression should predict.
dependent_var = 'SalePrice'

# Independent variables: every column except the target.
covariates = df.loc[:, df.columns != dependent_var]
# Dependent variable: the target column on its own.
dependent = df[dependent_var]

In [6]:
# Fit a linear regression of the target on all covariates; fit() returns the
# estimator itself, so construction and fitting can be chained.
regression = linear_model.LinearRegression().fit(covariates.values, dependent)

# Sanity check: take the first training row and remove the target from it.
test = df.iloc[0].drop(dependent_var)

regression.predict([test])

array([193094.14894694])

In [7]:
# Display the fitted coefficients: one per column of `covariates`, in column order.
regression.coef_

array([-7.00903429e+01,  2.72815397e+02, -1.18046467e+01,  3.96045861e-01,
       -3.56444582e+04, -1.53662812e+03,  1.55452430e+03,  2.24710847e+03,
       -5.34112889e+04,  9.26795758e+02,  5.17288132e+03, -3.12230682e+02,
       -2.54209841e+03, -9.65289388e+03, -2.33664009e+03,  5.71280907e+02,
        1.37597157e+04,  6.67869590e+03,  4.26748627e+02,  1.19346766e+02,
        4.66293306e+03, -7.78316465e+03, -6.21479615e+02,  5.63760918e+02,
        3.76787164e+03,  2.85207938e+01,  3.28366716e+03,  3.56808491e+00,
       -1.60032888e+03,  6.34423745e+03, -1.24962875e+03,  2.12714630e+03,
       -7.85453820e+02,  7.40808345e+00, -1.34214167e+03,  1.08215593e+01,
       -1.68019252e+00,  1.65494519e+01,  1.78133117e+02, -2.40554841e+03,
        4.83173980e+03,  1.94844474e+03,  2.34465039e+01,  2.38550318e+01,
       -2.22298221e+01,  2.50717142e+01,  7.08758006e+03, -2.05583190e+03,
        5.99631392e+03,  8.61358203e+02, -7.71956934e+03, -2.18144383e+04,
        7.08838028e+03,  