### Using manual feature-map

In [None]:
from collections import defaultdict
from collections import Counter
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error

def remapOnFeatureMap2(data, featureMap2):
    remapOnFeatureMap2 = []
    
    for i, row in enumerate(data):
        newRow = []
        for j, d in enumerate(row):
            if j not in [0,7]:
                if (j,d) in featureMap2:
                    newRow.append(featureMap2[(j,d)])
        remapOnFeatureMap2.append(newRow)
    
    binRemapped2 = np.zeros((len(data), len(featureMap2)), dtype=float)
    for i, row in enumerate(remapOnFeatureMap2):
        for j, d in enumerate(row):
            if j not in [0,7]:
                binRemapped2[i][d] = 1
            else:
                binRemapped2[i][j] = d
        binRemapped2[i][-1] = 1
    return binRemapped2

def removeNA(data):
    print(len(data))
    for i, row in enumerate(data):
        for j, d in enumerate(row):
            if d=='NA':
                data[i][j] = 0
    return data

if __name__ == '__main__':
    trainFile, devFile, testFile = 'train.csv', 'my_dev.csv', 'test.csv'
    
    train = np.array([s.strip().split(',')[1:-1] for s in open(trainFile).readlines()][1:])
    dev = np.array([s.strip().split(',')[1:-1] for s in open(devFile).readlines()][1:])
    test = np.array([s.strip().split(',')[1:] for s in open(testFile).readlines()][1:])
    
    trainIDs = [s.strip().split(',')[0] for s in open(trainFile).readlines()][1:]
    devIDs = [s.strip().split(',')[0] for s in open(devFile).readlines()][1:]
    
#     trainY = map(float, [s.strip().split(',')[-1] for s in open(trainFile).readlines()][1:])
    trainY = map(lambda s : np.log(float(s)), [s.strip().split(',')[-1] for s in open(trainFile).readlines()][1:])
    trainY = np.array(list(trainY))
#     devY = map(float, [s.strip().split(',')[-1] for s in open(devFile).readlines()][1:])
    devY = map(lambda s : np.log(float(s)), [s.strip().split(',')[-1] for s in open(devFile).readlines()][1:])
    devY = np.array(list(devY))

    train = removeNA(train)
    dev = removeNA(dev)
    
    featureMap = defaultdict()
    for i, row in enumerate(train):
        for j, d in enumerate(row):
            feature = (j,d)
            if feature not in featureMap:
                featureMap[feature] = len(featureMap)
    
    print(len(featureMap))
    
#     print(featureMap)
    
    binTrain = remapOnFeatureMap2(train, featureMap)
    binDev = remapOnFeatureMap2(dev, featureMap)
    
    model = LinearRegression().fit(X=binTrain, y=trainY)
    preds = model.predict(binDev)
#     print('type of preds: {} type of devY: {}'.format(type(preds), type(devY)))
#     print('devY and preds: {}'.format(list(zip(devY, preds))))
#     a = mean_squared_error(devY, preds)
    print('mse: {}'.format(mean_squared_error(devY, preds)))
    preds = [np.exp(s) for s in preds]
#     print('rmse: {}'.format(mean_squared_log_error(devY, preds)))
    print('predictions after exponentiation: {}'.format(preds))

    with open('submission.csv', 'w') as output:
        output.write('Id,SalePrice')
        for i,p in zip(devIDs, preds):
            output.write(str(i)+','+str(p))
            print(str(i)+','+str(p))

### Using pandas and numpy to create feature-map

In [61]:
from collections import defaultdict
from collections import Counter
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error

def getBinarizedData(data):
    numFeatures = data.select_dtypes(np.number).fillna(0)
    print(numFeatures.columns)
#     print(len(numFeatures.columns))
    
    otherFeatures = data.select_dtypes(exclude=np.number).fillna(0)
    print('Other features cols: {}'.format(otherFeatures.columns))
#     print(len(otherFeatures.columns))
    
#     Getting unique values of all non-numeric columns with their corresponding counts
#     print([(train[col].unique(), train[col].nunique()) for col in otherFeatures.columns])

    binData = pd.concat([pd.get_dummies(data[otherFeatures.columns]), data[numFeatures.columns]], axis = 1).fillna(0)
    print('Shape of binarized dataframe: {}'.format(binData.shape))
#     print(binData.head())

    return binData

if __name__ == '__main__':    
    train, dev = pd.read_csv('train.csv'), pd.read_csv('my_dev.csv')
    
    trainY, devY = train.SalePrice.astype('float64'), dev.SalePrice.astype('float64')
    
    train = train.drop(['Id', 'SalePrice'], axis = 1)
    dev = dev.drop(['Id', 'SalePrice'], axis = 1)
    
#     print(train.columns)
#     print(dev.columns)
    
    print('train shape: {}'.format(train.shape))
    print('dev shape: {}'.format(dev.shape))
    
    # To display the dtypes of all columns
#     print(train.dtypes)
    
    binTrain = getBinarizedData(train).astype('float64')
    binDev = getBinarizedData(dev).astype('float64')
#     print(binTrain.dtypes)

    for i, col in enumerate(binTrain.columns):
        isnotavailable = np.any(np.isnan(binTrain[col]))
        if isnotavailable:
            print('isnan is true for col {} and values are {}'.format(col, binTrain[col]))
#         else:
#             print('col - {} nan - {}'.format(col, isnotavailable))
#         print('col - {} inf - {}'.format(col, np.any(np.isfinite(binTrain[col]))))
#         print('col - {} float - {}'.format(col, np.issubdtype(binTrain[col], float)))
#         print('{}: max val in col "{}" is {} '.format(i, col, max(binTrain[col])))
#         print('{}) are values for {} in valid float64 range? {}'.format(i, col, np.any(list(True if val < np.finfo(np.float).max else False for val in binTrain[col]))))

#     for s in trainY:
#         print(trainY[np.isnan(s)==False])
#     print('max value of trainY: {}'.format(max(trainY)))
    
    reg = LinearRegression().fit(binTrain, trainY)
    print('shape of binDev: {}'.format(trainY.shape))
    targetPred = reg.predict(binDev)
    print(mean_squared_log_error(devY, targetPred))

train shape: (1460, 79)
dev shape: (146, 79)
Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold'],
      dtype='object')
Other features cols: Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'Bsm

ValueError: shapes (146,225) and (288,) not aligned: 225 (dim 1) != 288 (dim 0)