In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as utils
from sklearn import preprocessing


In [2]:
def trainingModel(dataLoader):
    for epoch in range(210):
        ave_loss=0.0
        print("Epoch ",epoch+1)
        for i,data in enumerate(dataLoader,0):
            inputs,target =data
            target = target.float()
            optimizer.zero_grad()
            output=model(inputs)
            loss=criteria(output,target)
            loss.backward()
            optimizer.step()
            ave_loss+=loss.item()
        print("Loss for Epoch ",epoch+1," is ",(ave_loss/x_train.shape[0]))

In [3]:
def dataPreProcessing(dataframe):
    print("Number of NAN columns before preprocessing\n",dataframe.isna().sum().sort_values()[-30:-1]) # prints the top 35 nan valued columns
    
    #Filling with "NA" string
    for col in ['Alley','FireplaceQu','Fence','MiscFeature','PoolQC','GarageQual','GarageFinish','GarageType','GarageCond','BsmtQual','BsmtCond','BsmtFinType1','BsmtFinType2','BsmtExposure']:
        dataframe[col].fillna('NA', inplace=True)
    
    fill_avg = ['KitchenQual','Exterior1st','SaleType','Exterior2nd','Functional','Utilities','MSZoning','LotFrontage','Electrical','MasVnrArea','GarageYrBlt']
    for i in fill_avg:
        dataframe[i].fillna(dataframe[i].value_counts().to_frame().index[0], inplace=True)
    
    median_values_col=['GarageArea','GarageCars','BsmtFinSF1','TotalBsmtSF','BsmtFinSF2','BsmtFullBath','BsmtHalfBath','BsmtUnfSF']
    for i in median_values_col: # did it separately as the test data had nan in it.. didnt find it at training dataframe
        dataframe[i].fillna(dataframe[i].value_counts().to_frame().index[0], inplace=True)
        
    print("Number of NAN columns after preprocessing\n",dataframe.isna().sum().sort_values()[-10:-1])
    
    return dataframe
    

In [4]:
def performOnehotEncoding(dataframe):
    col_groups = dataframe.columns.to_series().groupby(dataframe.dtypes).groups
    non_numeric_cols = col_groups[np.dtype('O')]
    for col in non_numeric_cols:
        one_hot = pd.get_dummies(dataframe[col],prefix=str(col+"_"))
        dataframe = dataframe.drop(col,axis=1)
        dataframe = dataframe.join(one_hot)
    return dataframe
#train.head()

In [5]:
def addColumnsFromTraintoTest(train,test):
    for i in train.columns:
        if i not in test.columns:
            index= train.columns.get_loc(i)
            test.insert(index,i,0)
    return test
    

In [6]:
def changeToTensor(dataFrame):
    x = dataFrame.values
    x= x.astype('float64')
    x= torch.Tensor(x)
    return x

In [7]:
train = pd.read_csv("house-prices-advanced-regression-techniques/train.csv")
test = pd.read_csv("house-prices-advanced-regression-techniques/test.csv")

In [8]:
train.shape, test.shape

((1460, 81), (1459, 80))

In [9]:
train = dataPreProcessing(train)

Number of NAN columns before preprocessing
 YearBuilt          0
RoofStyle          0
RoofMatl           0
Exterior1st        0
Exterior2nd        0
ExterQual          0
ExterCond          0
Foundation         0
MSSubClass         0
YearRemodAdd       0
Utilities          0
Electrical         1
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtFinType1      37
BsmtFinType2      38
BsmtExposure      38
GarageQual        81
GarageFinish      81
GarageYrBlt       81
GarageType        81
GarageCond        81
LotFrontage      259
FireplaceQu      690
Fence           1179
Alley           1369
MiscFeature     1406
dtype: int64
Number of NAN columns after preprocessing
 ExterCond      0
ExterQual      0
MasVnrArea     0
Exterior2nd    0
Exterior1st    0
RoofMatl       0
RoofStyle      0
Heating        0
SalePrice      0
dtype: int64


In [10]:
test = dataPreProcessing(test)

Number of NAN columns before preprocessing
 Exterior1st        1
SaleType           1
TotalBsmtSF        1
BsmtUnfSF          1
Exterior2nd        1
BsmtFinSF1         1
BsmtFinSF2         1
BsmtFullBath       2
Functional         2
Utilities          2
BsmtHalfBath       2
MSZoning           4
MasVnrArea        15
MasVnrType        16
BsmtFinType2      42
BsmtFinType1      42
BsmtExposure      44
BsmtQual          44
BsmtCond          45
GarageType        76
GarageFinish      78
GarageYrBlt       78
GarageQual        78
GarageCond        78
LotFrontage      227
FireplaceQu      730
Fence           1169
Alley           1352
MiscFeature     1408
dtype: int64
Number of NAN columns after preprocessing
 ExterCond        0
ExterQual        0
MasVnrArea       0
Exterior2nd      0
Exterior1st      0
RoofMatl         0
RoofStyle        0
SaleType         0
SaleCondition    0
dtype: int64


In [11]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [12]:
test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [13]:
train_num= train.values
test_num= test.values
labels=train_num[:,-1]
train.drop(['SalePrice','Id'],axis=1, inplace=True)
test.drop('Id',axis=1, inplace=True)

In [14]:
train = performOnehotEncoding(train)

In [15]:
test = performOnehotEncoding(test)

In [16]:
test = addColumnsFromTraintoTest(train,test)

In [17]:
x_train = changeToTensor(train)
labels= labels.astype('float64')
labels= torch.Tensor(labels)
x_test = changeToTensor(test)

In [18]:
class Net(nn.Module):
    def __init__(self,size):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(size,2100)
        self.fc6 = nn.Linear(2100,1)
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc6(x)
        return x

In [19]:
x_train.shape, labels.shape

(torch.Size([1460, 302]), torch.Size([1460]))

In [20]:
x_test.shape

torch.Size([1459, 302])

In [21]:
dataset= utils.TensorDataset(x_train,labels)
dataLoader = utils.DataLoader(dataset)

In [22]:
model = Net(x_train.shape[1])

In [23]:
model

Net(
  (fc1): Linear(in_features=302, out_features=2100, bias=True)
  (fc6): Linear(in_features=2100, out_features=1, bias=True)
)

In [24]:
criteria= nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(),lr=0.01)

In [None]:
trainingModel(dataLoader)

Epoch  1


  return F.mse_loss(input, target, reduction=self.reduction)


Loss for Epoch  1  is  4704944066.213118
Epoch  2
Loss for Epoch  2  is  2832246748.619667
Epoch  3
Loss for Epoch  3  is  2351835859.7216773
Epoch  4
Loss for Epoch  4  is  2322418726.571846
Epoch  5
Loss for Epoch  5  is  2185881800.262031
Epoch  6
Loss for Epoch  6  is  2203680548.7688227
Epoch  7
Loss for Epoch  7  is  1996271729.469112
Epoch  8
Loss for Epoch  8  is  1975996964.7395701
Epoch  9
Loss for Epoch  9  is  1939780905.9063623
Epoch  10
Loss for Epoch  10  is  1910755043.8952885
Epoch  11
Loss for Epoch  11  is  1825137328.907407
Epoch  12
Loss for Epoch  12  is  1790019288.0474408
Epoch  13
Loss for Epoch  13  is  1714130919.558666
Epoch  14
Loss for Epoch  14  is  1691095889.4865048
Epoch  15
Loss for Epoch  15  is  1617440479.6626027
Epoch  16
Loss for Epoch  16  is  1561376121.1161933
Epoch  17
Loss for Epoch  17  is  1610692571.3757746
Epoch  18
Loss for Epoch  18  is  1493219129.8025706
Epoch  19
Loss for Epoch  19  is  1543641320.8200576
Epoch  20
Loss for Epoch  2

In [None]:
predictions = model(x_test)

In [None]:
predictions_array = predictions.data.cpu().numpy()

In [None]:
test_data = pd.read_csv("house-prices-advanced-regression-techniques/test.csv")
test_df=pd.DataFrame(columns=['Id','SalePrice'])
test_df["Id"]= test_data["Id"]
test_df["SalePrice"]= predictions_array
test_df[["Id","SalePrice"]].to_csv("submission.csv",index=False)