In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as utils

In [2]:
train = pd.read_csv("house-prices-advanced-regression-techniques/train.csv")
test = pd.read_csv("house-prices-advanced-regression-techniques/test.csv")

In [3]:
train.shape, test.shape


((1460, 81), (1459, 80))

In [4]:
train.isna().sum().sort_values()[-30:-1]

YearBuilt          0
RoofStyle          0
RoofMatl           0
Exterior1st        0
Exterior2nd        0
ExterQual          0
ExterCond          0
Foundation         0
MSSubClass         0
YearRemodAdd       0
Utilities          0
Electrical         1
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtFinType1      37
BsmtFinType2      38
BsmtExposure      38
GarageQual        81
GarageFinish      81
GarageYrBlt       81
GarageType        81
GarageCond        81
LotFrontage      259
FireplaceQu      690
Fence           1179
Alley           1369
MiscFeature     1406
dtype: int64

In [5]:
test.isna().sum().sort_values()[-33:-1]

GarageCars         1
GarageArea         1
KitchenQual        1
Exterior1st        1
SaleType           1
TotalBsmtSF        1
BsmtUnfSF          1
Exterior2nd        1
BsmtFinSF1         1
BsmtFinSF2         1
BsmtFullBath       2
Functional         2
Utilities          2
BsmtHalfBath       2
MSZoning           4
MasVnrArea        15
MasVnrType        16
BsmtFinType2      42
BsmtFinType1      42
BsmtExposure      44
BsmtQual          44
BsmtCond          45
GarageType        76
GarageFinish      78
GarageYrBlt       78
GarageQual        78
GarageCond        78
LotFrontage      227
FireplaceQu      730
Fence           1169
Alley           1352
MiscFeature     1408
dtype: int64

In [6]:
#Filling with "NA" string
for col in ['Alley','FireplaceQu','Fence','MiscFeature','PoolQC','GarageQual','GarageFinish','GarageType','GarageCond','BsmtQual','BsmtCond','BsmtFinType1','BsmtFinType2','BsmtExposure']:
    train[col].fillna('NA', inplace=True)
    test[col].fillna('NA', inplace=True)

In [7]:
train["LotFrontage"].value_counts().to_frame().index[0]

60.0

In [8]:
categorical = ['KitchenQual','Exterior1st','SaleType','Exterior2nd','Functional','Utilities','MSZoning','LotFrontage','Electrical','MasVnrArea','GarageYrBlt']
for i in categorical:
    train[i].fillna(train[i].value_counts().to_frame().index[0], inplace=True)
    test[i].fillna(test[i].value_counts().to_frame().index[0], inplace=True)
median_values_col=['GarageArea','GarageCars','BsmtFinSF1','TotalBsmtSF','BsmtFinSF2','BsmtFullBath','BsmtHalfBath','BsmtUnfSF']
for i in median_values_col:
    test[i].fillna(test[i].value_counts().to_frame().index[0], inplace=True)

In [9]:
print(train.isna().sum().sort_values()[-10:-1])
print(test.isna().sum().sort_values()[-10:-1])

ExterCond      0
ExterQual      0
MasVnrArea     0
Exterior2nd    0
Exterior1st    0
RoofMatl       0
RoofStyle      0
Heating        0
SalePrice      0
dtype: int64
ExterCond        0
ExterQual        0
MasVnrArea       0
Exterior2nd      0
Exterior1st      0
RoofMatl         0
RoofStyle        0
SaleType         0
SaleCondition    0
dtype: int64


In [10]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [11]:
test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [12]:
train_num= train.values
test_num= test.values
labels=train_num[:,-1]
print("Maximum value is ",labels.max())
train.drop(['SalePrice','Id'],axis=1, inplace=True)
test.drop('Id',axis=1, inplace=True)

Maximum value is  755000


In [13]:
col_groups = train.columns.to_series().groupby(train.dtypes).groups
non_numeric_cols = col_groups[np.dtype('O')]

In [14]:
len(non_numeric_cols)

43

In [15]:
for col in non_numeric_cols:
    one_hot = pd.get_dummies(train[col],prefix=str(col+"_"))
    train = train.drop(col,axis=1)
    train = train.join(one_hot)
train.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType__ConLw,SaleType__New,SaleType__Oth,SaleType__WD,SaleCondition__Abnorml,SaleCondition__AdjLand,SaleCondition__Alloca,SaleCondition__Family,SaleCondition__Normal,SaleCondition__Partial
0,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,0,0,0,1,0,0,0,0,1,0
1,20,80.0,9600,6,8,1976,1976,0.0,978,0,...,0,0,0,1,0,0,0,0,1,0
2,60,68.0,11250,7,5,2001,2002,162.0,486,0,...,0,0,0,1,0,0,0,0,1,0
3,70,60.0,9550,7,5,1915,1970,0.0,216,0,...,0,0,0,1,1,0,0,0,0,0
4,60,84.0,14260,8,5,2000,2000,350.0,655,0,...,0,0,0,1,0,0,0,0,1,0


In [16]:
col_groups = test.columns.to_series().groupby(test.dtypes).groups
non_numeric_cols = col_groups[np.dtype('O')]

In [17]:
len(non_numeric_cols)

43

In [18]:
for col in non_numeric_cols:
    one_hot = pd.get_dummies(test[col],prefix=str(col+"_"))
    test = test.drop(col,axis=1)
    test = test.join(one_hot)
test.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType__ConLw,SaleType__New,SaleType__Oth,SaleType__WD,SaleCondition__Abnorml,SaleCondition__AdjLand,SaleCondition__Alloca,SaleCondition__Family,SaleCondition__Normal,SaleCondition__Partial
0,20,80.0,11622,5,6,1961,1961,0.0,468.0,144.0,...,0,0,0,1,0,0,0,0,1,0
1,20,81.0,14267,6,6,1958,1958,108.0,923.0,0.0,...,0,0,0,1,0,0,0,0,1,0
2,60,74.0,13830,5,5,1997,1998,0.0,791.0,0.0,...,0,0,0,1,0,0,0,0,1,0
3,60,78.0,9978,6,6,1998,1998,20.0,602.0,0.0,...,0,0,0,1,0,0,0,0,1,0
4,120,43.0,5005,8,5,1992,1992,0.0,263.0,0.0,...,0,0,0,1,0,0,0,0,1,0


In [19]:
for i in train.columns:
    if i not in test.columns:
        print("Index is ",train.columns.get_loc(i))
        index= train.columns.get_loc(i)
        test.insert(index,i,0)
        #test[i]=0
        print(i)

Index is  55
Utilities__NoSeWa
Index is  103
Condition2__RRAe
Index is  104
Condition2__RRAn
Index is  105
Condition2__RRNn
Index is  114
HouseStyle__2.5Fin
Index is  125
RoofMatl__ClyTile
Index is  127
RoofMatl__Membran
Index is  128
RoofMatl__Metal
Index is  129
RoofMatl__Roll
Index is  140
Exterior1st__ImStucc
Index is  143
Exterior1st__Stone
Index is  157
Exterior2nd__Other
Index is  212
Heating__Floor
Index is  216
Heating__OthW
Index is  228
Electrical__Mix
Index is  258
GarageQual__Ex
Index is  274
PoolQC__Fa
Index is  286
MiscFeature__TenC


In [20]:
test.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType__ConLw,SaleType__New,SaleType__Oth,SaleType__WD,SaleCondition__Abnorml,SaleCondition__AdjLand,SaleCondition__Alloca,SaleCondition__Family,SaleCondition__Normal,SaleCondition__Partial
0,20,80.0,11622,5,6,1961,1961,0.0,468.0,144.0,...,0,0,0,1,0,0,0,0,1,0
1,20,81.0,14267,6,6,1958,1958,108.0,923.0,0.0,...,0,0,0,1,0,0,0,0,1,0
2,60,74.0,13830,5,5,1997,1998,0.0,791.0,0.0,...,0,0,0,1,0,0,0,0,1,0
3,60,78.0,9978,6,6,1998,1998,20.0,602.0,0.0,...,0,0,0,1,0,0,0,0,1,0
4,120,43.0,5005,8,5,1992,1992,0.0,263.0,0.0,...,0,0,0,1,0,0,0,0,1,0


In [21]:
#train = (train- train.mean())/ train.std()
x_train = train.values

In [22]:
x_train= x_train.astype('float64')

In [23]:
x_train= torch.Tensor(x_train)

In [24]:
labels= labels.astype('float64')

In [25]:
labels= torch.Tensor(labels)

In [26]:
#test =(test-test.mean())/test.std()
x_test= test.values
x_test= x_test.astype('float64')
x_test = torch.Tensor(x_test)

In [27]:
class Net(nn.Module):
    def __init__(self,size):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(size,1500)
        self.fc6 = nn.Linear(1500,1)
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc6(x)
        return x

In [28]:
x_train.shape, labels.shape

(torch.Size([1460, 302]), torch.Size([1460]))

In [29]:
x_test.shape

torch.Size([1459, 302])

In [30]:
dataset= utils.TensorDataset(x_train,labels)
dataLoader = utils.DataLoader(dataset)

In [31]:
model = Net(x_train.shape[1])

In [32]:
model

Net(
  (fc1): Linear(in_features=302, out_features=1500, bias=True)
  (fc6): Linear(in_features=1500, out_features=1, bias=True)
)

In [33]:
criteria= nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(),lr=0.001)
#Adam(model.parameters(),lr=0.001)

In [34]:
def trainingModel(dataLoader):
    for epoch in range(150):
        ave_loss=0.0
        print("Epoch ",epoch+1)
        for i,data in enumerate(dataLoader,0):
            inputs,target =data
            target = target.float()
            optimizer.zero_grad()
            output=model(inputs)
            loss=criteria(output,target)
            loss.backward()
            optimizer.step()
            ave_loss+=loss.item()
            if i%200==0:    # print every 200 mini-batches
                print('[%d, %5d] loss: %.3f' %
                     (epoch + 1, i + 1, ave_loss /200))
                ave_loss = 0.0

In [None]:
trainingModel(dataLoader)

Epoch  1
[1,     1] loss: 217127628.800


  return F.mse_loss(input, target, reduction=self.reduction)


[1,   201] loss: 12739258202.718
[1,   401] loss: 18508167770.113
[1,   601] loss: 4653725172.759
[1,   801] loss: 4878459313.491
[1,  1001] loss: 3309578982.377
[1,  1201] loss: 3768342419.049
[1,  1401] loss: 4105570573.687
Epoch  2
[2,     1] loss: 4036337.600
[2,   201] loss: 2384399653.772
[2,   401] loss: 4139714851.818
[2,   601] loss: 2989633673.356
[2,   801] loss: 3050556528.401
[2,  1001] loss: 2252251857.437
[2,  1201] loss: 2570621013.200
[2,  1401] loss: 3715227876.790
Epoch  3
[3,     1] loss: 735362.320
[3,   201] loss: 1740046408.356
[3,   401] loss: 1652013678.329
[3,   601] loss: 2410977790.067
[3,   801] loss: 2155791171.909
[3,  1001] loss: 1774339756.857


In [None]:
predictions = model(x_test)

In [None]:
predictions_array = predictions.data.cpu().numpy()

In [None]:
type(predictions_array)

In [None]:
test_data = pd.read_csv("house-prices-advanced-regression-techniques/test.csv")
test_df=pd.DataFrame(columns=['Id','SalePrice'])
test_df["Id"]= test_data["Id"]
test_df["SalePrice"]= predictions_array
test_df[["Id","SalePrice"]].to_csv("submission.csv",index=False)