In [1736]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from datacleaner import autoclean
from datacleaner import autoclean_cv
from sklearn import preprocessing
from sklearn import tree
from sklearn.decomposition import PCA
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold

In [1737]:
# Load the training and test tables from CSV files in the working directory
data, data_test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))
#print (len(data.columns.values))

In [1738]:
# Keep the test-set Id column for the submission file, then remove Id
# from both tables
Id = data_test['Id']
data = data.drop(labels='Id', axis=1)
data_test = data_test.drop(labels='Id', axis=1)
#print (data.info())
#print (data.head())

In [1739]:
# Clean MSSubClass, add feature for PUD
# MSSubClass codes from which the PUD indicator is built.
_PUD_SUBCLASSES = [120, 150, 160, 180]


def _replace_subclass_with_pud(frame):
    """Return a copy of ``frame`` with MSSubClass replaced by a 0/1 ``PUD`` column.

    ``PUD`` is 1 where MSSubClass is one of ``_PUD_SUBCLASSES`` and 0 otherwise.
    The flag is computed in a single vectorised ``isin`` call rather than with
    the ``.ix`` indexer, which is no longer available in current pandas.
    """
    with_flag = frame.copy()
    with_flag['PUD'] = with_flag['MSSubClass'].isin(_PUD_SUBCLASSES).astype('int64')
    return with_flag.drop(['MSSubClass'], axis=1)


# The same transformation is applied to both tables so their columns stay in step.
data = _replace_subclass_with_pud(data)
data_test = _replace_subclass_with_pud(data_test)

In [1740]:
# Clean Condition1 and 2
# (new column, Condition values that switch the flag on)
# NOTE(review): 'Norm' looks like the "normal / no special condition" value
# rather than a road type; it is kept here because the original list had it,
# but with it RoadProx will be 1 for most rows — confirm against the data
# description.
_PROXIMITY_FEATURES = (
    ('RoadProx', ['Artery', 'Feedr', 'Norm']),
    ('RailProx', ['RRNn', 'RRAn', 'RRNe', 'RRAe']),
    ('FeatProx', ['PosN', 'PosA']),
)


def _add_proximity_flags(frame):
    """Return a copy of ``frame`` with Condition1/Condition2 turned into 0/1 flags.

    For every entry of ``_PROXIMITY_FEATURES`` a column is added that is 1 when
    *either* Condition1 or Condition2 holds one of the listed values, else 0.
    The two condition columns are dropped afterwards.

    The element-wise ``|`` matters: combining two Python lists with ``or``
    just yields the first non-empty list, which would ignore Condition2.
    """
    flagged = frame.copy()
    for column, categories in _PROXIMITY_FEATURES:
        near = flagged['Condition1'].isin(categories) | flagged['Condition2'].isin(categories)
        flagged[column] = near.astype('int64')
    return flagged.drop(['Condition1', 'Condition2'], axis=1)


# Same transformation for train and test so the feature columns stay identical.
data = _add_proximity_flags(data)
data_test = _add_proximity_flags(data_test)

In [1741]:
# Clean square footage and num baths
# Columns that are folded into SqFeet / Nbaths and then removed.
_AREA_BATH_SOURCES = ['1stFlrSF', '2ndFlrSF', 'GrLivArea',
                      'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath']

for _frame in (data, data_test):
    # NOTE(review): if GrLivArea already contains the two floor areas, SqFeet
    # counts them twice — confirm this is intended.
    _frame['SqFeet'] = _frame['1stFlrSF'] + _frame['2ndFlrSF'] + _frame['GrLivArea']
    # Half baths count as 0.5 of a bath.
    _frame['Nbaths'] = (_frame['BsmtFullBath'] + 0.5 * _frame['BsmtHalfBath']
                        + _frame['FullBath'] + 0.5 * _frame['HalfBath'])

data = data.drop(_AREA_BATH_SOURCES, axis=1)
data_test = data_test.drop(_AREA_BATH_SOURCES, axis=1)

In [1742]:
# Merge porch sqft
def _merge_porch_area(frame):
    """Add PorchSF (sum of the five deck/porch area columns) and drop the parts."""
    porch_parts = ['WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch']
    total = frame[porch_parts[0]]
    for part in porch_parts[1:]:
        total = total + frame[part]
    frame['PorchSF'] = total
    return frame.drop(porch_parts, axis=1)


data = _merge_porch_area(data)
data_test = _merge_porch_area(data_test)

In [1743]:
# Show the BsmtFinSF2 column of the training table
print(data['BsmtFinSF2'])

0          0
1          0
2          0
3          0
4          0
5          0
6          0
7         32
8          0
9          0
10         0
11         0
12         0
13         0
14         0
15         0
16         0
17         0
18         0
19         0
20         0
21         0
22         0
23         0
24       668
25         0
26       486
27         0
28         0
29         0
        ... 
1430       0
1431       0
1432       0
1433       0
1434       0
1435       0
1436       0
1437       0
1438       0
1439     110
1440       0
1441       0
1442       0
1443       0
1444       0
1445     627
1446       0
1447       0
1448       0
1449       0
1450       0
1451       0
1452       0
1453       0
1454       0
1455       0
1456     163
1457       0
1458    1029
1459     290
Name: BsmtFinSF2, dtype: int64


In [1744]:
# Merge Bsmt sqft
# BsmtSF is the sum of the two finished-basement area columns, which are
# removed once combined.
_BSMT_PARTS = ['BsmtFinSF1', 'BsmtFinSF2']

for _frame in (data, data_test):
    _frame['BsmtSF'] = _frame['BsmtFinSF1'].add(_frame['BsmtFinSF2'])

data = data.drop(_BSMT_PARTS, axis=1)
data_test = data_test.drop(_BSMT_PARTS, axis=1)

In [1745]:
# Clean MiscFeature
#data['HasElev'] = data.MiscFeature
#data['HasShed'] = data.MiscFeature
#data['HasTen']  = data.MiscFeature

#elevind = [x == 'Elev' for x in data.MiscFeature]
#data.ix[np.logical_not(elevind), 'HasElev'] = 0
#data.ix[elevind, 'HasElev'] = 1

#shedind = [x == 'Shed' for x in data.MiscFeature]
#data.ix[np.logical_not(shedind), 'HasShed'] = 0
#data.ix[shedind, 'HasShed'] = 1

#tenind = [x == 'TenC' for x in data.MiscFeature]
#data.ix[np.logical_not(tenind), 'HasTen'] = 0
#data.ix[tenind, 'HasTen'] = 1

#data = data.drop(['MiscFeature'], axis=1)
#data_test = data_test.drop(['MiscFeature'], axis=1)

In [1746]:
#print (data.info())

In [1747]:
# Format data
# Train and test are cleaned together with autoclean_cv so that both tables
# go through one shared cleaning/encoding pass; cleaning them independently
# can give the same category a different numeric code in each table.
# autoclean_cv needs identical columns, so the target is set aside first.
_sale_price = data['SalePrice']
_train_features, data_test = autoclean_cv(data.drop(['SalePrice'], axis=1), data_test)

# Cast to float rather than int64: Nbaths holds half-bath fractions (x.5)
# that an integer cast would truncate.
data = _train_features.astype('float64')
data['SalePrice'] = _sale_price
data_test = data_test.astype('float64')

In [1748]:
# Number of columns in the training table, then in the test table
for _frame in (data, data_test):
    print(_frame.shape[1])

71
70


In [1749]:
# Correlation of every feature with SalePrice (the target's own entry removed)
corr = data.corr()
corr_sale = corr['SalePrice'].drop(['SalePrice'])
#print (np.abs(corr_sale).sort_values(ascending=False))
# Candidate features to remove: those whose correlation with SalePrice is weak
# in absolute value, so a strong negative correlation is not treated as weak.
# Plain boolean indexing replaces the `.ix` indexer, which is no longer
# available in current pandas.
kill = corr_sale[corr_sale.abs() < 0.25].to_dict()
# The removal itself is currently disabled:
#for name, val in kill.items():
#  data = data.drop([name], axis=1)

In [1750]:
# Guess and check new features
# Each entry is (new column, factor A, factor B); the new column is A * B.
# Disabled earlier experiments (Guess1-Guess8) were products such as
# OverallQual*SqFeet, BsmtQual*TotalBsmtSF, OverallCond*SqFeet, BsmtCond*BsmtSF,
# Heating*HeatingQC, Functional*SqFeet and Fireplaces*FireplaceQu.
_INTERACTION_FEATURES = [
    ('Guess9', 'OverallQual', 'OverallCond'),
    ('Guess10', 'YearBuilt', 'BsmtSF'),
    ('Guess11', 'YearBuilt', 'SqFeet'),
    ('Guess12', 'YearBuilt', 'Nbaths'),
    ('Guess13', 'TotRmsAbvGrd', 'BedroomAbvGr'),
    ('Guess14', 'TotalBsmtSF', 'SqFeet'),
    ('Guess15', 'YearBuilt', 'YearRemodAdd'),
    ('Guess16', 'SqFeet', 'LotFrontage'),
]

# The train and test tables must receive exactly the same engineered columns,
# in the same order; otherwise the model is fitted on one set of features and
# asked to predict from a different one.
for _frame in (data, data_test):
    for _new_column, _factor_a, _factor_b in _INTERACTION_FEATURES:
        _frame[_new_column] = _frame[_factor_a] * _frame[_factor_b]

In [1751]:
# Separate the target column (Y) from the feature columns (X)
Y = data['SalePrice']
X = data[[column for column in data.columns if column != 'SalePrice']]
#print (data.head(100))
#rint (Y)
#print (X.head())

In [1752]:
# Regularize
# Shift every feature column so its mean is zero. Only centering is applied;
# the variant that also divides by X.std() is left disabled.
_column_means = X.mean()
X = X.sub(_column_means, axis='columns')

In [1753]:
#print (X.head())

In [1754]:
# Split into test and train sets
# No hold-out split is made: the model is fitted on every training row and
# applied to the test table. `kf` is kept for the (disabled) K-fold loop.
_train_features = data.drop(['SalePrice'], axis=1)
_feature_names = list(_train_features.columns)

# Fail loudly if the two tables do not carry the same feature columns, instead
# of silently predicting from misaligned columns.
_mismatched = sorted(set(data_test.columns) ^ set(_feature_names))
if _mismatched:
    raise ValueError('train and test feature columns differ: %s' % _mismatched)

# np.asarray replaces DataFrame.as_matrix(), which current pandas no longer has.
Y = np.asarray(Y)
X = np.asarray(X)
kf = KFold(n_splits=2)
#for train_index, test_index in kf.split(X):
#  X_train, X_test = X[train_index], X[test_index]
#  Y_train, Y_test = Y[train_index], Y[test_index]
X_train = X
Y_train = Y
# Center the test features with the *training* means (the same shift that was
# applied to X), with columns put in training order by name.
X_test = np.asarray(data_test[_feature_names] - _train_features.mean())

In [1755]:
# Disabled PCA experiment: everything below is wrapped in a string literal, so
# none of it executes; the cell only evaluates to (and displays) that string.
'''
# PCA
pca=PCA()
X_train=pca.fit_transform(X_train)
X_test=pca.transform(X_test)

n = len(Y_test)
cov = np.dot(X_test.T, X_test) / n
eigenvalues = pca.explained_variance_
eigenvectors = pca.components_
#for v in eigenvectors:
#    print(np.dot(v.T, np.dot(cov, v)))
#print (eigenvalues)
#print (eigenvectors)
ind = eigenvalues >= 1
dum = []
for i in range(0, len(X_test)):
  dum.append(X_test[i][:31])
print (len(dum[0][:31]))
X_test = dum
dum = []
for i in range(0, len(X_train)):
    dum.append(X_train[i][:31])
X_train = dum
#X_train = X_train[ind]
#for eigenvalue, eigenvector in zip(eigenvalues, eigenvectors):    
#    print('1 ' + str(np.dot(eigenvector.T, np.dot(cov, eigenvector))))
#    print('2 ' + str(eigenvalue))
#print (sorted(eigenvalues))
'''

"\n# PCA\npca=PCA()\nX_train=pca.fit_transform(X_train)\nX_test=pca.transform(X_test)\n\nn = len(Y_test)\ncov = np.dot(X_test.T, X_test) / n\neigenvalues = pca.explained_variance_\neigenvectors = pca.components_\n#for v in eigenvectors:\n#    print(np.dot(v.T, np.dot(cov, v)))\n#print (eigenvalues)\n#print (eigenvectors)\nind = eigenvalues >= 1\ndum = []\nfor i in range(0, len(X_test)):\n  dum.append(X_test[i][:31])\nprint (len(dum[0][:31]))\nX_test = dum\ndum = []\nfor i in range(0, len(X_train)):\n    dum.append(X_train[i][:31])\nX_train = dum\n#X_train = X_train[ind]\n#for eigenvalue, eigenvector in zip(eigenvalues, eigenvectors):    \n#    print('1 ' + str(np.dot(eigenvector.T, np.dot(cov, eigenvector))))\n#    print('2 ' + str(eigenvalue))\n#print (sorted(eigenvalues))\n"

In [1756]:
# RMS log error
def RMSlog(p, a):
    """Return the root-mean-squared logarithmic error between ``p`` and ``a``.

    Computes ``sqrt(mean((log(p + 1) - log(a + 1)) ** 2))`` over all elements.
    Inputs are converted to NumPy arrays, so lists, arrays and pandas Series
    are compared position by position regardless of any index labels.

    Parameters
    ----------
    p : array-like of numbers
        Predicted values.
    a : array-like of numbers
        Actual values; must have the same shape as ``p``.

    Returns
    -------
    numpy.float64
        The RMS log error.

    Raises
    ------
    ValueError
        If ``p`` and ``a`` differ in shape or are empty.
    """
    predicted = np.asarray(p, dtype=float)
    actual = np.asarray(a, dtype=float)
    if predicted.shape != actual.shape:
        # A length mismatch would otherwise score only part of the data.
        raise ValueError('p and a must have the same shape, got %s and %s'
                         % (predicted.shape, actual.shape))
    if actual.size == 0:
        raise ValueError('p and a must not be empty')
    # log1p(x) == log(x + 1), evaluated for the whole array at once.
    log_diff = np.log1p(predicted) - np.log1p(actual)
    return np.sqrt(np.mean(log_diff ** 2))

In [1757]:
# Decision tree
#_DT = []
#for i in range(0, 9):
#  clf = tree.DecisionTreeRegressor()
#  clf = clf.fit(X_train, Y_train)
#  Y_pred_DT = clf.predict(X_test)
#  _DT.append(clf.score(X_test, Y_test))
#print (str(np.mean(_DT)) + ' +/- ' + str(np.std(_DT)))

In [1758]:
# Random forest, turn off reg and kill improves ~0.1%
# Score lists for repeated hold-out evaluation; they stay empty here because
# this run has no held-out target to score against.
_RF = []
_RMS = []
# Fit one forest. Re-fitting in a loop only pays off when each fit is scored;
# without scoring, every fit but the last would be thrown away. A fixed
# random_state makes the predictions reproducible, and n_jobs=-1 builds the
# trees in parallel.
random_forest = RandomForestRegressor(n_estimators=1000, random_state=0, n_jobs=-1)
random_forest.fit(X_train, Y_train)
Y_pred_RF = random_forest.predict(X_test)

In [1759]:
# Boosted tree
#_BDT = []
#for i in range(0, 9):
#  boost = AdaBoostRegressor()
#  boost.fit(X_train, Y_train)
#  Y_pred_BDT = boost.predict(X_test)
#  _BDT.append(boost.score(X_test, Y_test))
#print (str(np.mean(_BDT)) + ' +/- ' + str(np.std(_BDT)))

In [1760]:
# NOTE(review): presumably an RMS-log score recorded from an earlier run — TODO confirm
0.139251094605 

0.139251094605

In [1761]:
# NOTE(review): presumably an RMS-log score recorded from an earlier run — TODO confirm
0.1407688538

0.1407688538

In [1762]:
# NOTE(review): presumably an RMS-log score recorded from an earlier run — TODO confirm
0.142228471831

0.142228471831

In [1763]:
# NOTE(review): presumably an RMS-log score recorded from an earlier run — TODO confirm
0.146138156399

0.146138156399

In [1764]:
# Pair each test-set Id with its predicted SalePrice and write the result
# to HousePrices.csv without the row index
submission_columns = {"Id": Id, "SalePrice": Y_pred_RF}
submission = pd.DataFrame(submission_columns)
submission.to_csv('HousePrices.csv', index=False)