In [153]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn_pandas import DataFrameMapper
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LassoCV
from sklearn.datasets import make_regression
from sklearn.model_selection import GridSearchCV

In [142]:
import sklearn.preprocessing as pp
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingRegressor

In [89]:
df_train = pd.read_csv('/home/voshkanov/house-prices-datasets/train.csv', index_col='Id')



In [90]:
df_test = pd.read_csv('/home/voshkanov/house-prices-datasets/test.csv', index_col='Id')

In [91]:
df_train.columns

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'Wo

In [92]:
for elem in df_train.columns:
    if df_train[elem].isnull().values.any():
        df_train[elem].fillna(df_train[elem].mode()[0], inplace=True)

In [93]:
for elem in df_test.columns:
    if df_test[elem].isnull().values.any():
        df_test[elem].fillna(df_test[elem].mode()[0], inplace=True)

In [94]:
#print (df_train["WoodDeckSF"])

In [95]:
df_train.head(5)

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,Grvl,Reg,Lvl,AllPub,Inside,...,0,Gd,MnPrv,Shed,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,Grvl,Reg,Lvl,AllPub,FR2,...,0,Gd,MnPrv,Shed,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,Grvl,IR1,Lvl,AllPub,Inside,...,0,Gd,MnPrv,Shed,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,Grvl,IR1,Lvl,AllPub,Corner,...,0,Gd,MnPrv,Shed,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,Grvl,IR1,Lvl,AllPub,FR2,...,0,Gd,MnPrv,Shed,0,12,2008,WD,Normal,250000


In [96]:
#df_train["OpenPorchSF"]

In [97]:
mapper = DataFrameMapper([(['LotConfig'], pp.LabelBinarizer()),
                          (['MSSubClass'], pp.MinMaxScaler()),
                          (['MSZoning'], pp.LabelBinarizer()),
                          (['LotArea'], pp.MinMaxScaler()),
                          (['Street'], pp.LabelBinarizer()),
                          (['Alley'], pp.LabelBinarizer()),
                          (['LotShape'], pp.LabelBinarizer()),
                          (['LandContour'], pp.LabelBinarizer()),
                          (['LotConfig'], pp.LabelBinarizer()),
                          (['LandSlope'], pp.LabelBinarizer()),
                          (['Condition1'], pp.LabelBinarizer()),
                          (['Condition2'], pp.LabelBinarizer()),
                          (['BldgType'], pp.LabelBinarizer()),
                          (['HouseStyle'], pp.LabelBinarizer()),
                          (['OverallQual'], pp.MinMaxScaler()),
                          (['OverallCond'], pp.MinMaxScaler()),
                          (['YearBuilt'], pp.MinMaxScaler()),
                          (['YearRemodAdd'], pp.MinMaxScaler()),
                          (['RoofStyle'], pp.LabelBinarizer()),
                          (['RoofMatl'], pp.LabelBinarizer()),
                          (['Exterior1st'], pp.LabelBinarizer()),
                          (['Exterior2nd'], pp.LabelBinarizer()),
                          (['MasVnrType'], pp.LabelBinarizer()),
                          (['ExterQual'], pp.LabelBinarizer()),
                          (['ExterCond'], pp.LabelBinarizer()),
                          (['Foundation'], pp.LabelBinarizer()),
                          (['BsmtQual'], pp.LabelBinarizer()),
                          (['BsmtExposure'], pp.LabelBinarizer()),
                          (['BsmtFinType1'], pp.LabelBinarizer()),
                          (['BsmtFinSF1'], pp.MinMaxScaler()),
                          (['BsmtFinType2'], pp.LabelBinarizer()),
                          (['BsmtFinSF2'], pp.MinMaxScaler()),
                          (['BsmtUnfSF'], pp.MinMaxScaler()),
                          (['TotalBsmtSF'], pp.MinMaxScaler()),
                          (['Heating'], pp.LabelBinarizer()),
                          (['HeatingQC'], pp.LabelBinarizer()),
                          (['CentralAir'], pp.LabelBinarizer()),
                          (['Electrical'], pp.LabelBinarizer()),
                          (['1stFlrSF'], pp.MinMaxScaler()),
                          (['2ndFlrSF'], pp.MinMaxScaler()),
                          (['LowQualFinSF'], pp.MinMaxScaler()),
                          (['GrLivArea'], pp.MinMaxScaler()),
                          (['BsmtFullBath'], pp.MinMaxScaler()),
                          (['BsmtHalfBath'], pp.MinMaxScaler()),
                          (['FullBath'], pp.MinMaxScaler()),
                          (['HalfBath'], pp.MinMaxScaler()),
                          (['BedroomAbvGr'], pp.MinMaxScaler()),
                          (['KitchenAbvGr'], pp.MinMaxScaler()),
                          (['KitchenQual'], pp.LabelBinarizer()),
                          (['TotRmsAbvGrd'], pp.MinMaxScaler()),
                          (['Functional'], pp.LabelBinarizer()),
                          (['Fireplaces'], pp.MinMaxScaler()),
                          (['FireplaceQu'], pp.LabelBinarizer()),
                          (['GarageType'], pp.LabelBinarizer()),
                          (['GarageFinish'], pp.LabelBinarizer()),
                          (['GarageCars'], pp.MinMaxScaler()),
                          (['GarageArea'], pp.MinMaxScaler()),
                          (['GarageQual'], pp.LabelBinarizer()),
                          (['GarageCond'], pp.LabelBinarizer()),
                          (['PavedDrive'], pp.LabelBinarizer()),
                          (['WoodDeckSF'], pp.MinMaxScaler()),
                          (['OpenPorchSF'], pp.MinMaxScaler())])

In [98]:
data = mapper.fit_transform(df_train)



In [101]:
np.round(data,3)

array([[0.   , 0.   , 0.   , ..., 1.   , 0.   , 0.112],
       [0.   , 0.   , 1.   , ..., 1.   , 0.348, 0.   ],
       [0.   , 0.   , 0.   , ..., 1.   , 0.   , 0.077],
       ...,
       [0.   , 0.   , 0.   , ..., 1.   , 0.   , 0.11 ],
       [0.   , 0.   , 0.   , ..., 1.   , 0.427, 0.   ],
       [0.   , 0.   , 0.   , ..., 1.   , 0.859, 0.124]])

In [102]:
df_test.head(5)

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1461,20,RH,80.0,11622,Pave,Grvl,Reg,Lvl,AllPub,Inside,...,120,0,Ex,MnPrv,Shed,0,6,2010,WD,Normal
1462,20,RL,81.0,14267,Pave,Grvl,IR1,Lvl,AllPub,Corner,...,0,0,Ex,MnPrv,Gar2,12500,6,2010,WD,Normal
1463,60,RL,74.0,13830,Pave,Grvl,IR1,Lvl,AllPub,Inside,...,0,0,Ex,MnPrv,Shed,0,3,2010,WD,Normal
1464,60,RL,78.0,9978,Pave,Grvl,IR1,Lvl,AllPub,Inside,...,0,0,Ex,MnPrv,Shed,0,6,2010,WD,Normal
1465,120,RL,43.0,5005,Pave,Grvl,IR1,HLS,AllPub,Inside,...,144,0,Ex,MnPrv,Shed,0,1,2010,WD,Normal


In [151]:
X_train, X_test, y_train, y_test = train_test_split(data, df_train['SalePrice'], test_size=0.2)

In [155]:
model = GradientBoostingRegressor()
# X, y = make_regression(noise=4, random_state=0)
# reg = LassoCV(cv=5, random_state=0).fit(X_train, y_train)
# model.fit(X_train, y_train)
gs = GridSearchCV(
                estimator=LassoCV(),
                param_grid={'eps':[10**-7, 10**-5, 10**-3],
                            'n_alphas':[25, 50, 75]},
                scoring='r2',
                cv=3,
                n_jobs=-1)
gs.fit(X_train, y_train)
result = gs.predict(X_test)
mean_absolute_error(result, y_test)



19792.306114401385

In [114]:
test_data = mapper.transform(df_test)

In [115]:
result = model.predict(test_data)

In [116]:
d = {'Id': range(1461,2920), 'SalePrice': result}
df = pd.DataFrame(data=d)
df.to_csv('result.csv')

In [117]:
df_train.head(5)

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,Grvl,Reg,Lvl,AllPub,Inside,...,0,Gd,MnPrv,Shed,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,Grvl,Reg,Lvl,AllPub,FR2,...,0,Gd,MnPrv,Shed,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,Grvl,IR1,Lvl,AllPub,Inside,...,0,Gd,MnPrv,Shed,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,Grvl,IR1,Lvl,AllPub,Corner,...,0,Gd,MnPrv,Shed,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,Grvl,IR1,Lvl,AllPub,FR2,...,0,Gd,MnPrv,Shed,0,12,2008,WD,Normal,250000


In [118]:
enc = OrdinalEncoder()

In [119]:
enc.fit(df_train)

OrdinalEncoder(categories='auto', dtype=<class 'numpy.float64'>)