In [39]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn_pandas import DataFrameMapper
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LassoCV
from sklearn.datasets import make_regression
from sklearn.model_selection import GridSearchCV
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
%matplotlib inline

In [40]:
import sklearn.preprocessing as pp
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingRegressor

In [41]:
def preprocessing(df_train, df_test):
    '''
    Parameters
    ----------
    df_train : pandas Dataframe
    df_test : pandas Dataframe
    
    Return
    ----------
    df_train : pandas Dataframe
    df_test : pandas Dataframe
    y_train : pandas Series
    '''
    # remove outliers in GrLivArea
    df_train.drop(df_train[df_train['GrLivArea'] > 4500].index, inplace=True)   

    # Normalize SalePrice using log_transform
    y_train = np.log1p(df_train['SalePrice'])
    # Remove SalePrice from training and merge training and test data
    df_train.pop('SalePrice')
    dataset = pd.concat([df_train, df_test])

    # Numerical variable with "categorical meaning"
    # Cast it to str so that we get dummies later on
    dataset['MSSubClass'] = dataset['MSSubClass'].astype(str)

    
    ### filling NaNs ###
    # no alley
    dataset["Alley"].fillna("None", inplace=True)

    # no basement
    dataset["BsmtCond"].fillna("None", inplace=True)
    dataset["BsmtExposure"].fillna("None", inplace=True)
    dataset["BsmtFinSF1"].fillna(0, inplace=True)               
    dataset["BsmtFinSF2"].fillna(0, inplace=True)               
    dataset["BsmtUnfSF"].fillna(0, inplace=True)                
    dataset["TotalBsmtSF"].fillna(0, inplace=True)
    dataset["BsmtFinType1"].fillna("None", inplace=True)
    dataset["BsmtFinType2"].fillna("None", inplace=True)
    dataset["BsmtFullBath"].fillna(0, inplace=True)
    dataset["BsmtHalfBath"].fillna(0, inplace=True)
    dataset["BsmtQual"].fillna("None", inplace=True)

    # most common electrical system
    dataset["Electrical"].fillna("SBrkr", inplace=True)

    # one missing in test; set to other
    dataset["Exterior1st"].fillna("Other", inplace=True)
    dataset["Exterior2nd"].fillna("Other", inplace=True)

    # no fence
    dataset["Fence"].fillna("None", inplace=True)

    # no fireplace
    dataset["FireplaceQu"].fillna("None", inplace=True)

    # fill with typical functionality
    dataset["Functional"].fillna("Typ", inplace=True)

    # no garage
    dataset["GarageArea"].fillna(0, inplace=True)
    dataset["GarageCars"].fillna(0, inplace=True)
    dataset["GarageCond"].fillna("None", inplace=True)
    dataset["GarageFinish"].fillna("None", inplace=True)
    dataset["GarageQual"].fillna("None", inplace=True)
    dataset["GarageType"].fillna("None", inplace=True)
    dataset["GarageYrBlt"].fillna("None", inplace=True)

    # "typical" kitchen
    dataset["KitchenQual"].fillna("TA", inplace=True)

    # lot frontage (no explanation for NA values, perhaps no frontage)
    dataset["LotFrontage"].fillna(0, inplace=True)

    # Masonry veneer (no explanation for NA values, perhaps no masonry veneer)
    dataset["MasVnrArea"].fillna(0, inplace=True)
    dataset["MasVnrType"].fillna("None", inplace=True)

    # most common value
    dataset["MSZoning"].fillna("RL", inplace=True)

    # no misc features
    dataset["MiscFeature"].fillna("None", inplace=True)

    # description says NA = no pool, but there are entries with PoolArea >0 and PoolQC = NA. Fill the ones with values with average condition
    dataset.loc[(dataset['PoolQC'].isnull()) & (dataset['PoolArea']==0), 'PoolQC' ] = 'None'
    dataset.loc[(dataset['PoolQC'].isnull()) & (dataset['PoolArea']>0), 'PoolQC' ] = 'TA'

    # classify missing SaleType as other
    dataset["SaleType"].fillna("WD", inplace=True)

    # most common
    dataset["Utilities"].fillna("AllPub", inplace=True)

    
    ### feature engineering ###
    # create new binary variables: assign 1 to mode
#     dataset["IsRegularLotShape"] = (dataset["LotShape"] == "Reg") * 1
#     dataset["IsLandLevel"] = (dataset["LandContour"] == "Lvl") * 1
#     dataset["IsLandSlopeGentle"] = (dataset["LandSlope"] == "Gtl") * 1
#     dataset["IsElectricalSBrkr"] = (dataset["Electrical"] == "SBrkr") * 1
#     dataset["IsGarageDetached"] = (dataset["GarageType"] == "Detchd") * 1
#     dataset["IsPavedDrive"] = (dataset["PavedDrive"] == "Y") * 1
#     dataset["HasShed"] = (dataset["MiscFeature"] == "Shed") * 1
    # was the house remodeled? if yes, assign 1
#     dataset["Remodeled"] = (dataset["YearRemodAdd"] != dataset["YearBuilt"]) * 1
    # assign 1 to houses which were sold the same year they were remodeled
#     dataset["RecentRemodel"] = (dataset["YearRemodAdd"] == dataset["YrSold"]) * 1
    # assign 1 to houses which were sold the same year they were built
#     dataset["VeryNewHouse"] = (dataset["YearBuilt"] == dataset["YrSold"]) * 1

    
    ### normalization ###
    # normalize distribution for continuous variables with skew > 3
    continuous_vars = ['1stFlrSF', '2ndFlrSF', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'EnclosedPorch',\
                'GarageArea', 'GrLivArea', 'LotArea', 'LotFrontage', 'MasVnrArea', 'MiscVal',\
                'OpenPorchSF', 'PoolArea', 'ScreenPorch', 'TotalBsmtSF', 'WoodDeckSF']
    skew_threshold = 3
    for entry in continuous_vars:
        if dataset[entry].skew() > skew_threshold:
            dataset[entry] = np.log1p(dataset[entry])
    
    
    ### standardization ###
    # standardization for continuous variables
    sub_df = dataset[continuous_vars]
    array_standard = StandardScaler().fit_transform(sub_df)
    df_standard = pd.DataFrame(array_standard, dataset.index, continuous_vars)
    dataset.drop(dataset[continuous_vars], axis=1, inplace=True)
    dataset = pd.concat([dataset, df_standard], axis=1)
    
    
    ### dummies ###
    # split back to training and test set
    df_train_len = len(df_train)
    df_dummies =  pd.get_dummies(dataset)
    df_train = df_dummies[:df_train_len]
    df_test = df_dummies[df_train_len:]

    return df_train, df_test, y_train

In [42]:
df_train = pd.read_csv('/home/voshkanov/house-prices-datasets/train.csv', index_col='Id')



In [43]:
df_test = pd.read_csv('/home/voshkanov/house-prices-datasets/test.csv', index_col='Id')

In [44]:
df_train["SaleType"].value_counts

<bound method IndexOpsMixin.value_counts of Id
1        WD
2        WD
3        WD
4        WD
5        WD
6        WD
7        WD
8        WD
9        WD
10       WD
11       WD
12      New
13       WD
14      New
15       WD
16       WD
17       WD
18       WD
19       WD
20      COD
21      New
22       WD
23       WD
24       WD
25       WD
26       WD
27       WD
28       WD
29       WD
30       WD
       ... 
1431     WD
1432     WD
1433     WD
1434     WD
1435     WD
1436    COD
1437     WD
1438    New
1439     WD
1440     WD
1441     WD
1442     WD
1443     WD
1444     WD
1445     WD
1446     WD
1447     WD
1448     WD
1449     WD
1450     WD
1451     WD
1452    New
1453     WD
1454     WD
1455     WD
1456     WD
1457     WD
1458     WD
1459     WD
1460     WD
Name: SaleType, Length: 1460, dtype: object>

In [45]:
df_train, df_test, y_train = preprocessing(df_train, df_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [46]:

mapper = DataFrameMapper([(['LotConfig'], pp.LabelBinarizer()),
                          (['MSSubClass'], pp.MinMaxScaler()),
                          (['LotFrontage'], pp.MinMaxScaler()),
                          (['MSZoning'], pp.LabelBinarizer()),
                          (['LotArea'], pp.MinMaxScaler()),
                          (['Street'], pp.LabelBinarizer()),
                          (['Utilities'], pp.LabelBinarizer()),
                          (['Alley'], pp.LabelBinarizer()),
                          (['LotShape'], pp.LabelBinarizer()),
                          (['LandContour'], pp.LabelBinarizer()),
                          (['Neighborhood'], pp.OrdinalEncoder()),
                          (['LotConfig'], pp.LabelBinarizer()),
                          (['LandSlope'], pp.LabelBinarizer()),
                          (['Condition1'], pp.LabelBinarizer()),
                          (['Condition2'], pp.LabelBinarizer()),
                          (['BldgType'], pp.LabelBinarizer()),
                          (['HouseStyle'], pp.LabelBinarizer()),
                          (['OverallQual'], pp.StandardScaler()),
                          (['OverallCond'], pp.MinMaxScaler()),
                          (['YearBuilt'], pp.MinMaxScaler()),
                          (['YearRemodAdd'], pp.MinMaxScaler()),
                          (['RoofStyle'], pp.LabelBinarizer()),
                          (['RoofMatl'], pp.LabelBinarizer()),
                          (['Exterior1st'], pp.OrdinalEncoder()),
                          (['Exterior2nd'], pp.OrdinalEncoder()),
                          (['MasVnrArea'], pp.MinMaxScaler()),
                          (['MasVnrType'], pp.LabelBinarizer()),
                          (['ExterQual'], pp.LabelBinarizer()),
                          (['ExterCond'], pp.LabelBinarizer()),
                          (['Foundation'], pp.LabelBinarizer()),
                          (['BsmtQual'], pp.LabelBinarizer()),
                          (['BsmtCond'], pp.LabelBinarizer()),
                          (['BsmtExposure'], pp.LabelBinarizer()),
                          (['BsmtFinType1'], pp.LabelBinarizer()),
                          (['BsmtFinSF1'], pp.MinMaxScaler()),
                          (['BsmtFinType2'], pp.LabelBinarizer()),
                          (['BsmtFinSF2'], pp.MinMaxScaler()),
                          (['BsmtUnfSF'], pp.MinMaxScaler()),
                          (['TotalBsmtSF'], pp.StandardScaler()),
                          (['Heating'], pp.LabelBinarizer()),
                          (['HeatingQC'], pp.LabelBinarizer()),
                          (['CentralAir'], pp.LabelBinarizer()),
                          (['Electrical'], pp.LabelBinarizer()),
                          (['1stFlrSF'], pp.StandardScaler()),
                          (['2ndFlrSF'], pp.MinMaxScaler()),
                          (['LowQualFinSF'], pp.MinMaxScaler()),
                          (['GrLivArea'], pp.StandardScaler()),
                          (['BsmtFullBath'], pp.MinMaxScaler()),
                          (['BsmtHalfBath'], pp.MinMaxScaler()),
                          (['FullBath'], pp.MinMaxScaler()),
                          (['HalfBath'], pp.MinMaxScaler()),
                          (['BedroomAbvGr'], pp.MinMaxScaler()),
                          (['KitchenAbvGr'], pp.MinMaxScaler()),
                          (['KitchenQual'], pp.LabelBinarizer()),
                          (['TotRmsAbvGrd'], pp.MinMaxScaler()),
                          (['Functional'], pp.LabelBinarizer()),
                          (['Fireplaces'], pp.MinMaxScaler()),
                          (['FireplaceQu'], pp.LabelBinarizer()),
                          (['GarageType'], pp.LabelBinarizer()),
                          (['GarageYrBlt'], pp.MinMaxScaler()),
                          (['GarageFinish'], pp.LabelBinarizer()),
                          (['GarageCars'], pp.MinMaxScaler()),
                          (['GarageArea'], pp.StandardScaler()),
                          (['GarageQual'], pp.LabelBinarizer()),
                          (['GarageCond'], pp.LabelBinarizer()),
                          (['PavedDrive'], pp.LabelBinarizer()),
                          (['WoodDeckSF'], pp.MinMaxScaler()),
                          (['OpenPorchSF'], pp.MinMaxScaler()),
                          (['EnclosedPorch'], pp.MinMaxScaler()),
                          (['3SsnPorch'], pp.MinMaxScaler()),
                          (['ScreenPorch'], pp.MinMaxScaler()),
                          (['PoolArea'], pp.MinMaxScaler()),
                          (['PoolQC'], pp.LabelBinarizer()),
                          (['Fence'], pp.LabelBinarizer()),
                          (['MiscFeature'], pp.LabelBinarizer()),
                          (['MiscVal'], pp.MinMaxScaler()),
                          (['MoSold'], pp.MinMaxScaler()),
                          (['YrSold'], pp.MinMaxScaler()),
                          (['SaleType'], pp.LabelBinarizer()),
                          (['SaleCondition'], pp.LabelBinarizer())])

In [47]:
#type(mapper)

In [48]:
#data = mapper.fit_transform(df_train)

In [49]:
X_train, X_test, y_train, y_test = train_test_split(df_train, y_train, test_size=0.1)

In [50]:

model = LassoCV(eps=10**-7, n_alphas=75)
model.fit(X_train, y_train)
result = model.predict(X_test)

mean_absolute_error(np.expm1(result), np.expm1(y_test))





14047.964865362665

In [36]:
#test_data = mapper.transform(df_test)

In [37]:
result

array([11.98976006, 12.07139589, 12.44466268, 12.02886759, 11.4148108 ,
       11.59907844, 11.57922795, 11.85731117, 12.39863497, 12.03316082,
       12.65034119, 12.22069645, 11.97514158, 11.6527397 , 11.54366217,
       11.64905706, 11.995924  , 11.90949666, 12.34606891, 13.49440151,
       12.03102659, 11.72289228, 11.34400545, 11.71030595, 12.75363082,
       12.28364988, 12.05252577, 11.94937929, 11.65385206, 12.30679039,
       11.99753528, 12.34324024, 12.38127777, 11.83412295, 11.8348353 ,
       11.52847727, 12.31781581, 12.33039833, 12.4810971 , 12.0053547 ,
       12.54818474, 12.35789385, 12.33341977, 12.22715106, 11.31691599,
       11.93464791, 11.66880756, 12.21826317, 11.63383609, 11.78105916,
       11.29244964, 12.40283706, 12.57816988, 11.85532445, 12.34727073,
       12.28499675, 12.40721011, 12.07748688, 11.80828592, 11.94780815,
       11.82366422, 12.40781534, 11.42230052, 12.06646209, 12.08822316,
       11.6118051 , 11.7597301 , 11.88415288, 12.19567544, 11.61

In [15]:
submission = pd.DataFrame({"Id": df_test.index.values, "SalePrice": np.expm1(result)})
submission.to_csv('result.csv', index=False)


ValueError: arrays must all be same length

In [120]:
enc = OrdinalEncoder()

In [121]:
enc.fit(df_train)

OrdinalEncoder(categories='auto', dtype=<class 'numpy.float64'>)