In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
Id               1460 non-null int64
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1201 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            91 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-n

In [4]:
cleanup_store = {} # can be reused for test set
def data_cleanup(df):
    # drop columns with lot of missing values
    # cols with less than 30% non null values
    cols = df.columns[(df.count() / df.shape[0]) < 0.3]
    df.drop(columns=cols, inplace=True)
    
    # print(df[['LotFrontage']].describe())
    # we can impute LotFrontage with mean value
    df['LotFrontage'] = df['LotFrontage'].apply(lambda x: df['LotFrontage'].mean() if np.isnan(x) else x)
    if 'LotFrontage' not in cleanup_store:
        cleanup_store['LotFrontage'] = df['LotFrontage'].mean()
    
    # GarageYrBlt should be same as year built
    # sns.lineplot(x=df['YearBuilt'], y=df['GarageYrBlt'])
    # plt.show()
    # turns out before 1960 yearbuilt lot of garages were built later
    # better to drop the rows with Garage year built missing
    
    # drop missing values
#     cols = ['GarageYrBlt', 'BsmtExposure', 'BsmtFinType2', 'BsmtQual', 'BsmtCond', 'BsmtFinType1']
#     for c in cols:
#         df.drop(df[df[c].isnull()].index, axis=0, inplace=True)
    
    # print(df['MasVnrType'].describe())
    # print(df['Electrical'].describe())
    # print(df['MasVnrArea'].describe())
    # impute with mode values
    cols = ['MasVnrType', 'Electrical', 'MasVnrArea', 'GarageYrBlt', 
            'BsmtExposure', 'BsmtFinType2', 'BsmtQual', 'BsmtCond', 'BsmtFinType1',
            'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']
    for c in cols:
        df[c].fillna(df[c].mode()[0], inplace=True)
        if c not in cleanup_store:
            cleanup_store[c] = df[c].mode()[0]
    
    # TODO: impute FireplaceQu with iterative imputer
    df.drop(columns=['FireplaceQu'], inplace=True)
    
    return df

df_clean = data_cleanup(df.copy())

cols = list(df_clean.columns[(df_clean.isnull().sum() / df_clean.shape[0]) > 0])
print('Remaining {} out of {}'.format(df_clean.shape[0], df.shape[0]))
(df_clean[cols].count() / df_clean.shape[0]).sort_values()

Remaining 1460 out of 1460


Series([], dtype: float64)

In [5]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 76 columns):
Id               1460 non-null int64
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1460 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-null object
Exterior2nd      1460 non

In [6]:
from sklearn.preprocessing import StandardScaler

def data_transform(df):
    # print(df.select_dtypes(include="object").columns)
    categorical_cols = list(df.select_dtypes(include="object").columns)
    for c in categorical_cols:
        dummy_cols = pd.get_dummies(df[c], drop_first=True, prefix=c)
        df = pd.concat([df, dummy_cols], axis=1)
        df.drop(columns=[c], inplace=True)
        
    df.drop(columns=['Id'], inplace=True)

    return df

df_transformed = data_transform(df_clean.copy())
print(df_transformed.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Columns: 233 entries, MSSubClass to SaleCondition_Partial
dtypes: float64(3), int64(34), uint8(196)
memory usage: 701.6 KB
None


In [7]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso

In [8]:
y = df_transformed[['SalePrice']]
X = df_transformed.drop(columns=['SalePrice'])

xscaler = StandardScaler()
yscaler = StandardScaler()

def data_scale(X, y):
    xcols = X.columns
    X = pd.DataFrame(xscaler.fit_transform(X), columns=xcols)
    y = pd.DataFrame(yscaler.fit_transform(y))
    return X, y

X, y = data_scale(X, y)
print(X.shape)
print(y.shape)

(1460, 232)
(1460, 1)


In [9]:
param = {'alpha': [0.0001, 0.001, 0.005, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.9, 1]}
model = GridSearchCV(estimator=Lasso(),
                     param_grid=param,
                     cv=5,
                     scoring='r2')
model.fit(X, y)

  positive)
  positive)
  positive)


GridSearchCV(cv=5, error_score=nan,
             estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=1000, normalize=False, positive=False,
                             precompute=False, random_state=None,
                             selection='cyclic', tol=0.0001, warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'alpha': [0.0001, 0.001, 0.005, 0.01, 0.1, 0.2, 0.3,
                                   0.4, 0.5, 0.6, 0.9, 1]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='r2', verbose=0)

In [10]:
print(model.best_score_)

0.8358612840493607


In [11]:
from sklearn.metrics import mean_squared_error
print(f'RMSE: {np.sqrt(mean_squared_error(model.predict(X), y))}')

RMSE: 0.3322086200713321


In [12]:
print(model.best_params_)

{'alpha': 0.01}


In [13]:
test_df = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')
print(test_df.shape)
test_df.head()

(1459, 80)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [14]:
cleaned_test_df = data_cleanup(test_df.copy())
cleaned_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 75 columns):
Id               1459 non-null int64
MSSubClass       1459 non-null int64
MSZoning         1455 non-null object
LotFrontage      1459 non-null float64
LotArea          1459 non-null int64
Street           1459 non-null object
LotShape         1459 non-null object
LandContour      1459 non-null object
Utilities        1457 non-null object
LotConfig        1459 non-null object
LandSlope        1459 non-null object
Neighborhood     1459 non-null object
Condition1       1459 non-null object
Condition2       1459 non-null object
BldgType         1459 non-null object
HouseStyle       1459 non-null object
OverallQual      1459 non-null int64
OverallCond      1459 non-null int64
YearBuilt        1459 non-null int64
YearRemodAdd     1459 non-null int64
RoofStyle        1459 non-null object
RoofMatl         1459 non-null object
Exterior1st      1458 non-null object
Exterior2nd      1458 non

In [15]:
# Addtional cleanup on test data
# fill all with mode values
cols = cleaned_test_df.columns[cleaned_test_df.isnull().sum()>0]
for c in cols:
    cleaned_test_df[c].fillna(df[c].mode()[0], inplace=True)
cleaned_test_df.columns[cleaned_test_df.isnull().sum()>0]

Index([], dtype='object')

In [16]:
transformed_test_df = data_transform(cleaned_test_df.copy())
transformed_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Columns: 216 entries, MSSubClass to SaleCondition_Partial
dtypes: float64(11), int64(25), uint8(180)
memory usage: 666.9 KB


In [17]:
def col_normalisation(train_df, test_df):
    train_cols = train_df.columns
    test_cols = test_df.columns
    print(f'Before: train cols {train_df.shape[1]}, test cols {test_df.shape[1]}')

    # print([c for c in train_cols if c not in test_cols])
    missing_cols = [c for c in train_cols if c not in test_cols]
    for c in missing_cols:
        test_df[c] = 0
    print(f'1: train cols {train_df.shape[1]}, test cols {test_df.shape[1]}')
    
    # print([c for c in validation_cols if c not in train_cols])
    extra_cols = [c for c in test_cols if c not in train_cols]
    for c in extra_cols:
        test_df.drop(columns=c, inplace=True)
    print(f'2: train cols {train_df.shape[1]}, test cols {test_df.shape[1]}')
    
    return test_df

X_test = col_normalisation(X, transformed_test_df.copy())

Before: train cols 232, test cols 216
1: train cols 232, test cols 232
2: train cols 232, test cols 232


In [18]:
y_pred = model.predict(X_test)
y_pred[0:10]

array([1024.54840746, 1295.38220069, 1356.62767279, 1185.39483966,
        856.65189071, 1177.67198507,  988.09829208, 1048.27087116,
       1114.54051306,  894.73100598])

In [19]:
y_pred_inv = yscaler.inverse_transform(y_pred)
y_pred_inv[0:10]

array([8.15457320e+07, 1.03054077e+08, 1.07917904e+08, 9.43193984e+07,
       6.82121811e+07, 9.37060856e+07, 7.86510355e+07, 8.34296584e+07,
       8.86924814e+07, 7.12362452e+07])

In [20]:
submission_df = test_df[['Id']]
submission_df['SalePrice'] = y_pred_inv
print(submission_df.shape)
submission_df.head()

(1459, 2)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,Id,SalePrice
0,1461,81545730.0
1,1462,103054100.0
2,1463,107917900.0
3,1464,94319400.0
4,1465,68212180.0


In [21]:
submission_df.to_csv('submission.csv', index=False)