In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


In [3]:
train = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [5]:
print(train.shape)
print(test.shape)

(1460, 81)
(1459, 80)


In [6]:
null_cols = (train.isnull().sum() / len(train)).sort_values(ascending=False)[:20]
null_cols

PoolQC          0.995205
MiscFeature     0.963014
Alley           0.937671
Fence           0.807534
FireplaceQu     0.472603
LotFrontage     0.177397
GarageYrBlt     0.055479
GarageCond      0.055479
GarageType      0.055479
GarageFinish    0.055479
GarageQual      0.055479
BsmtFinType2    0.026027
BsmtExposure    0.026027
BsmtQual        0.025342
BsmtCond        0.025342
BsmtFinType1    0.025342
MasVnrArea      0.005479
MasVnrType      0.005479
Electrical      0.000685
Id              0.000000
dtype: float64

In [7]:
null_cols = null_cols[null_cols > 0.1].index
null_cols

Index(['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
       'LotFrontage'],
      dtype='object')

In [8]:
null_cols = list(null_cols)
null_cols

['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'LotFrontage']

In [9]:
null_cols.append('Id')
null_cols

['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'LotFrontage', 'Id']

In [10]:
train_processed = train.drop(null_cols, axis = 1)
test_processed = test.drop(null_cols, axis = 1)

In [11]:
print(train_processed.shape)
print(test_processed.shape)

(1460, 74)
(1459, 73)


In [12]:
correlation = pd.get_dummies(train_processed, drop_first=True).corr()['SalePrice']
correlation = abs(correlation)
correlation

MSSubClass               0.084284
LotArea                  0.263843
OverallQual              0.790982
OverallCond              0.077856
YearBuilt                0.522897
                           ...   
SaleCondition_AdjLand    0.050686
SaleCondition_Alloca     0.015525
SaleCondition_Family     0.046480
SaleCondition_Normal     0.153990
SaleCondition_Partial    0.352060
Name: SalePrice, Length: 232, dtype: float64

In [13]:
low_corr = correlation[correlation < 0.2].index
low_corr

Index(['MSSubClass', 'OverallCond', 'BsmtFinSF2', 'LowQualFinSF',
       'BsmtHalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'EnclosedPorch',
       '3SsnPorch', 'ScreenPorch',
       ...
       'SaleType_CWD', 'SaleType_Con', 'SaleType_ConLD', 'SaleType_ConLI',
       'SaleType_ConLw', 'SaleType_Oth', 'SaleCondition_AdjLand',
       'SaleCondition_Alloca', 'SaleCondition_Family', 'SaleCondition_Normal'],
      dtype='object', length=173)

In [14]:
pd.get_dummies(train_processed, drop_first = True).shape

(1460, 232)

In [15]:
train_processed = pd.get_dummies(train_processed, drop_first=True)
test_processed = pd.get_dummies(test_processed, drop_first=True)

In [16]:
print(train_processed.shape)
print(test_processed.shape)

(1460, 232)
(1459, 215)


In [17]:
missing_cols = set(train_processed.columns) - set(test_processed.columns)
missing_cols

{'Condition2_RRAe',
 'Condition2_RRAn',
 'Condition2_RRNn',
 'Electrical_Mix',
 'Exterior1st_ImStucc',
 'Exterior1st_Stone',
 'Exterior2nd_Other',
 'GarageQual_Fa',
 'Heating_GasA',
 'Heating_OthW',
 'HouseStyle_2.5Fin',
 'RoofMatl_CompShg',
 'RoofMatl_Membran',
 'RoofMatl_Metal',
 'RoofMatl_Roll',
 'SalePrice',
 'Utilities_NoSeWa'}

In [18]:
for col in missing_cols:
    test_processed[col] = 0

In [19]:
missing_cols = set(train_processed.columns) - set(test_processed.columns)
missing_cols

set()

In [20]:
for col in missing_cols:
    test_processed[col] = 0

In [21]:
print(train_processed.shape)
print(test_processed.shape)

(1460, 232)
(1459, 232)


In [22]:
train_processed.drop(low_corr, axis=1, inplace=True)
test_processed.drop(low_corr, axis=1, inplace=True)

In [23]:
print(train_processed.shape)
print(test_processed.shape)

(1460, 59)
(1459, 59)


In [27]:
X = train_processed.drop('SalePrice', axis=1)
Y = train_processed.SalePrice

In [29]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

In [32]:
imputer = SimpleImputer()
scaler = StandardScaler()

In [33]:
preprocess = Pipeline([('imputer', imputer), ('scaler', scaler)])

In [34]:
lr = LinearRegression()

In [35]:
pipeline = Pipeline([('preprocess', preprocess), ('lr', lr)])

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, train_size=0.8)

In [37]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('preprocess',
                 Pipeline(steps=[('imputer', SimpleImputer()),
                                 ('scaler', StandardScaler())])),
                ('lr', LinearRegression())])

In [38]:
pipeline['lr'].coef_

array([  4194.64693847,  18376.65087811,   1818.05786796,   6832.87905773,
         5571.38068343,    -66.38576058,  -2702.79454904,   6655.87280044,
        29669.59368587,  40536.34139442, -20904.82694993,   1840.39502972,
         1819.32126315,    608.58613958,   -102.2652063 ,   2882.551883  ,
        -1890.3712078 ,   9665.35466584,  -2400.49552223,   2704.39273184,
         1509.16220684,  -1694.36255294,  -6150.6411468 ,  -1616.0182625 ,
         7555.99448559,   5582.83744491,   5264.1111791 ,  -6849.60561409,
         -403.52336139,   1262.70357168,  -5823.55126688,   6105.7541173 ,
         4724.32277596,    828.8771313 ,  -5354.18588617,  -7480.70659069,
         4303.47014036,   4090.96411016,  -8617.63924212,  -5580.9344045 ,
         5493.58746771,  -1857.82242589,   3659.74894066,  -2028.21806833,
         2146.8619257 ,   -218.91559251, -12133.4730754 , -11890.84593572,
         4164.58140807,   2343.30382357,   2574.33898575,   -567.80309033,
        -1378.7909055 ,  

In [39]:
pipeline.score(X_test, y_test)

0.7741262545164016

In [40]:
test_processed.drop('SalePrice', axis=1, inplace=True)
pred = pipeline.predict(test_processed)

In [41]:
submission = pd.DataFrame({
    'Id': test.Id,
    'SalePrice': pred
})
submission.head()

Unnamed: 0,Id,SalePrice
0,1461,108080.797497
1,1462,151186.023836
2,1463,178059.004581
3,1464,187042.662437
4,1465,226950.114199


In [42]:
submission.to_csv('output.csv', index=False)