In [16]:
import pandas as pd
import numpy as np
import os 

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import KFold

from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

from sklearn.metrics import mean_squared_error

# 모든 컬럼을 보여주는 함수
from IPython.display import display
pd.options.display.max_columns = None

# 시드 고정 
np.random.seed(42)

print(os.listdir('./dataset/house_price/'))

['data_description.txt', 'sample_submission.csv', 'test.csv', 'train.csv']


# Load Data

In [3]:
train = pd.read_csv('./dataset/house_price/train.csv')
test = pd.read_csv('./dataset/house_price/test.csv')

print('train shape : ', train.shape)
print('test shape :', test.shape)

train shape :  (1460, 81)
test shape : (1459, 80)


# Information

In [4]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,Ex,Y,SBrkr,920,866,0,1786,1,0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756,GasA,Gd,Y,SBrkr,961,756,0,1717,1,0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,Unf,3,642,TA,TA,Y,0,35,272,0,0,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1,0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,RFn,3,836,TA,TA,Y,192,84,0,0,0,0,,,,0,12,2008,WD,Normal,250000


# Preprocessing

In [5]:
label = train['SalePrice']
train = train.drop(['Id', 'SalePrice'], axis=1)
test = test.drop(['Id'], axis=1)

In [6]:
# 결측값이 10% 이상인 변수 제거 
def preprocessing(data):
    data_missing = pd.DataFrame(data.isnull().sum() / data.shape[0] * 100)
    data_missing.columns = ['missing_values']
    
    data_missing = data_missing.loc[data_missing['missing_values']!=0]
    data_missing = data_missing.sort_values('missing_values',  ascending=False)
    
    data.drop(['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'LotFrontage'], axis = 1, inplace=True)
    
preprocessing(train)
preprocessing(test)

In [7]:
# 범주형, 수치형 변수 구분
def is_digit(str):
    try:
        tmp = float(str)
        return True
    except ValueError:
        return False
    
feature_list = train.columns

category_feature = []
continous_feature = []

for feature in feature_list:
    if is_digit(train[feature][0]) == True:
        continous_feature.append(feature)
    else :
        category_feature.append(feature)

In [8]:
# 변수 선택 class
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

# Modeling

In [10]:
# 수치형 변수 파이프 라인
continous_model = Pipeline([
    ('selector', DataFrameSelector(continous_feature)),
    ('imputer', Imputer(strategy="median")),
    ('scaler', StandardScaler()),
])

# 범주형 변수 파이프 라인
category_model = Pipeline([
    ('selector', DataFrameSelector(category_feature)),
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(categories = 'auto'))
])

# 전체 파이프 라인
model_pipeline = FeatureUnion(transformer_list=[
        ("continous_model", continous_model),
        ("category_model", category_model)
])

train_prepared = model_pipeline.fit_transform(train)
train_prepared

test_prepared = model_pipeline.fit_transform(test)
test_prepared



<1459x270 sparse matrix of type '<class 'numpy.float64'>'
	with 106507 stored elements in Compressed Sparse Row format>

In [11]:
print('train_prepared : ', train_prepared.shape)
print('test_prepared : ', test_prepared.shape)

train_prepared :  (1460, 280)
test_prepared :  (1459, 270)


In [12]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(train_prepared, label)
pred = lin_reg.predict(train_prepared)

In [15]:
lin_mse = mean_squared_error(label, pred)
lin_rmse = np.sqrt(lin_mse)

lin_rmse

20868.54516193148