In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the competition files: labelled training rows, unlabelled test rows,
# and the sample submission that serves as the output template.
input_dir = '/kaggle/input/house-prices-advanced-regression-techniques'
train = pd.read_csv(f'{input_dir}/train.csv')
test = pd.read_csv(f'{input_dir}/test.csv')
submission = pd.read_csv(f'{input_dir}/sample_submission.csv')

In [None]:
# Display the (rows, columns) shape of the train and test frames as a tuple.
train.shape, test.shape

In [None]:
# Stack train and test into one frame so the cleaning steps below apply to both.
# Each file keeps its own row labels here, so index values repeat until the index is reset.
data = pd.concat([train, test])

In [None]:
# Replace the row labels with a fresh 0..n-1 index; drop=True discards the old labels
# instead of keeping them as a column.
data = data.reset_index(drop=True)

In [None]:
# Display the combined train + test frame.
data

In [None]:
def missing_data(df):
    """Summarise missing values per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with two columns: ``total`` (number of
        null entries) and ``percent`` (share of null entries in percent,
        rounded to two decimals), sorted by ``total`` in descending order.
    """
    null_counts = df.isnull().sum()
    null_share = (null_counts / len(df) * 100).round(2)

    summary = pd.concat([null_counts, null_share], axis=1, keys=['total', 'percent'])
    return summary.sort_values(by='total', ascending=False)

# Show the 30 columns with the most missing values in the combined frame.
missing_data(data).head(30)

In [None]:
# List the distinct values that occur in the BsmtFullBath column.
data['BsmtFullBath'].unique()

# Ordinal variables (with many missing values) ==> use a different encoding method

In [None]:
# Mark missing quality ratings with the literal string 'NaN'.
quality_cols = [
    'PoolQC',       # pool quality
    'Fence',        # fence quality
    'FireplaceQu',  # fireplace quality
]
data[quality_cols] = data[quality_cols].fillna('NaN')

# Nominal variables (with many missing values) ==> label encoding

In [None]:
# Mark missing nominal values with the literal string 'NaN'.
nominal_cols = [
    'MiscFeature',  # miscellaneous feature not covered by the other categories
    'Alley',        # alley paving type (?)
]
data[nominal_cols] = data[nominal_cols].fillna('NaN')

# Handling the 'Garage'-related variables

In [None]:
# Every column whose name contains 'Garage'.
garage_col = [col for col in data.columns if 'Garage' in col]

# Text columns get the 'NaN' placeholder string; all other columns get 0.
for name in garage_col:
    placeholder = 'NaN' if data[name].dtype == 'object' else 0
    data[name] = data[name].fillna(placeholder)

# Handling the Exterior variables

In [None]:
# Inspect the rows where both exterior-covering columns are missing.
# Columns are selected by name: a positional slice such as 30:30 is empty and
# would display the matching rows with no columns at all.
both_exterior_missing = data['Exterior1st'].isnull() & data['Exterior2nd'].isnull()
data.loc[both_exterior_missing, ['Id', 'Exterior1st', 'Exterior2nd']]

In [None]:
# Fill both exterior-covering columns with the 'NaN' placeholder string.
# Exterior2nd is included so that it does not stay as a real NaN in a categorical column.
exterior_cols = ['Exterior1st', 'Exterior2nd']
data[exterior_cols] = data[exterior_cols].fillna('NaN')

# Handling the BsmtFinType variables

In [None]:
# Rows that have a BsmtFinType1 value but no BsmtFinType2 value are assigned 'Rec'.
type2_only_missing = data['BsmtFinType2'].isnull() & data['BsmtFinType1'].notnull()
data.loc[type2_only_missing, 'BsmtFinType2'] = 'Rec'

In [None]:
# Any basement finish type that is still missing gets the 'NaN' placeholder string.
bsmt_type_cols = ['BsmtFinType1', 'BsmtFinType2']
data[bsmt_type_cols] = data[bsmt_type_cols].fillna('NaN')

# Version that runs CatBoost without feature engineering

In [None]:
# Impute missing LotFrontage with the column median.
# The median is taken over the combined frame, i.e. train and test rows together.
lot_frontage_median = data['LotFrontage'].median()
data['LotFrontage'] = data['LotFrontage'].fillna(lot_frontage_median)

In [None]:
# Zero-fill the remaining float columns. SalePrice is left untouched because
# its missing values are what later separate test rows from training rows.
float_cols = data.select_dtypes('float').columns
for col in float_cols[float_cols != 'SalePrice']:
    data[col] = data[col].fillna(0.)

In [None]:
# Build the model inputs from the combined frame.
# Rows with a SalePrice are training rows; rows without one are the test rows.
features = data.drop(columns=['Id', 'SalePrice'])

# Several object columns are not handled by the cleaning cells above and still
# contain real NaN at this point. CatBoost does not accept NaN in categorical
# features, so fill them with the same 'NaN' placeholder string used elsewhere.
object_cols = features.select_dtypes('object').columns
features[object_cols] = features[object_cols].fillna('NaN')

is_labeled = data['SalePrice'].notnull()
x_train = features.loc[is_labeled]
y_train = data.loc[is_labeled, 'SalePrice']
x_test = features.loc[~is_labeled]

x_train.shape, y_train.shape, x_test.shape

In [None]:
import catboost
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error

In [None]:
from sklearn.model_selection import KFold

# Cross-validated CatBoost regression; the test prediction is the average of
# the per-fold models' predictions.
#
# SalePrice is a continuous target. A stratified splitter would treat each
# distinct price as its own class, so a plain shuffled KFold is used instead.
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=40)

# Object-dtype columns are passed to CatBoost as categorical features (by name).
cat_features = x_train.select_dtypes('object').columns.tolist()

# Accumulator for the fold-averaged test predictions.
preds_cat = np.zeros(len(x_test))

for train_index, valid_index in kf.split(x_train):
    X_train, X_valid = x_train.iloc[train_index], x_train.iloc[valid_index]
    Y_train, Y_valid = y_train.iloc[train_index], y_train.iloc[valid_index]

    model = catboost.CatBoostRegressor(n_estimators=3000, eval_metric='RMSE')

    model.fit(X_train, Y_train, cat_features=cat_features, eval_set=[(X_train, Y_train), (X_valid, Y_valid)], verbose=100, early_stopping_rounds=50)

    # Add this fold's share so that, after the loop, preds_cat is the mean over
    # all folds (plain assignment would keep only the last fold's prediction).
    preds_cat += model.predict(x_test) / n_splits

In [None]:
# Put the test predictions into the sample-submission frame and write it to
# submission.csv in the working directory, without the index column.
submission['SalePrice'] = preds_cat
submission.to_csv('submission.csv', index=False)

# Simply fill the categorical variables with 'NaN'

In [None]:
# Fill every remaining gap in the object-dtype columns with the 'NaN' placeholder string.
# NOTE(review): this cell sits after the training and submission cells, so on a
# top-to-bottom run it does not change the x_train / x_test frames built earlier.
object_cols = data.select_dtypes('object').columns
data[object_cols] = data[object_cols].fillna('NaN')