In [2]:
import pandas as pd
pd.set_option('display.max_columns', 500)

from pandas import Series
import numpy as np
import scipy as sp
import sklearn
import copy

import random as rnd
import time

import warnings
warnings.filterwarnings('ignore')
 
import xgboost as xgb
from xgboost import XGBRegressor
import lightgbm as lgb

from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
from pandas.plotting import scatter_matrix
import missingno as msno

#Configure Visualization Defaults
%matplotlib inline
mpl.style.use('seaborn')

In [3]:
df_train = pd.read_csv('C:/Users/lenovo/aiffel/data/kaggle_data/data/train.csv')
df_test = pd.read_csv('C:/Users/lenovo/aiffel/data/kaggle_data/data/test.csv')

In [4]:
df_train['YYYYMM'] = df_train.date.str[:6]
df_test['YYYYMM'] = df_test.date.str[:6]

In [5]:
df_train['YYYYMM'] = df_train['YYYYMM'].astype('int')
df_test['YYYYMM'] = df_test['YYYYMM'].astype('int')

In [6]:
df_zipsum = df_train[['zipcode','sqft_lot']].groupby('zipcode', as_index = False).sum()
df_zipcount = df_train[['zipcode','price']].groupby('zipcode', as_index=False).sum()
df_zip_avg = pd.merge(df_zipsum, df_zipcount, on='zipcode')
df_zip_avg['zip_avg_price'] = df_zip_avg['price'] / df_zip_avg['sqft_lot'] 

df_train = pd.merge(df_train, df_zip_avg[['zipcode','zip_avg_price']], on ='zipcode', how='left')
df_test =  pd.merge(df_test,  df_zip_avg[['zipcode','zip_avg_price']], on ='zipcode', how='left')

In [7]:
df_train['zipvalue'] =  df_train['zip_avg_price']*df_train['sqft_lot']
df_test['zipvalue'] =  df_test['zip_avg_price']*df_test['sqft_lot']

In [8]:
df_zip_mean = df_train[['zipcode','price']].groupby('zipcode', as_index=False).mean()
df_zip_mean.rename(columns={'price':'zip_avg_house_price'}, inplace=True)
df_train = pd.merge(df_train, df_zip_mean[['zipcode','zip_avg_house_price']], on ='zipcode', how='left')
df_test = pd.merge(df_test, df_zip_mean[['zipcode','zip_avg_house_price']], on ='zipcode', how='left')

In [9]:
df_train['totallot'] = df_train['sqft_above'] + df_train['sqft_basement']
df_test['totallot'] = df_test['sqft_above'] + df_test['sqft_basement']

In [15]:
df_train['price'] = np.log1p(df_train['price'])

In [17]:
drop_column = ['id','date','zipcode', 'sqft_lot', 'sqft_living', 'sqft_basement'] #

df_train.drop(drop_column, axis=1, inplace = True)
df_test.drop(drop_column, axis=1, inplace = True)

In [18]:
X_train_data = df_train.drop('price', axis=1).values
X_train_label = df_train['price'].values
X_test_data = df_test.values

alg = XGBRegressor(
                     base_score=0.5,              
                     booster='gbtree',            
                     colsample_bytree=1,          
                     importance_type='gain',      
                     max_depth=3,                 
                     min_child_weight=1,          
                     n_estimators=10000,          
                     n_jobs=1,                   
                     scale_pos_weight=1,         
                     silent=True,               
                     gamma=0,                    
                     random_state=0,              
                     reg_alpha=0,                
                     reg_lambda=1,                   
                     subsample=1,                     
                     learning_rate=0.1           
                  )


test_acc = []
xgb_prediction = 0 
splits_cnt = 5
kf = KFold(n_splits=splits_cnt ,random_state=0 ,shuffle=True)
for fold_no,(train_index, test_index) in enumerate(kf.split(df_train)):
    
    train_data, test_data = X_train_data[train_index],  X_train_data[test_index]
    train_label, test_label = X_train_label[train_index], X_train_label[test_index]
    

    alg.fit(
              train_data, train_label
            , eval_set = [(train_data,train_label),(test_data, test_label)] 
            , eval_metric = 'rmse'        
            , verbose=0                   
            , early_stopping_rounds=500  
           )
    
    
    predict = alg.predict(test_data)
    predict_value = np.sqrt(mean_squared_error(np.exp(test_label),np.exp(predict))).round().astype(int)
    test_acc.append(predict_value)
    print(predict_value)
    
   
    predictions = alg.predict(X_test_data)
    xgb_prediction += np.exp(predictions)
    
print(sum(test_acc)/len(test_acc))   
xgb_final = xgb_prediction / splits_cnt    

Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


107567
Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


123335
Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


109982
Parameters: { "silent" } might not be us

In [19]:
submission = pd.read_csv('C:/Users/lenovo/aiffel/data/kaggle_data/data/sample_submission.csv')

In [20]:
submission['price'] = xgb_final

In [22]:
submission.to_csv('C:/Users/lenovo/aiffel/data/kaggle_data/data/xgb_final_to_verify.csv', index=False)

직접 여러번 코드를 돌려본 결과, 단순히 모델링이 좋다고 해서 점수가 11만점 밑으로 내려갈 것 같지는 않았다.   
실제로 그리드 서치로 엄청나게 돌려봤지만, 11만점 밑으로는 계속 나오지 않았다.   
그래서 코드를 참조해서 피쳐엔지니어링을 조금 해보는게 좋아보였다.   
아래 링크는 참조한 코드 주소이며, 참조한 이후에 돌려보니 11만점 밑으로 나오게 되었다.   
확실히 모델링 만으로는 한계가 있으며, 그 전에 피쳐 엔지니어링등 전처리 기법들이 중요하다는 것을 다시한번 느꼈다.
https://www.kaggle.com/code/hongodd/kakr-2nd-house-price-prediction