## 집 값 예측
- 예측할 변수 ['SalePrice']
- 평가: rmse, r2

    - rmse는 낮을수록 좋은 성능
    - r2는 높을수록 좋은 성능
   

In [None]:
# 시험환경 세팅 (코드 변경 X)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def exam_data_load(df, target, id_name="", null_name=""):
    """Split ``df`` into train/test feature frames and target frames.

    Args:
        df: Source DataFrame holding the features, the target column and,
            optionally, an id column.
        target: Name of the column to predict.
        id_name: Name of the id column. When empty, the row index is turned
            into a new ``"id"`` column and that column is used instead.
        null_name: Placeholder value that marks missing data. When non-empty,
            every cell equal to it is replaced with NaN.

    Returns:
        ``(X_train, X_test, y_train, y_test)``. The ``X_*`` frames contain the
        features with the id and target columns dropped; the ``y_*`` frames
        contain only the id and target columns. The split is 80/20, shuffled,
        with ``random_state=2021``.
    """
    if id_name == "":
        df = df.reset_index().rename(columns={"index": "id"})
        id_name = "id"

    if null_name != "":
        # mask() returns a new frame, so the DataFrame passed in by the caller
        # is left untouched (the in-place boolean assignment modified it).
        df = df.mask(df == null_name, np.nan)

    X_train, X_test = train_test_split(df, test_size=0.2, shuffle=True, random_state=2021)
    y_train = X_train[[id_name, target]]
    X_train = X_train.drop(columns=[id_name, target])
    y_test = X_test[[id_name, target]]
    X_test = X_test.drop(columns=[id_name, target])
    return X_train, X_test, y_train, y_test
    
# Read the house-prices training CSV (path is relative to the working directory).
df = pd.read_csv("../input/house-prices-advanced-regression-techniques/train.csv")
# Split into train/test features and targets; the y_* frames keep 'Id' next to 'SalePrice'.
X_train, X_test, y_train, y_test = exam_data_load(df, target='SalePrice', id_name='Id')

# Show the shapes of all four splits as a sanity check.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

# Data Load & Simple EDA

In [None]:
import pandas as pd

In [None]:
# Show the shapes of the train and test feature frames.
X_train.shape, X_test.shape

In [None]:
# Raise the column display limit to 100 so wide frames are not truncated.
pd.set_option("display.max_columns", 100)
# Preview the first three rows of each feature frame.
display(X_train.head(3))
display(X_test.head(3))

In [None]:
# Plot a histogram of the training target values.
y_train['SalePrice'].hist()

In [None]:
# Plot a histogram of the test target values.
y_test['SalePrice'].hist()

In [None]:
# Count missing values per column and show the 20 highest counts.
X_train.isna().sum().sort_values(ascending=False).head(20)

In [None]:
# Count missing values per column and show the 20 highest counts.
X_test.isna().sum().sort_values(ascending=False).head(20)

In [None]:
# Print the column dtypes and non-null counts of the training features.
X_train.info()

# Preprocessing

In [None]:
# Keep only the non-object columns, i.e. drop the object-dtype (string) features.
X_train = X_train.select_dtypes(exclude=['object'])
X_test = X_test.select_dtypes(exclude=['object'])
# Target Series used for fitting (y_train also carries the 'Id' column).
target = y_train['SalePrice']

In [None]:
# Preview the columns that remain after dropping object-dtype features.
X_train.head(3)

In [None]:
from sklearn.impute import SimpleImputer

# SimpleImputer with default settings fills missing values with the column mean.
imp = SimpleImputer()
# Fit on the training features only, then apply the same fill values to the test features.
X_train = imp.fit_transform(X_train)
X_test = imp.transform(X_test)

In [None]:
# Inspect the imputed training features.
X_train

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 15% of the training data for validation, using a fixed seed.
X_tr, X_val, y_tr, y_val = train_test_split(X_train, target, test_size=0.15, random_state=2022)
# Show the shapes of the resulting splits.
X_tr.shape, X_val.shape, y_tr.shape, y_val.shape

# Model

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

def rmse(y, y_pred):
    """Return the root mean squared error between ``y`` and ``y_pred``."""
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

In [None]:
# Baseline: XGBoost regressor with default hyperparameters, scored on the validation split.
model = XGBRegressor()
model.fit(X_tr, y_tr, verbose=False)
pred = model.predict(X_val)

print(f"R2 : {r2_score(y_val, pred)}")
print(f"RMSE : {rmse(y_val, pred)}")

In [None]:
# Baseline: random forest with default hyperparameters, scored on the validation split.
# The forest is seeded so the validation scores are reproducible between runs,
# in line with the seeded splits used elsewhere in this notebook.
model = RandomForestRegressor(random_state=2022)
model.fit(X_tr, y_tr)
pred = model.predict(X_val)

print("R2 : " + str(r2_score(y_val, pred)))
print("RMSE : " + str(rmse(y_val, pred)))

In [None]:
# Full training target used to fit the final model.
y = y_train['SalePrice']

In [None]:
# Refit XGBoost with default hyperparameters on all of the imputed training data.
final_model = XGBRegressor()
final_model.fit(X_train, y)

# Predict on the held-out test features.
prediction = final_model.predict(X_test)

## Prediction & to CSV

In [None]:
# Pair each test-row Id with its predicted sale price. The prediction column
# is named after the target being predicted ('SalePrice').
submission = pd.DataFrame(data={
    'Id': y_test.Id,
    'SalePrice': prediction,
})

In [None]:
# Preview the first rows of the submission frame.
submission.head()

In [None]:
# Write the submission to CSV without the DataFrame index column.
submission.to_csv("12345.csv", index=False)

# 결과 채점

In [None]:
# Score the final model's test predictions (`prediction`, produced by
# `final_model`) against the true sale prices.
print("RMSE : " + str(rmse(y_test['SalePrice'], prediction)))
print("R2 : " + str(r2_score(y_test['SalePrice'], prediction)))