In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [None]:
# Read the train and test tables from the ../input directory.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/wsrj-ykt2021-test/train.csv',
        '../input/wsrj-ykt2021-test/test.csv',
    )
)

In [None]:
# Preview the first five rows of the training table as loaded.
df_train.head(n=5)

In [None]:
# Remove the DwellingId column from the training table.
# errors='ignore' keeps this cell safe to re-run: df_train is rebound in place, so a
# second execution would otherwise raise KeyError on the already-missing column.
df_train = df_train.drop(columns=['DwellingId'], errors='ignore')

In [None]:
# Preview the first five rows again, now without the DwellingId column.
df_train.head(n=5)

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Regression target: the Price column of the training table.
target = df_train.loc[:, 'Price']

# Feature standardization

In [None]:
# Pick the five feature columns and standardize them with StandardScaler.
# The fitted scaler is kept so the same transform can be applied to the test table later.
# NOTE(review): the scaler is fitted on every row, i.e. before the train/hold-out split;
# refit it on the training split only if a scale-sensitive model is ever added.
scaler = StandardScaler()
X_real = df_train[['Rooms', 'Area', 'Floor', 'Total_Floor', 'Rating']]
X_real_scaled = scaler.fit_transform(X_real)

# Train/test split

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for validation.
# random_state pins the shuffle so that every MAE, plot and tuned hyper-parameter
# computed from this split is reproducible after a kernel restart (the models in this
# notebook already use seed 1).
X_train, X_test, y_train, y_test = train_test_split(
    X_real_scaled, target, test_size=0.3, random_state=1
)

# Model fitting

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

### RandomForestRegressor

In [None]:
# Sweep the forest size and record the hold-out MAE for each setting.
n_estimators = range(100, 600, 50)
scores = []

for n_estimator in n_estimators:
    # 'absolute_error' is the current name of the MAE split criterion; the old 'mae'
    # alias was deprecated in scikit-learn 1.0 and removed in 1.2.
    model = RandomForestRegressor(
        n_estimators=n_estimator,
        criterion='absolute_error',
        random_state=1)
    model.fit(X_train, y_train)
    # mean_absolute_error expects (y_true, y_pred).
    s = mean_absolute_error(y_test, model.predict(X_test))
    scores.append(s)

In [None]:
# Hold-out MAE as a function of the number of trees in the random forest.
plt.plot(n_estimators, scores)
plt.xlabel('n_estimators')
plt.ylabel('Hold-out MAE')
# Trailing semicolon suppresses the text repr of the last call in the cell output.
plt.title('RandomForestRegressor: MAE vs. n_estimators');

In [None]:
# Take the forest size with the lowest hold-out MAE from the sweep instead of a value
# read off the plot by hand, so the choice tracks the data when the notebook is re-run.
# Relies on `n_estimators` and `scores` still holding the random-forest sweep results.
best_n_estimators = n_estimators[scores.index(min(scores))]

In [None]:
# Sweep the tree depth (with the chosen forest size) and record the hold-out MAE.
max_depths = range(4, 15)
scores = []

for max_depth in max_depths:
    # 'absolute_error' is the current name of the MAE split criterion; the old 'mae'
    # alias was deprecated in scikit-learn 1.0 and removed in 1.2.
    model = RandomForestRegressor(
        n_estimators=best_n_estimators,
        max_depth=max_depth,
        criterion='absolute_error',
        random_state=1)

    model.fit(X_train, y_train)
    # mean_absolute_error expects (y_true, y_pred).
    s = mean_absolute_error(y_test, model.predict(X_test))
    scores.append(s)

In [None]:
# Hold-out MAE as a function of the maximum tree depth of the random forest.
plt.plot(max_depths, scores)
plt.xlabel('max_depth')
plt.ylabel('Hold-out MAE')
# Trailing semicolon suppresses the text repr of the last call in the cell output.
plt.title('RandomForestRegressor: MAE vs. max_depth');

In [None]:
# Take the depth with the lowest hold-out MAE from the sweep instead of a value read
# off the plot by hand, so the choice tracks the data when the notebook is re-run.
# Relies on `max_depths` and `scores` still holding the random-forest depth sweep.
best_max_depth = max_depths[scores.index(min(scores))]

In [None]:
# Final random-forest model, built from the tuned hyper-parameters rather than
# repeating them as literals, and seeded like the sweeps so its hold-out MAE is
# reproducible.
rfr_model = RandomForestRegressor(
    n_estimators=best_n_estimators,
    max_depth=best_max_depth,
    # 'mae' was deprecated in scikit-learn 1.0 and removed in 1.2.
    criterion='absolute_error',
    random_state=1)
rfr_model.fit(X_train, y_train)
# mean_absolute_error expects (y_true, y_pred); the cell displays the hold-out MAE.
mean_absolute_error(y_test, rfr_model.predict(X_test))

### XGBRegressor

In [None]:
# Sweep the number of boosting rounds and collect the hold-out MAE for each value.
n_estimators = range(100, 1000, 50)
scores = []

for n_estimator in n_estimators:
    # fit() returns the estimator itself, so construction and training can be chained.
    model = XGBRegressor(
        n_estimators=n_estimator,
        eval_metric='mae',
        random_state=1,
    ).fit(X_train, y_train)
    scores.append(mean_absolute_error(y_test, model.predict(X_test)))

In [None]:
# Hold-out MAE as a function of the number of boosting rounds.
plt.plot(n_estimators, scores)
plt.xlabel('n_estimators')
plt.ylabel('Hold-out MAE')
# Trailing semicolon suppresses the text repr of the last call in the cell output.
plt.title('XGBRegressor: MAE vs. n_estimators');

In [None]:
# Take the number of boosting rounds with the lowest hold-out MAE from the sweep
# instead of a hand-entered value, matching how the depth and eta are selected.
# Relies on `n_estimators` and `scores` still holding the XGBRegressor sweep results.
best_n_estimators = n_estimators[scores.index(min(scores))]

In [None]:
# Sweep the tree depth (with the chosen number of rounds) and collect the hold-out MAE.
max_depths = range(1, 50)
scores = []

for max_depth in max_depths:
    # fit() returns the estimator itself, so construction and training can be chained.
    model = XGBRegressor(
        n_estimators=best_n_estimators,
        max_depth=max_depth,
        eval_metric='mae',
        random_state=1,
    ).fit(X_train, y_train)
    scores.append(mean_absolute_error(y_test, model.predict(X_test)))

In [None]:
# Hold-out MAE as a function of the maximum tree depth of the boosted model.
plt.plot(max_depths, scores)
plt.xlabel('max_depth')
plt.ylabel('Hold-out MAE')
# Trailing semicolon suppresses the text repr of the last call in the cell output.
plt.title('XGBRegressor: MAE vs. max_depth');

In [None]:
# Depth whose hold-out MAE is the smallest in the sweep (first one wins on ties).
best_max_depth = max_depths[min(range(len(scores)), key=scores.__getitem__)]

In [None]:
# Sweep 200 evenly spaced eta values in [0.001, 1] (with the chosen rounds and depth)
# and collect the hold-out MAE for each one.
etas = np.linspace(0.001, 1, 200)
scores = []

for eta in etas:
    # fit() returns the estimator itself, so construction and training can be chained.
    model = XGBRegressor(
        n_estimators=best_n_estimators,
        max_depth=best_max_depth,
        eta=eta,
        eval_metric='mae',
        random_state=1,
    ).fit(X_train, y_train)
    scores.append(mean_absolute_error(y_test, model.predict(X_test)))

In [None]:
# Hold-out MAE as a function of eta.
plt.plot(etas, scores)
plt.xlabel('eta')
plt.ylabel('Hold-out MAE')
# Trailing semicolon suppresses the text repr of the last call in the cell output.
plt.title('XGBRegressor: MAE vs. eta');

In [None]:
# eta whose hold-out MAE is the smallest in the sweep (first one wins on ties);
# the bare name on the last line displays the chosen value.
best_eta = etas[min(range(len(scores)), key=scores.__getitem__)]
best_eta

In [None]:
# Final XGBoost model built from the tuned hyper-parameters.
# max_depth uses best_max_depth rather than a literal: best_eta was selected with
# max_depth=best_max_depth, so training the final model at a different, hard-coded
# depth would pair eta with a setting it was never tuned for.
xgb_model = XGBRegressor(
    n_estimators=best_n_estimators,
    max_depth=best_max_depth,
    eta=best_eta,
    random_state=1
)
xgb_model.fit(X_train, y_train)

In [None]:
# Hold-out MAE of the final XGBoost model, with arguments in (y_true, y_pred) order.
mean_absolute_error(y_test, xgb_model.predict(X_test))

In [None]:
# Predict prices for the test table with the random-forest model.
# Select exactly the columns the scaler was fitted on (X_real.columns), in the same
# order, instead of "every column except DwellingId": an extra or reordered column in
# the test file would otherwise reach the scaler and either fail or be scaled with
# another feature's statistics.
answers = rfr_model.predict(scaler.transform(df_test[X_real.columns]))

In [None]:
# Assemble the submission table: the test-set DwellingId next to its predicted Price.
ans = pd.DataFrame({
    'DwellingId': df_test['DwellingId'],
    'Price': answers,
})

In [None]:
# Write the submission file without the DataFrame index column.
ans.to_csv(path_or_buf='answer.csv', index=False)

In [None]:
# Preview the first five rows of the submission table.
ans.head(n=5)