In [None]:
import pandas as pd
import warnings
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from xgboost import XGBRegressor

# Silence all library warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Load the raw CSV; `data` keeps the frame exactly as read from disk.
data = pd.read_csv("/kaggle/input/california-house-price-prediction/california_housing_test_1.csv")

# Work on a separate frame without the extra NaN column 'Unnamed: 9'.
df = data.drop(columns=['Unnamed: 9'])

# Impute missing categories with the most frequent value of the column.
most_common_proximity = df['ocean_proximity'].mode()[0]
df['ocean_proximity'] = df['ocean_proximity'].fillna(most_common_proximity)

# One-hot encode the categorical column with every level kept, then remove
# the 'NEAR BAY' indicator so the remaining dummies are measured against it.
df = pd.get_dummies(
    df,
    columns=['ocean_proximity'],
    drop_first=False,
).drop(columns=['ocean_proximity_NEAR BAY'])

# Target is the median house value; every other column is a feature.
y = df['median_house_value']
X = df.drop(columns=['median_house_value'])

# Split BEFORE scaling so the held-out rows never influence the scaler's
# statistics; fitting the scaler on all of X would leak test-set mean and
# variance into the training features. random_state is fixed, so the
# 70/30 partition is reproducible.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Scaling: learn mean/std from the training rows only, then apply that same
# transform to both partitions.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Scaled version of the full feature matrix (using the train-fitted
# statistics), kept under its original name in case later cells refer to it.
X_scaled = scaler.transform(X)

# Gradient-boosted regressor: 500 trees of depth 6 at learning rate 0.05,
# with 80% row and column subsampling per tree; seeded for repeatability.
xgb = XGBRegressor(
    n_estimators=500,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

xgb.fit(X_train, y_train)

# Evaluate on the held-out 30% and report R^2 rounded to two decimals.
y_pred = xgb.predict(X_test)

print('XGB Score: ',round(r2_score(y_test, y_pred),2))