# XGBoost Regressor

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

# Load the dataset
# Both CSVs live under the same repository folder, so keep the shared prefix in
# one place instead of repeating the full URL.
DATA_BASE_URL = (
    'https://raw.githubusercontent.com/Pankaj-Str/'
    'codeswithpankaj.com-Machine-Learning/main/Dataset/'
    'house-prices-advanced-regression-techniques/'
)
train_data = pd.read_csv(DATA_BASE_URL + 'train.csv')
test_data = pd.read_csv(DATA_BASE_URL + 'test.csv')

# Exploratory Data Analysis (EDA)
print(train_data.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
train_data.info()
print(train_data.describe())

# Data Preprocessing
# Separate target variable and features
X = train_data.drop(columns=['SalePrice'])
y = train_data['SalePrice']

# Identify categorical and numerical columns.
# 'Id' is a row identifier rather than a property of a house, so it is left out
# of both feature lists. The ColumnTransformer below keeps its default
# remainder='drop', so any column not listed here never reaches the model,
# while X and test_data keep the same column layout for predict().
id_columns = ['Id']
# Select by dtype family instead of exact dtype names so numeric columns of any
# width, and non-numeric columns of any dtype, are routed to a pipeline rather
# than being silently dropped.
num_features = X.select_dtypes(include='number').columns.drop(id_columns, errors='ignore')
cat_features = X.select_dtypes(exclude='number').columns.drop(id_columns, errors='ignore')

# Preprocessing pipelines for numerical and categorical data
# Numerical: fill missing values with the column median, then standardise.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical: fill missing values with the most frequent category, then
# one-hot encode. handle_unknown='ignore' keeps transform() from failing on
# categories that were not seen during fit.
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Apply each pipeline to its own column subset (columns are selected by name).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_features),
        ('cat', cat_pipeline, cat_features)
    ])

# Create the preprocessing and modeling pipeline
# Keeping preprocessing inside the pipeline means it is re-fitted on each
# training fold during cross-validation instead of on the full dataset.
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(random_state=42))
])

# Hold out 20% of the labelled rows for validation (seeded for repeatability)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Candidate values for the pipeline's 'regressor' step
# (the 'regressor__' prefix routes each setting to the XGBRegressor)
param_grid = dict(
    regressor__n_estimators=[100, 200, 300],
    regressor__learning_rate=[0.01, 0.1, 0.2],
    regressor__max_depth=[3, 4, 5],
    regressor__min_child_weight=[1, 2, 3],
    regressor__gamma=[0, 0.1, 0.2],
    regressor__subsample=[0.8, 0.9, 1.0],
    regressor__colsample_bytree=[0.8, 0.9, 1.0],
)

# Exhaustive 5-fold search over the grid, scored by negative MSE and run on
# all available cores; fit() returns the fitted search object itself
grid_search = GridSearchCV(
    estimator=model_pipeline,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
).fit(X_train, y_train)

# Winning pipeline and the settings that produced it
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print(f"Best parameters: {best_params}")

# Score the winning pipeline on the held-out validation rows
y_val_pred = best_model.predict(X_val)
mse = mean_squared_error(y_true=y_val, y_pred=y_val_pred)
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error: {rmse}")

# Predict prices for the unlabelled test set
test_pred = best_model.predict(test_data)

# Write the submission file: one row per test Id with its predicted SalePrice
submission = test_data[['Id']].assign(SalePrice=test_pred)
submission.to_csv('submission.csv', index=False)


   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType  SaleCondition  SalePrice  
0   2008        WD   



### Explanation

1. **Data Loading and EDA**:
    - Load the training and test datasets.
    - Perform basic EDA to understand the data structure.

2. **Data Preprocessing**:
    - Separate the target variable (`SalePrice`) from the features.
    - Identify numerical and categorical features.
    - Create preprocessing pipelines for numerical (imputation, scaling) and categorical (imputation, one-hot encoding) data.

3. **Pipeline Construction**:
    - Combine preprocessing steps and the XGBoost Regressor model into a single pipeline.

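4. **Model Training and Hyperparameter Tuning**:
    - Split the data into training (80%) and validation (20%) sets.
    - Use `GridSearchCV` with 5-fold cross-validation to find the best hyperparameters for the XGBoost Regressor model. The grid has 3^7 = 2,187 combinations, so the search can take a long time.
    - `GridSearchCV` then retrains the best configuration on the whole training set.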

5. **Model Evaluation**:
    - Predict on the validation set and report the root mean squared error (RMSE).
    - Make predictions on the test set and save them to `submission.csv`.

