<a href="https://colab.research.google.com/github/Vivek5920003/AIML-PROJECTS/blob/main/Kaggle_house_price_predictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error
from math import sqrt
import xgboost as xgb


In [2]:
# Load train and test data
train_data = pd.read_csv('/content/train.csv')  # Training data with target 'SalePrice'
test_data = pd.read_csv('/content/test.csv')    # Test data without 'SalePrice'

In [3]:
# Assuming 'SalePrice' is the target variable
X_train = train_data.drop('SalePrice', axis=1)  # Features in train data
y_train = train_data['SalePrice']                # Target variable in train data

In [4]:
# Preprocessing
X_train.fillna(0, inplace=True)
test_data.fillna(0, inplace=True)

In [5]:
# Convert all column names to strings
X_train.columns = X_train.columns.astype(str)
test_data.columns = test_data.columns.astype(str)

In [6]:
# Identify categorical columns
categorical_cols = X_train.select_dtypes(include=['object']).columns

In [7]:
# Convert all columns in X_train[categorical_cols] to strings
X_train[categorical_cols] = X_train[categorical_cols].astype(str)
test_data[categorical_cols] = test_data[categorical_cols].astype(str)

In [8]:
# One-hot encode categorical features
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_train_encoded = encoder.fit_transform(X_train[categorical_cols])
X_test_encoded = encoder.transform(test_data[categorical_cols])

In [9]:
# Convert encoded arrays to DataFrames
X_train_encoded_df = pd.DataFrame(X_train_encoded, columns=encoder.get_feature_names_out(categorical_cols))
X_test_encoded_df = pd.DataFrame(X_test_encoded, columns=encoder.get_feature_names_out(categorical_cols))

# Drop original categorical columns and concatenate encoded columns
X_train = X_train.drop(categorical_cols, axis=1)
X_test = test_data.drop(categorical_cols, axis=1)

In [10]:
# Feature scaling (optional)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# XGBoost Regressor model with best hyperparameters
print("XGBoost Regressor:")
xgb_model = xgb.XGBRegressor(
    n_estimators=300,
    learning_rate=0.1,
    max_depth=3,
    min_child_weight=1,
    subsample=0.8,
    colsample_bytree=1.0,
    random_state=42
)
xgb_model.fit(X_train_scaled, y_train)

XGBoost Regressor:


In [12]:
# Evaluate the model
y_pred_train = xgb_model.predict(X_train_scaled)
train_rmse = sqrt(mean_squared_error(y_train, y_pred_train))
print(f"Train RMSE: {train_rmse}")

# Predict for test data
xgb_pred = xgb_model.predict(X_test_scaled)

# Add 'id' to the predictions
test_data_with_id = test_data[['Id']].copy()
test_data_with_id['SalePrice'] = pd.Series(xgb_pred)  # Example using XGBoost predictions

# Save predictions to CSV with id
def save_predictions_with_id(predictions, test_data, filename):
    result_df = test_data[['Id']].copy()
    result_df['SalePrice'] = predictions
    result_df.to_csv(filename, index=False)

save_predictions_with_id(xgb_pred, test_data, 'xgb_predictions.csv')


Train RMSE: 10783.98529064083
