In [None]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from xgboost import XGBRegressor

In [None]:
# 1. Load the dataset
# Both CSVs are read from the working directory; the test Ids are kept aside
# so they can be written back out next to the predictions later.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))
test_ids = test_df['Id']

In [None]:
# 2. Basic Feature Engineering
def add_custom_features(df):
    """Add engineered columns to ``df`` in place and return the same frame.

    - ``TotalSF`` is the sum of ``TotalBsmtSF``, ``1stFlrSF`` and ``2ndFlrSF``,
      with a missing value in any of the three counted as 0.
    - ``MSSubClass`` and ``MoSold`` are cast to ``str`` so they stop being
      numeric columns.
    """
    basement, first_floor, second_floor = (
        df[area_col].fillna(0) for area_col in ('TotalBsmtSF', '1stFlrSF', '2ndFlrSF')
    )
    df['TotalSF'] = basement + first_floor + second_floor

    # Numeric codes that are really labels.
    for coded_col in ('MSSubClass', 'MoSold'):
        df[coded_col] = df[coded_col].astype(str)
    return df


In [None]:
# 3. Handle Outliers (Winsorization on Training Data)
# Cap every numeric predictor at the Tukey fences
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]. 'Id' and the target 'SalePrice' are
# excluded, so only feature columns are touched.
# NOTE(review): only train_df is capped here; test_df keeps its raw values.
IQR_MULTIPLIER = 1.5
numerical_cols = train_df.select_dtypes(include='number').columns.drop(['Id', 'SalePrice'])
for col in numerical_cols:
    Q1 = train_df[col].quantile(0.25)
    Q3 = train_df[col].quantile(0.75)
    IQR = Q3 - Q1
    if IQR == 0:
        # Q1 == Q3 makes both fences collapse onto a single value, so capping
        # would overwrite the whole column with that constant and erase the
        # feature. Leave such columns unchanged.
        continue
    lower_bound = Q1 - IQR_MULTIPLIER * IQR
    upper_bound = Q3 + IQR_MULTIPLIER * IQR

    # clip() caps both tails in one pass; missing values stay missing.
    train_df[col] = train_df[col].clip(lower=lower_bound, upper=upper_bound)


In [None]:
# 4. Prepare X and y
# The target is SalePrice; Id is an identifier and is dropped from both
# feature frames.
# NOTE(review): add_custom_features() is defined earlier but is not called in
# any cell shown here, so TotalSF and the str casts never reach X or
# X_test_final. Confirm whether that is intentional.
y = train_df['SalePrice']
X = train_df.drop(['Id', 'SalePrice'], axis=1)
X_test_final = test_df.drop('Id', axis=1)

In [None]:
# 5. Preprocessing Pipelines
# Columns are routed by dtype: int64/float64 go down the numeric branch,
# object columns down the categorical branch.
num_features = X.select_dtypes(include=['int64', 'float64']).columns
cat_features = X.select_dtypes(include=['object']).columns

# Numeric branch: fill gaps with the column median, then apply MinMaxScaler.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler()),
]
numerical_transformer = Pipeline(numeric_steps)

# Categorical branch: missing entries become the literal category 'NA',
# then the column is one-hot encoded into a dense array.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='NA')),
    ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)),
]
categorical_transformer = Pipeline(categorical_steps)

# Glue the two branches together, each applied to its own column set.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, num_features),
    ('cat', categorical_transformer, cat_features),
])

In [None]:
# 6. Split Data for local validation
# 20% of the rows are held out; the fixed seed keeps the split repeatable.
holdout_split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = holdout_split


In [None]:
# 7. Define & Fit Pipeline
# Hyperparameters for the gradient-boosted regressor, kept in one place.
xgb_settings = {
    'n_estimators': 1000,
    'learning_rate': 0.05,
    'max_depth': 6,
    'random_state': 42,
    'n_jobs': -1,
}

# Preprocessing and the regressor are chained so both are fit together.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(**xgb_settings)),
])

model.fit(X_train, y_train)

In [None]:
# 8. Local Evaluation
# Score the current `model` on the held-out rows (X_val / y_val).
# NOTE(review): the recorded R2 of 99.99% is implausibly high for a holdout
# set. Presumably this cell was last run after `model` had been fit on data
# that includes X_val; re-run from a fresh kernel to confirm.
val_predictions = model.predict(X_val)
r2 = r2_score(y_val, val_predictions)
rmse = np.sqrt(mean_squared_error(y_val, val_predictions))

report_lines = [
    "--- Model Accuracy rate---",
    f"R2 Score (Accuracy equivalent): {r2*100:.2f}%",
    f"RMSE (Average error in price): ${rmse:.2f}",
]
print("\n".join(report_lines))

--- Model Accuracy rate---
R2 Score (Accuracy equivalent): 99.99%
RMSE (Average error in price): $702.26


In [None]:
# 9. Final Training and Test Prediction
# Refit on the entire training set for the submission, using a fresh unfitted
# copy of the pipeline. Calling model.fit(X, y) in place would leave `model`
# trained on X_val as well, so re-running the evaluation cell afterwards
# would report leaked, near-perfect scores. clone() copies the
# hyperparameters only, so `model` stays the estimator fit on X_train.
final_model = clone(model)
final_model.fit(X, y)
test_predictions = final_model.predict(X_test_final)



In [None]:
# 10. Prepare Submission
# Two columns, in this order: the test-set Id and the predicted SalePrice.
submission_columns = {'Id': test_ids, 'SalePrice': test_predictions}
submission = pd.DataFrame(submission_columns)

# index=False keeps the DataFrame index out of the CSV.
submission.to_csv('submission.csv', index=False)
print("Submission file saved successfully.")

Submission file saved successfully.
