## Load the dataset

In [None]:
import os

# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Render matplotlib figures inline; comment this out if plots do not display in your environment.
%matplotlib inline

# Kaggle input locations of the competition's train and test CSV files.
train_file_path = '/kaggle/input/house-prices-advanced-regression-techniques/train.csv'
test_file_path = '/kaggle/input/house-prices-advanced-regression-techniques/test.csv'

# train_df holds the rows used for fitting (it carries SalePrice);
# forecast_df holds the test file that is predicted at the end of the notebook.
train_df = pd.read_csv(train_file_path)
forecast_df = pd.read_csv(test_file_path)
# Preview the first five training rows.
train_df.head(5)

## House Price Distribution
Now, let's get statistical information about the numeric and non-numeric columns in our dataset.

In [None]:
# Summary statistics for the numeric columns only.
train_df.describe(include = [np.number])

In [None]:
# Missing-value count per column, largest first; display only the columns that have gaps.
isnull_features = train_df.isna().sum().sort_values(ascending=False)
isnull_features.loc[isnull_features > 0]

The highest percentage of missing data is in PoolQC. Further analysis of the training set shows that 
PoolArea and PoolQC depend on each other: PoolQC is null when PoolArea is 0.

### Handling Outlier

An outlier is an observation that lies far from the rest of the observations in a dataset. In statistics, there are three measures of central tendency: mean, median, and mode. Based on the summary statistics, LotArea and GrLivArea have high mean values.  

In [None]:
# Numeric summary with one row per column, ordered by descending mean
# so that the largest-scale features appear first.
(
    train_df
    .describe(include=[np.number])
    .T
    .sort_values(by='mean', ascending=False)
)

In [None]:
# Print the five largest values of each size-related column to spot extreme observations.
for size_col in ['TotalBsmtSF', '1stFlrSF', 'GrLivArea', 'LotArea']:
    print(train_df[size_col].sort_values(ascending=False).head(5))

Draw scatter plots to visualize the outliers, then remove them from the dataset.

In [None]:
# clear outliers
from matplotlib import pyplot as plt
import seaborn as sns

# Top row (axes 0-3): data before filtering, drawn in red.
# Bottom row (axes 4-7): the same columns after filtering, drawn in blue.
fig, axes = plt.subplots(ncols=4, nrows=2, figsize=(20,3))
axes = np.ravel(axes)
# Columns whose mean exceeds 1000, excluding the year columns
col_names=['TotalBsmtSF','1stFlrSF', 'GrLivArea','LotArea']
for i, c in enumerate(col_names):
    train_df.plot.scatter(ax=axes[i], x=c, y='SalePrice', sharey=True, colorbar=False, c='r')
print(train_df.shape)

# Keep only the rows that fall below every size threshold.
within_limits = (
    (train_df['TotalBsmtSF'] < 3000)
    & (train_df['1stFlrSF'] < 2500)
    & (train_df['GrLivArea'] < 4000)
    & (train_df['LotArea'] < 100000)
)
train_df = train_df[within_limits]
print(train_df.shape)

for i, c in enumerate(col_names, start=4):
    train_df.plot.scatter(ax=axes[i], x=c, y='SalePrice', sharey=True, colorbar=False, c='b')

### Correlation
We want to see how the dataset variables are correlated with each other and how predictor variables are correlated with the target variable. Spearman’s rank correlation is a statistical measure of the strength and direction of the monotonic relationship between two continuous or ordinal variables.

In [None]:
import pandas as pd
from scipy.stats import spearmanr
import matplotlib.pyplot as plt


# Candidate predictors: every column except the target.
anscombe_data = train_df.drop('SalePrice', axis=1)
# Restrict to numeric columns: ranking string categories alphabetically
# would produce arbitrary, uninterpretable coefficients.
subset_data = anscombe_data.select_dtypes(include='number')

score = []
for col in subset_data:
    # Drop the rows where this feature is missing. spearmanr propagates NaN by
    # default, so a single gap would otherwise turn the coefficient into NaN
    # and silently exclude the column from the table below.
    paired = train_df[[col, 'SalePrice']].dropna()
    corr, pval = spearmanr(paired[col], paired['SalePrice'])
    score.append([col, corr, pval])

spearmanr_score = pd.DataFrame(score, columns=['Field','correlation', 'p-value'])
# Keep only the strongly positively correlated fields, strongest first.
spearmanr_score = (
    spearmanr_score[spearmanr_score['correlation'] > 0.5]
    .sort_values(by='correlation', ascending=False)
    .reset_index(drop=True)
)
spearmanr_score

In [None]:
# Scatter each strongly correlated field against the sale price.
x = spearmanr_score['Field'].tolist()
g = sns.PairGrid(train_df, y_vars=["SalePrice"], x_vars=x);
g.map(plt.scatter, color="orange", edgecolors="#000000", linewidths=0.5);
# Take real copies before cleaning starts. Plain assignment would only create a
# second name for the same object, so later column additions and drops would
# also modify train_df / forecast_df.
train = train_df.copy()
test = forecast_df.copy()

### Feature Engineering

In [None]:
from datetime import datetime

# Ages are measured against the year in which the notebook is run.
current_year = datetime.now().year


def _add_engineered_features(df, reference_year):
    """Return a new frame with age and bathroom-count features replacing their source columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing YearBuilt, GarageYrBlt, BsmtFullBath, FullBath,
        BsmtHalfBath and HalfBath.
    reference_year : int
        Year from which the building and garage ages are measured.

    Returns
    -------
    pandas.DataFrame
        ``df`` plus ageofhouse, ageofgrg, nooffullbath and nohalfbath
        (appended in that order), minus the six source columns.
        The input frame is not modified.
    """
    source_cols = ['YearBuilt', 'GarageYrBlt', 'BsmtFullBath', 'FullBath',
                   'BsmtHalfBath', 'HalfBath']
    # Every feature is computed from the frame's own columns. Mixing in columns
    # from another frame would align on the index and introduce wrong or NaN values.
    engineered = df.assign(
        ageofhouse=reference_year - df['YearBuilt'],
        ageofgrg=reference_year - df['GarageYrBlt'],
        nooffullbath=df['BsmtFullBath'] + df['FullBath'],
        nohalfbath=df['BsmtHalfBath'] + df['HalfBath'],
    )
    return engineered.drop(columns=source_cols)


train = _add_engineered_features(train, current_year)
test = _add_engineered_features(test, current_year)
train.head()

### Data pre-process

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge,Lasso,ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Target is SalePrice; every remaining column is a candidate feature.
y = train["SalePrice"]
X = train.drop("SalePrice", axis=1)

# Route columns by dtype: object columns are categorical, all others numerical.
categorical_features = list(X.select_dtypes(include='object').columns)
numerical_features = list(X.select_dtypes(exclude='object').columns)

# Numerical columns: fill gaps with the median, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the literal 'missing', then one-hot encode;
# categories not seen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [None]:
# Apply each sub-pipeline to its own column group and combine the outputs.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

### Model validation before Tuning.

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.exceptions import ConvergenceWarning
import warnings

# Silence all warnings from here on. The blanket filter on the first line already
# covers ConvergenceWarning; the second call only spells that case out.
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=ConvergenceWarning)

# Split the data into training and testing sets: 20% is held out, with a fixed seed
# so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
def ML_models(model, X_train, X_test, y_train, y_test, preprocessor):
    """Fit ``model`` behind ``preprocessor`` and report hold-out and cross-validated scores.

    Parameters
    ----------
    model : estimator
        Unfitted regressor placed as the final pipeline step.
    X_train, y_train
        Data used to fit the pipeline and to run the cross-validation.
    X_test, y_test
        Hold-out data used for the R2 and RMSE figures.
    preprocessor : transformer
        First pipeline step, applied to the features before the model.

    Returns
    -------
    tuple
        ``(class name of model, hold-out R2 rounded to 3 decimals,
        hold-out RMSE, mean 5-fold cross-validation score)``. The
        cross-validation score uses the estimator's default scorer.
    """
    regr_trans = Pipeline(steps=[('preprocessor', preprocessor),
                                 ('model', model)])
    regr_trans.fit(X_train, y_train)
    yhat = regr_trans.predict(X_test)
    algoname = model.__class__.__name__

    # Cross-validate on the training data passed in, not on notebook globals:
    # the function then depends only on its arguments, and the hold-out rows
    # stay out of the cross-validation folds.
    scores = cross_val_score(regr_trans, X_train, y_train, cv=5)
    mean_cv_score = scores.mean()

    return (algoname, round(r2_score(y_test, yhat), 3), np.sqrt(mean_squared_error(y_test, yhat)),
            mean_cv_score)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
import lightgbm as lgbm
import xgboost as xg
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_log_error
from sklearn.linear_model import Ridge
from sklearn import svm
from catboost import CatBoostRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder, QuantileTransformer
#from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import ElasticNet
from sklearn.neural_network import MLPRegressor

#params = {'model__verbose': -1 }

# Candidate regressors, evaluated in this order.
algo = [
    GradientBoostingRegressor(),
    lgbm.LGBMRegressor(),
    xg.XGBRFRegressor(),
    xg.XGBRegressor(),
    linear_model.LinearRegression(),
    RandomForestRegressor(),
    DecisionTreeRegressor(),
    linear_model.Lasso(),
    Ridge(),
    svm.SVR(),
    CatBoostRegressor(verbose=False),
    linear_model.ElasticNet(alpha=1.0, l1_ratio=0.5),
]

# One result tuple per candidate, as returned by ML_models.
score = [
    ML_models(model, X_train, X_test, y_train, y_test, preprocessor)
    for model in algo
]

In [None]:
# Tabulate the results and list the candidates by cross-validation score, best first.
results_table = pd.DataFrame(score, columns=['Model', 'R2-Score', 'RMSE', 'cross_val_score'])
print(results_table.sort_values(by='cross_val_score', ascending=False))

The Best models are GradientBoostingRegressor, LGBMRegressor, CatBoostRegressor

In [None]:
def gridsearch(pipeline, parameter_space, X_train, y_train,
               cv=5, scoring='neg_root_mean_squared_error', n_jobs=None):
    """Run an exhaustive grid search and return the best parameter combination.

    Parameters
    ----------
    pipeline : estimator
        Estimator or pipeline to tune.
    parameter_space : dict
        Grid of parameter names mapped to the values to try.
    X_train, y_train
        Data on which the search is fitted.
    cv : int, default 5
        Number of cross-validation folds.
    scoring : str, default 'neg_root_mean_squared_error'
        Metric used to rank the parameter combinations.
    n_jobs : int or None, default None
        Parallelism passed through to GridSearchCV.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search.
    """
    clf = GridSearchCV(pipeline,
                       parameter_space,
                       cv=cv,
                       scoring=scoring,
                       n_jobs=n_jobs)
    clf.fit(X_train, y_train)
    return clf.best_params_

#### CatBoostRegressor

In [None]:
# Tune CatBoostRegressor behind the shared preprocessor.
model = CatBoostRegressor(verbose=False)
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', model)])


parameter_space = {'regressor__depth' : [6,8,10],
                  'regressor__learning_rate' : [0.01, 0.05, 0.1],
                  'regressor__iterations'    : [30, 50, 100]
              }

clf = GridSearchCV(estimator=pipeline, 
                   param_grid = parameter_space, 
                   cv = 5, 
                   n_jobs=-1, 
                   scoring='neg_root_mean_squared_error')
clf.fit(X, y)
print('Best parms: ', clf.best_params_)
# best_score_ is the mean negated RMSE across the CV folds for the best
# parameters; negate it to report an out-of-fold RMSE.
print('CV RMSE: %.2f' % -clf.best_score_)

# The two figures below are in-sample: the refitted best estimator predicts
# the same X it was trained on.
y_hat = clf.best_estimator_.predict(X)

print('RMSE: %.2f' % np.sqrt(mean_squared_error(y, y_hat)))
# The original passed a stray ",3" to print, which emitted an extra "3".
print('R2 Score: %.2f' % r2_score(y, y_hat))

#### GradientBoostingRegressor

In [None]:
#GradientBoostingRegressor
# Tune GradientBoostingRegressor behind the shared preprocessor.
model = GradientBoostingRegressor()
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', model)])

parameter_space = {
                    "regressor__learning_rate": [0.0001, 0.001, 0.01, 0.1, 1.0],
                    "regressor__n_estimators": [5, 10, 50],
                    "regressor__subsample": [0.5, 0.7, 1.0],
                    "regressor__max_depth": [3, 5]
                }

clf = GridSearchCV(estimator=pipeline, 
                   param_grid = parameter_space, 
                   cv = 5, 
                   n_jobs=-1, 
                   scoring='neg_root_mean_squared_error')
clf.fit(X, y)
print('Best parms: ', clf.best_params_)
# best_score_ is the mean negated RMSE across the CV folds for the best
# parameters; negate it to report an out-of-fold RMSE.
print('CV RMSE: %.2f' % -clf.best_score_)

# The two figures below are in-sample: the refitted best estimator predicts
# the same X it was trained on.
y_hat = clf.best_estimator_.predict(X)

print('RMSE: %.2f' % np.sqrt(mean_squared_error(y, y_hat)))
# The original passed a stray ",3" to print, which emitted an extra "3".
print('R2 Score: %.2f' % r2_score(y, y_hat))

#### LGBRegressor

In [None]:
#LGBRegressor 
# Tune LGBMRegressor behind the shared preprocessor.
model = lgbm.LGBMRegressor(verbosity=-1)
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', model)])

parameter_space = {
                    'regressor__num_leaves': [7, 14, 21, 28, 31, 50],
                    'regressor__learning_rate': [0.1, 0.03, 0.003],
                    'regressor__max_depth': [-1, 3, 5],
                    'regressor__n_estimators': [50, 100, 200, 500],
                }

clf = GridSearchCV(estimator=pipeline, 
                   param_grid = parameter_space, 
                   cv = 5, 
                   n_jobs=-1, 
                   scoring='neg_root_mean_squared_error')
clf.fit(X, y)
print('Best parms: ', clf.best_params_)
# best_score_ is the mean negated RMSE across the CV folds for the best
# parameters; negate it to report an out-of-fold RMSE.
print('CV RMSE: %.2f' % -clf.best_score_)

# The two figures below are in-sample: the refitted best estimator predicts
# the same X it was trained on.
y_hat = clf.best_estimator_.predict(X)

print('RMSE: %.2f' % np.sqrt(mean_squared_error(y, y_hat)))
# The original passed a stray ",3" to print, which emitted an extra "3".
print('R2 Score: %.2f' % r2_score(y, y_hat))


#### RidgeRegressor

In [None]:
#Ridge
# Tune Ridge behind the shared preprocessor.
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', Ridge(random_state=3))])
# NOTE(review): if the preprocessor emits a sparse matrix, some of these solvers
# (e.g. 'svd') may reject it; with warnings silenced, those fits would score NaN
# without any visible message. Verify against the installed scikit-learn.
parameter_space = {
    "regressor__alpha": [1, 10, 100],
    "regressor__fit_intercept": [True, False],
    "regressor__solver": ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
}

clf = GridSearchCV(pipeline, parameter_space, cv=5, scoring='neg_root_mean_squared_error')
clf.fit(X, y)

# The RMSE and R2 figures below are in-sample: the refitted best estimator
# predicts the same X it was trained on.
y_hat = clf.best_estimator_.predict(X)

print('Best parms: ', clf.best_params_)
# best_score_ is the mean negated RMSE across the CV folds for the best
# parameters; negate it to report an out-of-fold RMSE.
print('CV RMSE: %.2f' % -clf.best_score_)
print('RMSE: %.2f' % np.sqrt(mean_squared_error(y, y_hat)))
# The original passed a stray ",3" to print, which emitted an extra "3".
print('R2 Score: %.2f' % r2_score(y, y_hat))

#### LassoRegressor

In [None]:
#Lasso
# Tune Lasso behind the shared preprocessor.
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', linear_model.Lasso())])

parameter_space = {
                    "regressor__alpha": [0.1, 1, 10, 100],
                    "regressor__fit_intercept": [True, False],
                    "regressor__precompute": [True, False],
                    "regressor__copy_X": [True, False],
                    "regressor__selection": ['cyclic']
                   }


clf = GridSearchCV(pipeline, parameter_space, cv=5, scoring='neg_root_mean_squared_error')
clf.fit(X, y)
# The RMSE and R2 figures below are in-sample: the refitted best estimator
# predicts the same X it was trained on.
y_hat = clf.best_estimator_.predict(X)

print('Best Parm: ', clf.best_params_)
# best_score_ is the mean negated RMSE across the CV folds for the best
# parameters; negate it to report an out-of-fold RMSE.
print('CV RMSE: %.2f' % -clf.best_score_)
print('RMSE: %.2f' % np.sqrt(mean_squared_error(y, y_hat)))
# The original passed a stray ",3" to print, which emitted an extra "3".
print('R2 Score: %.2f' % r2_score(y, y_hat))

#### RandomForestRegressor

In [None]:
#Randomeforest

# Create a pipeline with preprocessing and RandomForestRegressor model
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', RandomForestRegressor(random_state=42))])

# Define parameter grid for GridSearchCV
parameter_space = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__max_depth': [None, 10, 20, 30]
}

clf = GridSearchCV(pipeline, parameter_space, cv=5, scoring='neg_root_mean_squared_error')
clf.fit(X, y)

# Get the best model from grid search; it is reused for the final predictions.
best_model = clf.best_estimator_

# The RMSE and R2 figures below are in-sample: best_model predicts the same X
# it was trained on.
y_hat = best_model.predict(X)
print('Best Parameter:', clf.best_params_)
# best_score_ is the mean negated RMSE across the CV folds for the best
# parameters; negate it to report an out-of-fold RMSE.
print('CV RMSE: %.2f' % -clf.best_score_)
print('RMSE: %.2f' % np.sqrt(mean_squared_error(y, y_hat)))
# The original passed a stray ",3" to print, which emitted an extra "3".
print('R2 Score: %.2f' % r2_score(y, y_hat))


Based on the RMSE and R2 score computed above on the training data, the best model is RandomForestRegressor.

#### Model Prediction on test data & Submission

In [None]:
# Predict sale prices for the test set with the model selected above.
y_test_pred = best_model.predict(test)

# Pair every test Id with its predicted price.
predictions_df = test[['Id']].assign(SalePrice=y_test_pred)

# Show the predictions and write the submission file without the index column.
print("Test dataset with predicted sale prices:")
print(predictions_df)
predictions_df.to_csv('submission.csv', index=False)