# Importing necessary libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_decision_forests as tfdf
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Loading the dataset...

In [None]:
train_data = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/train.csv")
test_data = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")

# EDA (Exploratory Data Analysis)

In [None]:
from IPython.display import display, HTML

# Report the dimensions of both splits.
print("Train data shape:", train_data.shape)
print("Test data shape:", test_data.shape)

# Render the first rows of each split as an HTML table.
train_table_html = train_data.head().to_html()
test_table_html = test_data.head().to_html()

# Show each table under its own heading (the tables are stacked, one after the other).
for heading_html, table_html in (
    ("<h3>Train Data</h3>", train_table_html),
    ("<h3>Test Data</h3>", test_table_html),
):
    display(HTML(heading_html))
    display(HTML(table_html))


In [None]:
train_data.dtypes

In [None]:
test_data.dtypes

# Feature Engineering
This reference helped me a lot: https://www.kaggle.com/pmarcelino/comprehensive-data-exploration-with-python.

In [None]:
#descriptive statistics summary
train_data['SalePrice'].describe()

In [None]:
# Histogram of SalePrice with a KDE overlay.
# sns.distplot is deprecated (and removed in recent seaborn releases);
# histplot with kde=True and stat="density" draws the equivalent figure.
sns.histplot(train_data['SalePrice'], kde=True, stat="density");

The histogram shows that `SalePrice` values:

* Deviate from the normal distribution.
* Have appreciable positive skewness.
* Show peakedness.

In [None]:
# Shape statistics of the SalePrice distribution.
sale_price = train_data['SalePrice']
print(f"Skewness: {sale_price.skew():f}")
print(f"Kurtosis: {sale_price.kurt():f}")

#### Relationship with numerical variables

In [None]:
# Scatter plot: GrLivArea (x) against SalePrice (y).
var = 'GrLivArea'
data = train_data[['SalePrice', var]]
data.plot.scatter(x=var, y='SalePrice', ylim=(0, 800000));

It seems that `SalePrice` and `GrLivArea` have a linear relationship

In [None]:
# Scatter plot: TotalBsmtSF (x) against SalePrice (y).
var = 'TotalBsmtSF'
data = train_data[['SalePrice', var]]
data.plot.scatter(x=var, y='SalePrice', ylim=(0, 800000));

#### Relationship with categorical features

In [None]:
# Box plot of SalePrice for each OverallQual level.
var = 'OverallQual'
data = train_data[['SalePrice', var]]
f, ax = plt.subplots(figsize=(8, 6))
# sns.boxplot returns the Axes it drew on; it is kept under the name `fig`.
fig = sns.boxplot(data=data, x=var, y="SalePrice", ax=ax)
fig.axis(ymin=0, ymax=800000);

In [None]:
# Box plot of SalePrice for each YearBuilt value (wide figure, vertical tick labels).
var = 'YearBuilt'
data = train_data[['SalePrice', var]]
f, ax = plt.subplots(figsize=(16, 8))
# sns.boxplot returns the Axes it drew on; it is kept under the name `fig`.
fig = sns.boxplot(data=data, x=var, y="SalePrice", ax=ax)
fig.axis(ymin=0, ymax=800000);
plt.xticks(rotation=90);

Note: we don't know if `SalePrice` is in constant prices. Constant prices try to remove the effect of inflation. If `SalePrice` is not in constant prices, it should be, so that prices are comparable over the years.

In summary, we can conclude that:

`GrLivArea` and `TotalBsmtSF` seem to be linearly related with `SalePrice`. Both relationships are positive, which means that as one variable increases, the other also increases. In the case of `TotalBsmtSF`, we can see that the slope of the linear relationship is particularly high.
`OverallQual` and `YearBuilt` also seem to be related with `SalePrice`. The relationship seems to be stronger in the case of `OverallQual`, where the box plot shows how sale prices increase with the overall quality.
We just analysed four variables, but there are many others that we should analyse. The trick here seems to be the choice of the right features (feature selection) and not the definition of complex relationships between them (feature engineering).

* `SalePrice` correlation matrix (zoomed heatmap style).
 
* Scatter plots between the most correlated variables.

#### 'SalePrice' correlation matrix (zoomed heatmap style)

In [None]:
import pandas as pd
import plotly.graph_objects as go

# Fail fast if the target column is absent.
if 'SalePrice' not in train_data.columns:
    raise ValueError("'SalePrice' column is missing in the dataset")

# Split the columns by dtype, one-hot encode the object columns, and rejoin them.
numerical_cols = train_data.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = train_data.select_dtypes(include=['object']).columns
encoded_categorical_cols = pd.get_dummies(train_data[categorical_cols])
encoded_data = pd.concat([train_data[numerical_cols], encoded_categorical_cols], axis=1)

# Absolute pairwise correlations over every encoded column.
corr_matrix = encoded_data.corr().abs()

# Keep the 21 columns with the highest absolute correlation to the target.
# SalePrice correlates perfectly with itself, so this is the target plus 20 features.
target = 'SalePrice'
top_corr_features = (
    corr_matrix[target]
    .sort_values(ascending=False)
    .head(21)
    .index
    .tolist()
)

# Restrict the matrix to those columns on both axes.
filtered_corr_matrix = corr_matrix.loc[top_corr_features, top_corr_features]

# Heatmap of the filtered correlations.
heatmap_trace = go.Heatmap(
    z=filtered_corr_matrix.values,
    x=top_corr_features,
    y=top_corr_features,
    colorscale='RdYlGn',
    colorbar=dict(title='Correlation'),
    hoverongaps=False,
)
fig = go.Figure(data=heatmap_trace)

fig.update_layout(
    title='Top 20 Most Correlated Features with SalePrice Heatmap',
    title_x=0.5,
    xaxis=dict(tickangle=45, automargin=True),
    yaxis=dict(tickangle=0, automargin=True),
    width=800,
    height=800,
    font=dict(size=12),
)

# One hover label per cell: the correlation value and the pair of features it belongs to.
corr_values = filtered_corr_matrix.values
hover_text = [
    [
        f'Correlation: {corr_values[i][j]:.2f}<br>{row_name} vs {col_name}'
        for j, col_name in enumerate(top_corr_features)
    ]
    for i, row_name in enumerate(top_corr_features)
]

fig.update_traces(hoverinfo='text', text=hover_text, hovertemplate='%{text}')

fig.show()

#### Scatter plots between 'SalePrice' and correlated variables

In [None]:
# Pairwise scatter plots of SalePrice and a hand-picked set of correlated features.
sns.set()
cols = ['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt']
# `height` replaces the old `size` keyword, which current seaborn no longer accepts.
sns.pairplot(train_data[cols], height=2.5)
plt.show();

# Missing data

In [None]:
# Per-column missing-value counts and fractions, largest first.
is_missing = train_data.isnull()
total = is_missing.sum().sort_values(ascending=False)
percent = (is_missing.sum() / is_missing.count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)

Let's analyse this to understand how to handle the missing data.

We'll consider that when more than 15% of the data is missing, we should delete the corresponding variable and pretend it never existed. This means that we will not try any trick to fill the missing data in these cases. According to this, there is a set of variables (e.g. 'PoolQC', 'MiscFeature', 'Alley', etc.) that we should delete. The point is: will we miss this data? I don't think so. None of these variables seem to be very important, since most of them are not aspects we think about when buying a house (maybe that's the reason why the data is missing?). Moreover, looking closer at the variables, we could say that variables like 'PoolQC', 'MiscFeature' and 'FireplaceQu' are strong candidates for outliers, so we'll be happy to delete them.


Regarding 'MasVnrArea' and 'MasVnrType', we can consider that these variables are not essential. Furthermore, they have a strong correlation with 'YearBuilt' and 'OverallQual' which are already considered. Thus, we will not lose information if we delete 'MasVnrArea' and 'MasVnrType'.

Finally, we have one missing observation in 'Electrical'. Since it is just one observation, we'll delete this observation and keep the variable.

In summary, to handle missing data, we'll delete all the variables with missing data, except the variable 'Electrical'. In 'Electrical' we'll just delete the observation with missing data.

In [None]:
# Missing-value count for every column.
missing_data = train_data.isnull().sum()

# Every column with at least one missing value is dropped, except 'Electrical'.
cols_to_drop = missing_data.loc[missing_data.gt(0)].index.difference(['Electrical'])

# Drop those columns, then drop the rows in which 'Electrical' itself is missing.
train_data = train_data.drop(columns=cols_to_drop).dropna(subset=['Electrical'])

# Sanity check: the largest per-column missing count should now be 0.
train_data.isnull().sum().max()

## Outliers

Outliers can markedly affect our models and can be a valuable source of information, providing us insights about specific behaviours.

Outliers is a complex subject and it deserves more attention. Here, we'll just do a quick analysis through the standard deviation of `SalePrice` and a set of scatter plots.

### Univariate analysis

The primary concern here is to establish a threshold that defines an observation as an outlier. To do so, we'll standardize the data. In this context, data standardization means converting data values to have mean of 0 and a standard deviation of 1.

In [None]:
import numpy as np
from sklearn.preprocessing import StandardScaler

# Standardise SalePrice; the scaler expects a 2-D column vector.
saleprice_scaled = StandardScaler().fit_transform(
    train_data['SalePrice'].to_numpy().reshape(-1, 1)
)

# Sort once, then take the ten smallest and the ten largest standardised values.
sorted_scaled = saleprice_scaled[saleprice_scaled[:, 0].argsort()]
low_range = sorted_scaled[:10]
high_range = sorted_scaled[-10:]

print('Outer range (low) of the distribution:')
print(low_range)
print('\nOuter range (high) of the distribution:')
print(high_range)

* Low range values are similar and not too far from 0.
* High range values are far from 0 and the 7.something values are really out of range.

For now, we'll not consider any of these values as an outlier but we should be careful with those values.

### Bivariate analysis

In [None]:
# Bivariate view: GrLivArea (x) against SalePrice (y).
var = 'GrLivArea'
data = train_data[['SalePrice', var]]
data.plot.scatter(x=var, y='SalePrice', ylim=(0, 800000));

What has been revealed:

* The two values with bigger 'GrLivArea' seem strange and they are not following the crowd. We can speculate why this is happening. Maybe they refer to agricultural area and that could explain the low price. I'm not sure about this but I'm quite confident that these two points are not representative of the typical case. Therefore, we'll define them as outliers and delete them.

* The two observations in the top of the plot are those 7.something observations that we said we should be careful about. They look like two special cases, however they seem to be following the trend. For that reason, we will keep them.

In [None]:
# Remove the two sales singled out above as GrLivArea outliers (Id 1299 and Id 524).
outlier_ids = [1299, 524]
outlier_rows = train_data.index[train_data['Id'].isin(outlier_ids)]
train_data = train_data.drop(index=outlier_rows)

In [None]:
# Bivariate view: TotalBsmtSF (x) against SalePrice (y).
var = 'TotalBsmtSF'
data = train_data[['SalePrice', var]]
data.plot.scatter(x=var, y='SalePrice', ylim=(0, 800000));

In [None]:
# Build a filtered view with GrLivArea <= 3000; train_data itself is not modified here.
var = 'GrLivArea'
filtered_data = train_data.loc[train_data[var] <= 3000]
data = filtered_data[['SalePrice', var]]


## Normality
> When we talk about normality what we mean is that the data should look like a normal distribution. This is important because several statistic tests rely on this.
> We'll just check univariate normality for 'SalePrice' (which is a limited approach). Remember that univariate normality doesn't ensure multivariate normality (which is what we would like to have), but it helps. Another detail to take into account is that in big samples (>200 observations) normality is not such an issue. However, if we solve normality, we avoid a lot of other problems (e.g. heteroscedasticity), so that's the main reason why we are doing this analysis.

The point here is to test `SalePrice` in a very lean way. We'll do this paying attention to:

* Histogram - Kurtosis and skewness.

* Normal probability plot - Data distribution should closely follow the diagonal that represents the normal distribution.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import norm, probplot

# Density histogram of SalePrice with a KDE overlay.
sale_price = train_data['SalePrice']
sns.histplot(sale_price, kde=True, stat="density", linewidth=0, element='step', bins=30, color='blue')

# Normal density with the sample mean and standard deviation, evaluated at the sorted prices.
sorted_prices = sorted(sale_price)
normal_density = norm.pdf(sorted_prices, loc=sale_price.mean(), scale=sale_price.std())
sns.lineplot(x=sorted_prices, y=normal_density, color='red')

# Normal probability plot in a separate figure.
fig = plt.figure()
res = probplot(sale_price, plot=plt)
plt.show()

* `SalePrice` is not normal. It shows 'peakedness', positive skewness and does not follow the diagonal line.
*  A simple data transformation can solve the problem. In case of positive skewness, log transformations usually work well.

In [None]:
#applying log transformation
# NOTE: this overwrites SalePrice in place, so running this cell a second time
# would take the log of already-logged values.
train_data['SalePrice'] = np.log(train_data['SalePrice'])

In [None]:
# Histogram of the log-transformed SalePrice with a fitted normal curve,
# followed by a normal probability plot in a separate figure.
sns.histplot(train_data['SalePrice'], kde=True, stat="density", bins=30, color='blue')
mean, std = norm.fit(train_data['SalePrice'])
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
p = norm.pdf(x, mean, std)
# The colour is given once, via the keyword. Passing the 'k' format string as
# well makes matplotlib warn that the colour is redundantly defined.
plt.plot(x, p, linewidth=2, color='red')

fig = plt.figure()
res = probplot(train_data['SalePrice'], plot=plt)
plt.show()


* Done! Let's check what's going on with `GrLivArea`.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import norm, probplot
import numpy as np

# Histogram of GrLivArea with a fitted normal curve,
# followed by a normal probability plot in a separate figure.
sns.histplot(train_data['GrLivArea'], kde=True, stat="density", bins=30, color='blue')
mean, std = norm.fit(train_data['GrLivArea'])
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
p = norm.pdf(x, mean, std)
# The colour is given once, via the keyword. Passing the 'k' format string as
# well makes matplotlib warn that the colour is redundantly defined.
plt.plot(x, p, linewidth=2, color='red')

fig = plt.figure()
res = probplot(train_data['GrLivArea'], plot=plt)
plt.show()

* Skewness detected!

In [None]:
#data transformation
# NOTE: this overwrites GrLivArea in place, so running this cell a second time
# would take the log of already-logged values.
train_data['GrLivArea'] = np.log(train_data['GrLivArea'])

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import norm, probplot
import numpy as np

# Histogram of the log-transformed GrLivArea with a fitted normal curve,
# followed by a normal probability plot in a separate figure.
sns.histplot(train_data['GrLivArea'], kde=True, stat="density", bins=30, color='blue')
mean, std = norm.fit(train_data['GrLivArea'])
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
p = norm.pdf(x, mean, std)
# The colour is given once, via the keyword. Passing the 'k' format string as
# well makes matplotlib warn that the colour is redundantly defined.
plt.plot(x, p, linewidth=2, color='red')

fig = plt.figure()
res = probplot(train_data['GrLivArea'], plot=plt)
plt.show()

Now it's time to check `TotalBsmtSF`

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import norm, probplot
import numpy as np

# Histogram of TotalBsmtSF with a fitted normal curve,
# followed by a normal probability plot in a separate figure.
sns.histplot(train_data['TotalBsmtSF'], kde=True, stat="density", bins=30, color='blue')
mean, std = norm.fit(train_data['TotalBsmtSF'])
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
p = norm.pdf(x, mean, std)
# The colour is given once, via the keyword. Passing the 'k' format string as
# well makes matplotlib warn that the colour is redundantly defined.
plt.plot(x, p, linewidth=2, color='red')

fig = plt.figure()
res = probplot(train_data['TotalBsmtSF'], plot=plt)
plt.show()

* Skewness is present
* A significant number of observations with value zero (houses without basement).
* A big problem because the value zero doesn't allow us to do log transformations.

To apply a log transformation here, we'll create a variable that can get the effect of having or not having basement (binary variable). Then, we'll do a log transformation to all the non-zero observations, ignoring those with value zero. This way we can transform data, without losing the effect of having or not basement.

In [None]:
# Binary basement indicator: 1 when TotalBsmtSF > 0, otherwise 0.
# One column is enough because the feature is binary. A single vectorised
# comparison builds it directly, with no placeholder column to overwrite.
train_data['HasBsmt'] = (train_data['TotalBsmtSF'] > 0).astype('int64')

In [None]:
# Log-transform TotalBsmtSF for houses that have a basement; zero areas stay 0.
has_bsmt = train_data['HasBsmt'] == 1
# Cast to float first so the log values are not written into an integer column.
train_data['TotalBsmtSF'] = train_data['TotalBsmtSF'].astype(float)
# Take the log of the selected rows only: this avoids evaluating log(0) for
# houses without a basement (divide-by-zero warning, -inf intermediates).
train_data.loc[has_bsmt, 'TotalBsmtSF'] = np.log(train_data.loc[has_bsmt, 'TotalBsmtSF'])

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import norm, probplot
import numpy as np

# Keep only the positive TotalBsmtSF values (rows with value zero are left out of the plots).
filtered_data = train_data[train_data['TotalBsmtSF'] > 0]['TotalBsmtSF']

# Histogram with a fitted normal curve.
sns.histplot(filtered_data, kde=True, stat="density", bins=30, color='blue')
mean, std = norm.fit(filtered_data)
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
p = norm.pdf(x, mean, std)
# The colour is given once, via the keyword. Passing the 'k' format string as
# well makes matplotlib warn that the colour is redundantly defined.
plt.plot(x, p, linewidth=2, color='red')
plt.title('Histogram with Normal Fit')

# Normal probability plot in a separate figure.
fig = plt.figure()
res = probplot(filtered_data, plot=plt)
plt.title('Normal Probability Plot')
plt.show()

## Homoscedasticity

> Homoscedasticity refers to the assumption that "dependent variable(s) exhibit equal levels of variance across the range of predictor variable(s)" (Hair et al., 2013). Homoscedasticity is desirable because we want the error term to be the same across all values of the independent variables.

The best approach to test homoscedasticity for two metric variables is graphically. Departures from an equal dispersion are shown by such shapes as cones (small dispersion at one side of the graph, large dispersion at the opposite side) or diamonds (a large number of points at the center of the distribution).

Starting by 'SalePrice' and 'GrLivArea'...

In [None]:
#scatter plot
plt.scatter(train_data['GrLivArea'], train_data['SalePrice']);

Now let's check `SalePrice` with `TotalBsmtSF`.

In [None]:
# Scatter plot of SalePrice against TotalBsmtSF, restricted to rows with TotalBsmtSF > 0.
with_basement = train_data.loc[train_data['TotalBsmtSF'] > 0]
plt.scatter(with_basement['TotalBsmtSF'], with_basement['SalePrice']);

* We can say that, in general, `SalePrice` exhibits equal levels of variance across the range of `TotalBsmtSF`. Cool!

## Dummy Variables

[Tutorial: Exploratory Data Analysis (EDA) with categorical variables](https://medium.com/analytics-vidhya/tutorial-exploratory-data-analysis-eda-with-categorical-variables-6a569a3aea55)

In [None]:
#convert categorical variable into dummy
train_data = pd.get_dummies(train_data)

# Training the model
[Ensemble Learning Techniques Tutorial (Kaggle notebook)](https://www.kaggle.com/code/pavansanagapati/ensemble-learning-techniques-tutorial/notebook)

### Model Selection and Hyperparameter Tuning :

* **Choosing Models** : Various regression algorithms are selected based on their suitability for the task, including Lasso, ElasticNet, KernelRidge, GradientBoostingRegressor, XGBRegressor, LGBMRegressor, and MLPRegressor.


* **Hyperparameter Optimization** : Each model undergoes hyperparameter tuning using RandomizedSearchCV. 

 This involves :
   
   * Defining a hyperparameter grid (param_grids) specific to each model.
    
   *   Conducting randomized searches over the parameter grid to find the best combination that maximizes model performance, measured by cross-validated metrics.


### Ensemble Learning - Stacking :

* **Constructing StackingRegressor** : The best-tuned base models are combined using a StackingRegressor.

     This meta-estimator :
      
    * Aggregates predictions from multiple base models.
    
    * Uses a final estimator (e.g., Ridge, SVR, XGBRegressor, LGBMRegressor) to blend the predictions of base models into a single output.

In [None]:
# Reload untouched copies of the raw data for the modelling section.
# The absolute Kaggle input path matches the first load at the top of the
# notebook, so the result does not depend on the current working directory.
train = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/train.csv")
test = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")

#### Based on the distribution of data let us remove some of the outliers

In [None]:
# Remove sales with a very large living area (> 4000) but a low price (< 300000).
outlier_mask = (train['GrLivArea'] > 4000) & (train['SalePrice'] < 300000)
train.drop(index=train[outlier_mask].index, inplace=True)

Let us concatenate the training and test datasets into a single dataframe for ease of data cleaning and feature engineering. The `Id` feature has no significance for our modelling since it is only a row identifier, so we drop it from the combined dataset.

In [None]:
# Stack the train and test rows into one frame with a fresh index, then discard 'Id'.
full = pd.concat([train, test], ignore_index=True).drop(columns='Id')
full.shape

In [None]:
# Missing-value count per column; show only the affected columns, largest first.
missing_values = full.isnull().sum()
missing_values.loc[missing_values.gt(0)].sort_values(ascending=False)

Let us impute the missing values of `LotFrontage` using the median `LotFrontage` of houses in the same `LotArea` bin and `Neighborhood`. To prepare for this, let us first look at the mean, median and count of `LotFrontage` per `Neighborhood`.

In [None]:
full.groupby(['Neighborhood'])[['LotFrontage']].agg(['mean','median','count'])

`LotArea` is a continuous feature, so it is best to use pandas' `qcut` method to divide it into 10 quantile-based bins.

In [None]:
# Bin LotArea into 10 quantile-based intervals.
full['LotAreaCut'] = pd.qcut(full.LotArea,10)

# LotAreaCut is categorical. `observed=False` spells out the historical default:
# recent pandas warns when it is omitted, and the default is changing.
full.groupby('LotAreaCut', observed=False)[['LotFrontage']].agg(['mean','median','count'])

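So let us impute the missing values of `LotFrontage` as stated above: first with the median within each (`LotAreaCut`, `Neighborhood`) group, then with the median of the `LotAreaCut` bin for any values that are still missing.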

In [None]:
# First pass: fill missing LotFrontage with the median of its (LotAreaCut, Neighborhood) group.
# `observed=False` spells out the historical default for the categorical LotAreaCut key:
# recent pandas warns when it is omitted, and the default is changing.
full['LotFrontage'] = (
    full.groupby(['LotAreaCut', 'Neighborhood'], observed=False)['LotFrontage']
    .transform(lambda x: x.fillna(x.median()))
)
# Second pass: a group with no known LotFrontage has a NaN median, so what is
# still missing falls back to the median of the LotAreaCut bin alone.
full['LotFrontage'] = (
    full.groupby(['LotAreaCut'], observed=False)['LotFrontage']
    .transform(lambda x: x.fillna(x.median()))
)

Now let us recheck the missing values to see our LotFrontage missing values are imputed successfully.

In [None]:
# Recount missing values per column; show only the affected columns, largest first.
missing_values = full.isnull().sum()
missing_values.loc[missing_values.gt(0)].sort_values(ascending=False)

Focus on the numerical features with very few missing values and replace the missing entries with 0.

In [None]:
# Numeric columns whose missing entries are replaced by 0.
columns = ["MasVnrArea", "BsmtUnfSF", "TotalBsmtSF", "GarageCars", "BsmtFinSF2", "BsmtFinSF1", "GarageArea"]
# Assign the filled column back. An in-place fillna on `full[col]` operates on a
# column selection, which pandas warns about and which does not update `full`
# once Copy-on-Write is enabled.
for col in columns:
    full[col] = full[col].fillna(0)

Focus on some of the categorical features with a large number of missing values and replace the missing entries with 'None'.

In [None]:
# Columns whose missing entries are replaced by the string 'None'.
# GarageYrBlt is in this list too; it is cast to str in a later cell.
columns1 = ["PoolQC" , "MiscFeature", "Alley", "Fence", "FireplaceQu", "GarageQual", "GarageCond", "GarageFinish",
"GarageYrBlt", "GarageType", "BsmtExposure", "BsmtCond", "BsmtQual", "BsmtFinType2", "BsmtFinType1", "MasVnrType"]
# Assign the filled column back. An in-place fillna on `full[col1]` operates on a
# column selection, which pandas warns about and which does not update `full`
# once Copy-on-Write is enabled.
for col1 in columns1:
    full[col1] = full[col1].fillna('None')

Focus on some of the categorical features with fewer missing values and replace the missing entries with the most frequently occurring value, which is the mode of that feature.

In [None]:
# Columns whose missing entries are replaced by the column's most frequent value.
columns2 = ["MSZoning", "BsmtFullBath", "BsmtHalfBath", "Utilities", "Functional",
            "Electrical", "KitchenQual", "SaleType","Exterior1st", "Exterior2nd"]

# Assign the filled column back. An in-place fillna on `full[col2]` operates on a
# column selection, which pandas warns about and which does not update `full`
# once Copy-on-Write is enabled.
for col2 in columns2:
    # mode() may return several values; the first one is used.
    full[col2] = full[col2].fillna(full[col2].mode()[0])

Recheck whether any other missing values still need to be imputed, apart from `SalePrice` for the test rows, which is the target variable to be predicted.

In [None]:
# Columns that still contain missing values, with their counts.
remaining_missing = full.isnull().sum()
remaining_missing[remaining_missing > 0]

In [None]:
numeric_features = full.select_dtypes(include=[np.number])
numeric_features.columns

In [None]:
# Numeric-looking columns that are converted to strings.
Numstr = [
    "MSSubClass", "BsmtFullBath", "BsmtHalfBath", "HalfBath", "BedroomAbvGr", "KitchenAbvGr",
    "MoSold", "YrSold", "YearBuilt", "YearRemodAdd", "LowQualFinSF", "GarageYrBlt",
]

# Cast all of them in one call using a column -> dtype mapping.
full = full.astype({name: str for name in Numstr})

In [None]:
full.groupby(['MSSubClass'])[['SalePrice']].agg(['mean','median','count'])

In [None]:
def map_values():
    """Add integer-coded ``o<Feature>`` columns to the global ``full`` frame.

    Each categorical column listed below is passed through a fixed
    category -> integer dictionary with ``Series.map`` and stored under the
    same name prefixed with ``"o"`` (e.g. ``MSZoning`` -> ``oMSZoning``).
    Categories that are absent from a dictionary map to NaN.

    Returns:
        The string ``"Done!"``.
    """
    # Source column -> {category: integer code}; insertion order is the order
    # in which the new columns are created.
    ordinal_maps = {
        "MSSubClass": {
            '180': 1,
            '30': 2, '45': 2,
            '190': 3, '50': 3, '90': 3,
            '85': 4, '40': 4, '160': 4,
            '70': 5, '20': 5, '75': 5, '80': 5, '150': 5,
            '120': 6, '60': 6,
        },
        "MSZoning": {'C (all)': 1, 'RH': 2, 'RM': 2, 'RL': 3, 'FV': 4},
        "Neighborhood": {
            'MeadowV': 1,
            'IDOTRR': 2, 'BrDale': 2,
            'OldTown': 3, 'Edwards': 3, 'BrkSide': 3,
            'Sawyer': 4, 'Blueste': 4, 'SWISU': 4, 'NAmes': 4,
            'NPkVill': 5, 'Mitchel': 5,
            'SawyerW': 6, 'Gilbert': 6, 'NWAmes': 6,
            'Blmngtn': 7, 'CollgCr': 7, 'ClearCr': 7, 'Crawfor': 7,
            'Veenker': 8, 'Somerst': 8, 'Timber': 8,
            'StoneBr': 9,
            'NoRidge': 10, 'NridgHt': 10,
        },
        "Condition1": {
            'Artery': 1,
            'Feedr': 2, 'RRAe': 2,
            'Norm': 3, 'RRAn': 3,
            'PosN': 4, 'RRNe': 4,
            'PosA': 5, 'RRNn': 5,
        },
        "BldgType": {'2fmCon': 1, 'Duplex': 1, 'Twnhs': 1, '1Fam': 2, 'TwnhsE': 2},
        "HouseStyle": {
            '1.5Unf': 1,
            '1.5Fin': 2, '2.5Unf': 2, 'SFoyer': 2,
            '1Story': 3, 'SLvl': 3,
            '2Story': 4, '2.5Fin': 4,
        },
        "Exterior1st": {
            'BrkComm': 1,
            'AsphShn': 2, 'CBlock': 2, 'AsbShng': 2,
            'WdShing': 3, 'Wd Sdng': 3, 'MetalSd': 3, 'Stucco': 3, 'HdBoard': 3,
            'BrkFace': 4, 'Plywood': 4,
            'VinylSd': 5,
            'CemntBd': 6,
            'Stone': 7, 'ImStucc': 7,
        },
        "MasVnrType": {'BrkCmn': 1, 'None': 1, 'BrkFace': 2, 'Stone': 3},
        "ExterQual": {'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
        "Foundation": {
            'Slab': 1,
            'BrkTil': 2, 'CBlock': 2, 'Stone': 2,
            'Wood': 3, 'PConc': 4,
        },
        "BsmtQual": {'Fa': 2, 'None': 1, 'TA': 3, 'Gd': 4, 'Ex': 5},
        "BsmtExposure": {'None': 1, 'No': 2, 'Av': 3, 'Mn': 3, 'Gd': 4},
        "Heating": {'Floor': 1, 'Grav': 1, 'Wall': 2, 'OthW': 3, 'GasW': 4, 'GasA': 5},
        "HeatingQC": {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
        "KitchenQual": {'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
        "Functional": {'Maj2': 1, 'Maj1': 2, 'Min1': 2, 'Min2': 2, 'Mod': 2, 'Sev': 2, 'Typ': 3},
        "FireplaceQu": {'None': 1, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
        "GarageType": {
            'CarPort': 1, 'None': 1,
            'Detchd': 2,
            '2Types': 3, 'Basment': 3,
            'Attchd': 4, 'BuiltIn': 5,
        },
        "GarageFinish": {'None': 1, 'Unf': 2, 'RFn': 3, 'Fin': 4},
        "PavedDrive": {'N': 1, 'P': 2, 'Y': 3},
        "SaleType": {
            'COD': 1, 'ConLD': 1, 'ConLI': 1, 'ConLw': 1, 'Oth': 1, 'WD': 1,
            'CWD': 2, 'Con': 3, 'New': 3,
        },
        "SaleCondition": {
            'AdjLand': 1, 'Abnorml': 2, 'Alloca': 2, 'Family': 2, 'Normal': 3, 'Partial': 4,
        },
    }

    for feature, codes in ordinal_maps.items():
        full["o" + feature] = full[feature].map(codes)

    return "Done!"

In [None]:
map_values()

In [None]:
# Remove the LotAreaCut helper bins and the SalePrice target from the feature frame.
full = full.drop(columns=["LotAreaCut", "SalePrice"])

In [None]:
full[['YearBuilt','YearRemodAdd','GarageYrBlt']].head()

Create a transformer class that uses `LabelEncoder` to encode some of the identified features.

In [None]:
from sklearn.base import BaseEstimator,TransformerMixin

class labenc(BaseEstimator,TransformerMixin):
    """Label-encode the year columns of a feature frame.

    ``YearBuilt``, ``YearRemodAdd`` and ``GarageYrBlt`` are each replaced by
    the integer codes of a ``LabelEncoder`` fitted on that column alone.
    Nothing is learned in ``fit``; the encoding is recomputed on every
    ``transform`` call.
    """

    # Columns that transform() re-encodes.
    YEAR_COLUMNS = ('YearBuilt', 'YearRemodAdd', 'GarageYrBlt')

    def __init__(self):
        pass

    def fit(self,X,y=None):
        """No-op fit; returns ``self`` so the step can be chained in a Pipeline."""
        return self

    def transform(self,X):
        """Return a copy of ``X`` with the year columns label-encoded.

        Args:
            X: DataFrame containing the columns in ``YEAR_COLUMNS``.

        Returns:
            A new DataFrame; the frame passed in is left unchanged.
        """
        # Imported here so the transformer does not depend on an import that
        # lives in a later notebook cell.
        from sklearn.preprocessing import LabelEncoder

        # Work on a copy so the caller's frame is not modified as a side effect.
        X = X.copy()
        for column in self.YEAR_COLUMNS:
            X[column] = LabelEncoder().fit_transform(X[column])
        return X

In [None]:
class skewness(BaseEstimator,TransformerMixin):
    """Apply ``log1p`` to the non-object columns whose skewness is large.

    Args:
        skew: threshold on the absolute sample skewness; columns with
            ``abs(skewness) >= skew`` are transformed. Defaults to 0.5.
    """

    def __init__(self,skew=0.5):
        self.skew = skew

    def fit(self,X,y=None):
        """No-op fit; returns ``self`` so the step can be chained in a Pipeline."""
        return self

    def transform(self,X):
        """Return a copy of ``X`` with the skewed columns replaced by ``log1p`` values.

        Args:
            X: DataFrame; only columns whose dtype is not ``object`` are examined.

        Returns:
            A new DataFrame; the frame passed in is left unchanged.
        """
        # Imported here (under an alias) so the transformer does not depend on
        # an import that lives in a later notebook cell.
        from scipy.stats import skew as sample_skew

        # Work on a copy so the caller's frame is not modified as a side effect.
        X = X.copy()
        X_numeric = X.select_dtypes(exclude=["object"])
        column_skew = X_numeric.apply(lambda x: sample_skew(x))
        skewed_columns = column_skew[abs(column_skew) >= self.skew].index
        X[skewed_columns] = np.log1p(X[skewed_columns])
        return X

In [None]:
class dummies(BaseEstimator,TransformerMixin):
    """Pipeline step that one-hot encodes a frame with ``pd.get_dummies``."""

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing to learn; return ``self``."""
        return self

    def transform(self, X):
        """Return ``pd.get_dummies(X)``."""
        return pd.get_dummies(X)

Now we will use pipeline to chain multiple estimators into one. This is useful as there is often a fixed sequence of steps in processing the data, for example feature selection, normalization and classification. Pipeline serves two purposes here:

Convenience: You only have to call fit and predict once on your data to fit a whole sequence of estimators. Joint parameter selection: You can grid search over parameters of all estimators in the pipeline at once. All estimators in a pipeline, except the last one, must be transformers (i.e. must have a transform method). The last estimator may be any type (transformer, classifier, etc.).

In [None]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([('labenc',labenc()),('skewness',skewness(skew =1)),('dummies',dummies())])

In [None]:
from sklearn.preprocessing import LabelEncoder
from scipy.stats import skew

full_copy = full.copy()
data_pipeline = pipeline.fit_transform(full_copy)

In [None]:
from sklearn.preprocessing import RobustScaler

robust_scaler = RobustScaler()

In [None]:
n_train = train.shape[0]
n_train

In [None]:
# Split the processed frame back by position: the first n_train rows are the
# training features, the rest are the test features.
X = data_pipeline.iloc[:n_train]
y = train['SalePrice']
test_X = data_pipeline.iloc[n_train:]
X.shape, y.shape, test_X.shape

In [None]:
# Fit the scaler on the training features only, then apply it to both splits.
robust_scaler.fit(X)
X_scaled = robust_scaler.transform(X)
test_X_scaled = robust_scaler.transform(test_X)

# Log-transformed target.
y_log = np.log(train['SalePrice'])

In [None]:
X_scaled.shape,y_log.shape,test_X.shape

Now we will perform some feature selection like Lasso

In [None]:
class add_feature(BaseEstimator, TransformerMixin):
    """Add engineered area, interaction and aggregate columns to a frame.

    Parameters
    ----------
    additional : int, default=1
        ``1`` adds only ``TotalHouse`` and ``TotalArea``. Any other value also
        adds the ``+_*`` / ``-_*`` interaction columns and the ``Bsmt``,
        ``Rooms``, ``PorchArea`` and ``TotalPlace`` aggregates.
    """

    def __init__(self, additional=1):
        self.additional = additional

    def fit(self, X, y=None):
        """Stateless: return the transformer unchanged."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the engineered columns appended.

        The frame is returned for every value of ``additional``; previously
        the ``return`` sat inside the ``else`` branch, so ``additional=1``
        returned ``None``. The input frame itself is not modified.
        """
        X = X.copy()

        # Base area features, shared by both modes.
        X["TotalHouse"] = X["TotalBsmtSF"] + X["1stFlrSF"] + X["2ndFlrSF"]
        X["TotalArea"] = X["TotalBsmtSF"] + X["1stFlrSF"] + X["2ndFlrSF"] + X["GarageArea"]

        if self.additional != 1:
            X["+_TotalHouse_OverallQual"] = X["TotalHouse"] * X["OverallQual"]
            X["+_GrLivArea_OverallQual"] = X["GrLivArea"] * X["OverallQual"]
            X["+_oMSZoning_TotalHouse"] = X["oMSZoning"] * X["TotalHouse"]
            X["+_oMSZoning_OverallQual"] = X["oMSZoning"] + X["OverallQual"]
            X["+_oMSZoning_YearBuilt"] = X["oMSZoning"] + X["YearBuilt"]
            X["+_oNeighborhood_TotalHouse"] = X["oNeighborhood"] * X["TotalHouse"]
            X["+_oNeighborhood_OverallQual"] = X["oNeighborhood"] + X["OverallQual"]
            X["+_oNeighborhood_YearBuilt"] = X["oNeighborhood"] + X["YearBuilt"]
            X["+_BsmtFinSF1_OverallQual"] = X["BsmtFinSF1"] * X["OverallQual"]

            X["-_oFunctional_TotalHouse"] = X["oFunctional"] * X["TotalHouse"]
            X["-_oFunctional_OverallQual"] = X["oFunctional"] + X["OverallQual"]
            X["-_LotArea_OverallQual"] = X["LotArea"] * X["OverallQual"]
            X["-_TotalHouse_LotArea"] = X["TotalHouse"] + X["LotArea"]
            X["-_oCondition1_TotalHouse"] = X["oCondition1"] * X["TotalHouse"]
            X["-_oCondition1_OverallQual"] = X["oCondition1"] + X["OverallQual"]

            X["Bsmt"] = X["BsmtFinSF1"] + X["BsmtFinSF2"] + X["BsmtUnfSF"]
            X["Rooms"] = X["FullBath"] + X["TotRmsAbvGrd"]
            X["PorchArea"] = X["OpenPorchSF"] + X["EnclosedPorch"] + X["3SsnPorch"] + X["ScreenPorch"]
            X["TotalPlace"] = X["TotalBsmtSF"] + X["1stFlrSF"] + X["2ndFlrSF"] + X["GarageArea"] + X["OpenPorchSF"] + X["EnclosedPorch"] + X["3SsnPorch"] + X["ScreenPorch"]

        return X

In [None]:
# Same preprocessing as before, with the feature-engineering step inserted after label encoding.
pipeline = Pipeline(
    steps=[
        ('labenc', labenc()),
        ('add_feature', add_feature(additional=2)),
        ('skewness', skewness(skew=1)),
        ('dummies', dummies()),
    ]
)

full_pipe = pipeline.fit_transform(full)
full_pipe.shape

In [None]:
# Split the engineered frame at the train/test boundary.
n_train = train.shape[0]
X, test_X = full_pipe[:n_train], full_pipe[n_train:]
y = train["SalePrice"]

# Fit the scaler on the training rows only and reuse it for the test rows.
X_scaled = robust_scaler.fit_transform(X)
test_X_scaled = robust_scaler.transform(test_X)
y_log = np.log(train["SalePrice"])

In [None]:
# Shape of the scaled training matrix built from the extended pipeline.
print(X_scaled.shape)

In [None]:
# Cross-validated root mean squared error
def rmse_cv(model,X,y):
    """Return the per-fold RMSE of ``model`` from 5-fold cross-validation.

    ``cross_val_score`` reports the negated MSE for this scoring string, so
    the sign is flipped before taking the square root.
    """
    neg_mse = cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=5)
    return np.sqrt(-neg_mse)

We choose 13 models and use 5-fold cross-validation to evaluate them.

**Models include** : 
* LinearRegression
* Ridge
* Lasso
* Random Forest

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, SGDRegressor, BayesianRidge
from sklearn.svm import SVR, LinearSVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.kernel_ridge import KernelRidge
from xgboost import XGBRegressor

# Candidate regressors, in the same order as the `names` list used for reporting.
models = [
    LinearRegression(),
    Ridge(),
    Lasso(alpha=0.01, max_iter=10000),
    RandomForestRegressor(),
    GradientBoostingRegressor(),
    SVR(),
    LinearSVR(),
    ElasticNet(alpha=0.001, max_iter=10000),
    SGDRegressor(max_iter=1000, tol=1e-3),
    BayesianRidge(),
    KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5),
    ExtraTreesRegressor(),
    XGBRegressor(),
]

In [None]:
# Short display labels, one per entry of `models` and in the same order.
names = [
    'LR', 'Ridge', 'Lasso', 'RF', 'GBR', 'SVR', 'LSVR',
    'ENet', 'SGDR', 'BayRidge', 'Kernel', 'XTreeR', 'XGBR',
]

In [None]:
from sklearn.model_selection import cross_val_score

# Cross-validate every candidate and report the mean and spread of its fold RMSEs.
for model, name in zip(models, names):
    score = rmse_cv(model, X_scaled, y_log)
    # "{:.4f}" gives four decimals; the previous "{:4f}" only set a minimum width.
    print("{}: {:.6f}, {:.4f}".format(name, score.mean(), score.std()))

In [None]:
from sklearn.base import RegressorMixin

# Weighted average of several regressors
class AverageWeight(BaseEstimator, RegressorMixin):
    """Regressor that predicts a fixed weighted sum of its base models.

    Parameters
    ----------
    model : list of estimators
        Base regressors; each is cloned and fitted on the full data in ``fit``.
    weight : list of float
        One weight per base regressor, in the same order as ``model``.
    """

    def __init__(self, model, weight):
        self.model = model
        self.weight = weight

    def fit(self, X, y):
        """Fit a fresh clone of every base model on ``(X, y)``.

        Raises
        ------
        ValueError
            If ``model`` and ``weight`` differ in length. Pairing them with
            ``zip`` would otherwise silently ignore the surplus entries.
        """
        if len(self.model) != len(self.weight):
            raise ValueError(
                "model and weight must have the same length, got "
                f"{len(self.model)} models and {len(self.weight)} weights"
            )
        self.models_ = [clone(base) for base in self.model]
        for fitted in self.models_:
            fitted.fit(X, y)
        return self

    def predict(self, X):
        """Return the weighted sum of the base models' predictions as an array."""
        # One row per model, one column per sample.
        predictions = np.array([fitted.predict(X) for fitted in self.models_])
        weights = np.asarray(self.weight, dtype=float)
        # (n_models,) @ (n_models, n_samples) -> (n_samples,)
        return weights @ predictions

In [None]:
# Base models with fixed hyperparameters, used by the averaging and stacking cells below.
lasso = Lasso(alpha=0.0005, max_iter=10000)
ridge = Ridge(alpha=45, max_iter=10000)
svr = SVR(kernel='rbf', C=0.2, epsilon=0.025, gamma=0.0004)
ker = KernelRidge(kernel='polynomial', degree=3, alpha=0.15, coef0=0.9)
ela = ElasticNet(alpha=0.0065, l1_ratio=0.075, max_iter=10000)
bay = BayesianRidge()

Finally to calculate the average weights let us look at the following code

In [None]:
from sklearn.base import clone

# Blend weights for the six models, in the order lasso, ridge, svr, ker, ela, bay.
w1, w2, w3 = 0.047, 0.2, 0.25
w4, w5, w6 = 0.3, 0.003, 0.2

weight_avg = AverageWeight(
    model=[lasso, ridge, svr, ker, ela, bay],
    weight=[w1, w2, w3, w4, w5, w6],
)
score = rmse_cv(weight_avg, X_scaled, y_log)
print(score.mean())

If we consider only two models then the score will vary

In [None]:
# Equal-weight blend of only the SVR and kernel ridge models.
weight_avg = AverageWeight(
    model=[svr, ker],
    weight=[0.50, 0.50],
)
score = rmse_cv(weight_avg, X_scaled, y_log)
print(score.mean())

## Stacking
Stacking is an ensemble learning technique that uses predictions from multiple models (for example decision tree, knn or svm) to build a new model. This model is used for making predictions on the test set.

In [None]:
# Define the stacking class
class stacking(BaseEstimator, RegressorMixin, TransformerMixin):
    """Two-level stacked regressor built on K-fold out-of-fold predictions.

    Parameters
    ----------
    mod : list of estimators
        First-level regressors. Each is cloned and fitted once per fold.
    meta_model : estimator
        Second-level regressor, fitted on the out-of-fold predictions of the
        first-level models.
    """

    def __init__(self, mod, meta_model):
        self.mod = mod
        self.meta_model = meta_model
        self.kf = KFold(n_splits=5, random_state=42, shuffle=True)

    def fit(self, X, y):
        """Fit the per-fold base models, then the meta model on their OOF output.

        ``X`` and ``y`` are converted to NumPy arrays first, so the fold
        indices are always applied by position. Indexing a pandas object with
        ``obj[indices]`` is label-based and fails (or picks the wrong rows)
        when its index is not ``0..n-1``.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        self.saved_model = [[] for _ in self.mod]
        oof_train = np.zeros((X.shape[0], len(self.mod)))

        for i, model in enumerate(self.mod):
            for train_index, val_index in self.kf.split(X, y):
                renew_model = clone(model)
                renew_model.fit(X[train_index], y[train_index])
                self.saved_model[i].append(renew_model)
                oof_train[val_index, i] = renew_model.predict(X[val_index])

        self.meta_model.fit(oof_train, y)
        return self

    def predict(self, X):
        """Predict with the meta model on the fold-averaged base predictions."""
        X = np.asarray(X)
        # One column per base model: the mean over that model's fold copies.
        whole_test = np.column_stack([
            np.column_stack([fold_model.predict(X) for fold_model in fold_models]).mean(axis=1)
            for fold_models in self.saved_model
        ])
        return self.meta_model.predict(whole_test)

    def get_oof(self, X, y, test_X):
        """Return ``(oof, test_mean)`` first-level features for train and test.

        ``oof`` holds each base model's out-of-fold predictions on ``X``;
        ``test_mean`` holds, per base model, the mean over folds of its
        predictions on ``test_X``.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        test_X = np.asarray(test_X)
        # Size the per-fold buffer from the splitter instead of a literal 5.
        n_splits = self.kf.get_n_splits()
        oof = np.zeros((X.shape[0], len(self.mod)))
        test_single = np.zeros((test_X.shape[0], n_splits))
        test_mean = np.zeros((test_X.shape[0], len(self.mod)))
        for i, model in enumerate(self.mod):
            for j, (train_index, val_index) in enumerate(self.kf.split(X, y)):
                clone_model = clone(model)
                clone_model.fit(X[train_index], y[train_index])
                oof[val_index, i] = clone_model.predict(X[val_index])
                test_single[:, j] = clone_model.predict(test_X)
            test_mean[:, i] = test_single.mean(axis=1)
        return oof, test_mean

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Lasso, Ridge, ElasticNet, BayesianRidge
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVR
from sklearn.base import clone
from sklearn.model_selection import KFold
import numpy as np
from sklearn.impute import SimpleImputer

# Pass features and target through a default SimpleImputer; the target is
# reshaped to a single column for the imputer and flattened back afterwards.
X_scaled_imputed = SimpleImputer().fit_transform(X_scaled)
y_log_imputed = (
    SimpleImputer()
    .fit_transform(y_log.to_numpy().reshape(-1, 1))
    .ravel()
)

# Stack the six tuned models with kernel ridge as the meta model and cross-validate.
stack_model = stacking(
    mod=[lasso, ridge, svr, ker, ela, bay],
    meta_model=ker,
)
score = rmse_cv(stack_model, X_scaled_imputed, y_log_imputed)
print(score.mean())

## Blending
Blending follows the same approach as stacking but uses only a holdout (validation) set from the train set to make predictions. In other words, unlike stacking, the predictions are made on the holdout set only. The holdout set and the predictions are used to build a model which is run on the test set. 

In [None]:
from sklearn.datasets import load_wine
# Wine demo dataset: feature matrix and class labels, loaded in a single call.
X, y = load_wine(return_X_y=True)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing, then 25% of the remainder for validation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1)

In [None]:
# Frame versions of the validation and test features, for concatenation with the base predictions.
x_val, x_test = pd.DataFrame(X_val), pd.DataFrame(X_test)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

# Base learner 1: decision tree. Keep its validation and test predictions as frames.
model1 = DecisionTreeClassifier().fit(X_train, y_train)
val_pred1 = pd.DataFrame(model1.predict(X_val))
test_pred1 = pd.DataFrame(model1.predict(X_test))

# Base learner 2: k-nearest neighbours, handled the same way.
model2 = KNeighborsClassifier().fit(X_train, y_train)
val_pred2 = pd.DataFrame(model2.predict(X_val))
test_pred2 = pd.DataFrame(model2.predict(X_test))

In [None]:
from sklearn.linear_model import LogisticRegression

# Meta-features: the original columns followed by both base-model predictions.
df_val = pd.concat((x_val, val_pred1, val_pred2), axis=1)
df_test = pd.concat((x_test, test_pred1, test_pred2), axis=1)

# Fit the blender on the validation set and score it on the test set.
model = LogisticRegression().fit(df_val, y_val)
model.score(df_test, y_test)

## Bagging
Bagging is shorthand for the combination of bootstrapping and aggregating. Bootstrapping is a method to help decrease the variance of the classifier and reduce overfitting, by resampling data from the training set with the same cardinality as the original set. The model created should be less overfitted than a single individual model.

There are three main terms describing the ensemble (combination) of various models into one more effective model:

* **Bagging** to decrease the model’s variance;
* **Boosting** to decrease the model’s bias, and;
* **Stacking** to increase the predictive force of the classifier.

In [None]:
from sklearn.datasets import load_wine
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, BaggingRegressor, VotingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

# Wine demo dataset: feature matrix and class labels, loaded in a single call.
X, y = load_wine(return_X_y=True)

Sklearn’s **VotingRegressor** allows you to combine different machine learning Regressors.

## Boosting
The main idea of boosting is to add additional models to the overall ensemble model sequentially.

Previously with bagging, we averaged each individual model created. This time with each iteration of boosting, a new model is created and the new base-learner model is trained (updated) from the errors of the previous learners.

In [None]:
from sklearn.datasets import load_wine
# Wine demo dataset, loaded in a single call, with 20% of the rows held out for testing.
X, y = load_wine(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

**Adaptive boosting** or **AdaBoost** is one of the simplest boosting algorithms.
Usually, decision trees are used for modelling. Multiple sequential models are created, each correcting the errors from the last model.

AdaBoost assigns weights to the observations which are incorrectly predicted and the subsequent model works to predict these values correctly.

In [None]:
from sklearn.ensemble import AdaBoostRegressor
# Fit AdaBoost on the training split and report its score on the held-out split.
ada_boost = AdaBoostRegressor(random_state=1).fit(X_train, y_train)
ada_boost.score(X_test, y_test)

**Gradient Boosting or GBM**

It is another ensemble machine learning algorithm that works for both regression and classification problems.

GBM uses the boosting technique, combining a number of weak learners to form a strong learner. Regression trees are used as the base learner; each subsequent tree in the series is built on the errors calculated by the previous tree.

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Fit gradient boosting with a small learning rate and score it on the held-out split.
grad_boost = GradientBoostingRegressor(
    learning_rate=0.01,
    random_state=1,
).fit(X_train, y_train)
grad_boost.score(X_test, y_test)

In [None]:
import xgboost as xgb
from xgboost import XGBRegressor


# Fit XGBoost with the same learning rate and score it on the held-out split.
xgb_boost = XGBRegressor(
    random_state=1,
    learning_rate=0.01,
).fit(X_train, y_train)
xgb_boost.score(X_test, y_test)

In [None]:
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.model_selection import cross_val_score

# RMSE scorer. scikit-learn's built-in scorer returns the *negated* RMSE
# (scorers follow a "greater is better" convention), so the sign is flipped
# when printing. The previous make_scorer(mean_squared_error, squared=False)
# returned a positive RMSE, which made the printed "-scores.mean()" negative,
# and it relied on the `squared` argument that newer releases no longer accept.
scorer = "neg_root_mean_squared_error"

eclf = VotingRegressor(
    estimators=[
        ('Ada Boost', ada_boost),
        ('Grad Boost', grad_boost),
        ('XG Boost', xgb_boost),
    ],
    weights=[1, 1, 1],
)

# List of regressors
regressors = [ada_boost, grad_boost, xgb_boost, eclf]

# Loop through regressors and evaluate using RMSE
for reg, label in zip(regressors, ['Ada Boost', 'Grad Boost', 'XG Boost', 'Ensemble']):
    scores = cross_val_score(reg, X, y, cv=10, scoring=scorer)
    print("RMSE: %0.2f (+/- %0.2f) [%s]" % (-scores.mean(), scores.std(), label))

In [None]:
eclf.fit(X_train, y_train)

# Evaluate using cross-validation and RMSE. The built-in scorer returns the
# negated RMSE, so "-scores.mean()" below is the (positive) mean RMSE.
scores = cross_val_score(eclf, X, y, cv=10, scoring="neg_root_mean_squared_error")
print("RMSE: %0.2f (+/- %0.2f) [%s]" % (-scores.mean(), scores.std(), 'Ensemble'))

## Submission

In [None]:
# Load the competition's sample submission file and preview its first rows.
sample_submission = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv")
sample_submission.head()

In [None]:
from sklearn.ensemble import VotingRegressor
import numpy as np

class CustomVotingRegressor(VotingRegressor):
    """``VotingRegressor`` that can optionally predict through a fitted stack.

    If both ``saved_model`` (a list of per-fold model lists) and
    ``meta_model`` have been attached to the instance, ``predict`` averages
    each group's predictions and feeds them to ``meta_model``. Otherwise it
    behaves exactly like ``VotingRegressor``. Previously ``predict`` always
    read those attributes, which are never set by this class, and so raised
    ``AttributeError``.
    """

    def __init__(self, estimators, weights):
        # `weights` is keyword-only in VotingRegressor; passing it
        # positionally raises TypeError.
        super().__init__(estimators=estimators, weights=weights)

    def predict(self, X):
        """Predict with the attached stack if present, else by weighted voting."""
        saved_model = getattr(self, "saved_model", None)
        meta_model = getattr(self, "meta_model", None)
        if saved_model is None or meta_model is None:
            return super().predict(X)

        # One column per base model: the mean over that model's fold copies.
        whole_test = np.column_stack([
            np.column_stack([model.predict(X) for model in single_model]).mean(axis=1)
            for single_model in saved_model
        ])
        return meta_model.predict(whole_test)

In [None]:
## Import necessary libraries
from sklearn.ensemble import ExtraTreesRegressor, AdaBoostRegressor, VotingRegressor
from sklearn.linear_model import ElasticNet, BayesianRidge
from sklearn.kernel_ridge import KernelRidge
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from mlxtend.regressor import StackingCVRegressor
from sklearn.metrics import mean_squared_error

# Load the test dataset and keep its Id column, in file order.
test_data = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")
test_ID = test_data['Id']

# Train on the house-price matrices produced by the preprocessing pipeline.
# By this point X_train / X_test hold the wine demo data from the blending and
# boosting sections, so they must not be used for the submission.
imputer = SimpleImputer(strategy='mean')
X_train_scaled = imputer.fit_transform(X_scaled)
X_test_scaled = imputer.transform(test_X_scaled)
# Plain array, so fold indices inside the stacking model are applied by position.
y_train_log = np.asarray(y_log)

print("Training features shape:", X_train_scaled.shape)
print("Training target shape:", y_train_log.shape)

# Fail loudly on misaligned data instead of silently trimming rows or ids.
if X_train_scaled.shape[0] != y_train_log.shape[0]:
    raise ValueError(
        f"Mismatch in number of samples: features {X_train_scaled.shape[0]}, "
        f"target {y_train_log.shape[0]}"
    )
if X_test_scaled.shape[0] != len(test_ID):
    raise ValueError(
        f"Mismatch between test rows ({X_test_scaled.shape[0]}) and test ids ({len(test_ID)})"
    )


def _predict_sale_price(model, X_fit, y_fit_log, X_new):
    """Fit ``model`` on log prices and return its predictions as prices.

    The target is ``np.log(SalePrice)``, so the inverse transform is
    ``np.exp`` (``np.expm1`` would be off by one).
    """
    model.fit(X_fit, y_fit_log)
    return np.exp(model.predict(X_new))


# Base models for the stack (same hyperparameters as in the cross-validation cells).
lasso = Lasso(alpha= 0.0005, max_iter= 10000)
ridge = Ridge(alpha=45, max_iter= 10000)
svr = SVR(C = 0.2, epsilon= 0.025, gamma = 0.0004, kernel = 'rbf')
ker = KernelRidge(alpha=0.15 ,kernel='polynomial',degree=3 , coef0=0.9)
ela = ElasticNet(alpha=0.0065,l1_ratio=0.075,max_iter=10000)
bay = BayesianRidge()

# Stacking model
stack_model = stacking(mod=[lasso,ridge,svr,ker,ela,bay],meta_model=ker)
stack_pred = _predict_sale_price(stack_model, X_train_scaled, y_train_log, X_test_scaled)

# Bagging: VotingRegressor with only ExtraTreesRegressor (seeded for reproducibility)
et = ExtraTreesRegressor(random_state=42)
bagging_regressor = VotingRegressor(estimators=[('Extra Trees', et)])
bagging_pred = _predict_sale_price(bagging_regressor, X_train_scaled, y_train_log, X_test_scaled)

# Boosting: AdaBoostRegressor (seeded for reproducibility)
ada_boost = AdaBoostRegressor(random_state=42)
boosting_pred = _predict_sale_price(ada_boost, X_train_scaled, y_train_log, X_test_scaled)

# Blend the three predictions. The raw weights 0.75 / 0.15 / 0.15 sum to 1.05,
# which would inflate every price by 5%, so they are normalised to sum to 1.
ensemble_weights = np.array([0.75, 0.15, 0.15])
ensemble_weights = ensemble_weights / ensemble_weights.sum()
ensemble_pred = (stack_pred * ensemble_weights[0] +
                 bagging_pred * ensemble_weights[1] +
                 boosting_pred * ensemble_weights[2])

# Verify alignment between test_ID and ensemble_pred
print("Shape of X_test_scaled:", X_test_scaled.shape)
print("Length of ensemble_pred:", len(ensemble_pred))
print("Length of test_ID:", len(test_ID))

# Build the submission frame; the lengths were checked above, so ids and
# predictions line up row by row.
sub = pd.DataFrame({'Id': test_ID.to_numpy(), 'SalePrice': ensemble_pred})

# Save submission file
sub.to_csv('submission.csv', index=False)

# Read the CSV file back to confirm what was written
submission = pd.read_csv('submission.csv')

# Display the first few rows of the DataFrame
print(submission.head())

# Display summary statistics and structure
print(submission.describe())
submission.info()  # info() prints its report itself and returns None
