In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [None]:
# Load the training data into a DataFrame.
# NOTE(review): hard-coded absolute path (with a space in the file name) —
# confirm it matches where the file actually lives in the runtime.
house_train_data = pd.read_csv('/content/train hp.csv')

In [None]:
# Preview the first rows of the training frame.
house_train_data.head()

In [None]:
#house_test_data.head()

In [None]:
# (number of rows, number of columns) of the training frame.
house_train_data.shape

In [None]:
# Columns that contain at least one missing value: print their names,
# then show how many there are as the cell result.
has_missing = house_train_data.isna().any()
columns_with_missing = house_train_data.columns[has_missing].tolist()
print(columns_with_missing)
len(columns_with_missing)

In [None]:
# Predictors are every column except the target; the target is SalePrice.
features = [col for col in house_train_data.columns if col != 'SalePrice']
X = house_train_data.loc[:, features]
y = house_train_data.loc[:, 'SalePrice']

In [None]:
# Pairwise correlations between the numeric columns.
# The frame also contains string (object) columns; DataFrame.corr() raises on
# those in pandas >= 2.0 (older versions silently dropped them), so select the
# numeric columns explicitly to get the same matrix on every pandas version.
corr_matrix = house_train_data.select_dtypes(include='number').corr()

In [None]:
# Rank the numeric columns by their correlation with the prediction target.
# "Id" is only a row identifier, so correlations against it carry no signal
# about price; the column being modelled in this notebook is SalePrice.
corr_matrix["SalePrice"].sort_values(ascending=False)

In [None]:
from pandas.plotting import scatter_matrix 
# Columns to plot pairwise against each other (38 columns -> 38 x 38 panels).
attributes = [
    "Id", "PoolArea", "BedroomAbvGr", "TotRmsAbvGrd", "MoSold", "GarageArea",
    "GarageCars", "OverallCond", "MSSubClass", "1stFlrSF", "GrLivArea", "HalfBath",
    "2ndFlrSF", "FullBath", "KitchenAbvGr", "EnclosedPorch", "BsmtFullBath",
    "ScreenPorch", "YrSold",
    "GarageYrBlt", "OpenPorchSF", "BsmtFinSF1", "BsmtFinSF2", "MiscVal",
    "BsmtUnfSF", "LotFrontage", "YearBuilt", "TotalBsmtSF", "Fireplaces",
    "BsmtHalfBath",
    "SalePrice", "YearRemodAdd", "OverallQual", "WoodDeckSF", "LotArea",
    "LowQualFinSF",
    "3SsnPorch", "MasVnrArea",
]
pairwise_frame = house_train_data[attributes]
scatter_matrix(pairwise_frame, figsize=(100, 100))

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Hold out a validation set (default split proportions, fixed seed).
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=1)

# Numeric predictors: every int64 / float64 column.
numerical_cols = [
    name for name, col_dtype in X_train.dtypes.items()
    if col_dtype in ['int64', 'float64']
]

# Categorical predictors: object columns with fewer than 10 distinct values,
# which keeps the one-hot encoded width small.
categorical_cols = [
    name for name, col_dtype in X_train.dtypes.items()
    if col_dtype == "object" and X_train[name].nunique() < 10
]

# Missing numeric values are replaced by a constant (SimpleImputer's default
# fill value for the 'constant' strategy).
numerical_transformer = SimpleImputer(strategy='constant')

# Missing categories take the most frequent value, then get one-hot encoded;
# categories unseen during fit are ignored at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Baseline 1: a single decision tree behind the shared preprocessor.
tree_model = DecisionTreeRegressor(random_state=0)

tree_clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('tree_model', tree_model)
                     ])

# Fit once: a second identical fit only refits from scratch and yields the
# same model, so the duplicated call was removed.
tree_clf.fit(X_train, y_train)

tree_preds = tree_clf.predict(X_valid)

# Mean absolute error on the held-out validation rows.
print('MAE:', mean_absolute_error(y_valid, tree_preds))

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Baseline 2: a random forest behind the shared preprocessor.
random_model = RandomForestRegressor(random_state=0)

random_clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('random_model', random_model)
                     ])

# Fit once: the duplicated fit call retrained the whole forest for no change
# in the resulting model (the seed is fixed), so it was removed.
random_clf.fit(X_train, y_train)

random_preds = random_clf.predict(X_valid)

# Mean absolute error on the held-out validation rows.
print('MAE:', mean_absolute_error(y_valid, random_preds))

In [None]:
from xgboost import XGBRegressor

# Baseline 3: gradient-boosted trees behind the shared preprocessor.
xgb_model = XGBRegressor(n_estimators=1000, learning_rate=0.07, random_state=0)

xgb_clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('xgb_model', xgb_model)
                     ])

# Fit once. The `xgb_model__verbose` fit parameter is forwarded by the
# pipeline to the XGBRegressor step; the second, identical fit that followed
# retrained all 1000 trees without changing the model and was removed.
xgb_clf.fit(X_train, y_train, xgb_model__verbose=False)

xgb_preds = xgb_clf.predict(X_valid)

# Mean absolute error on the held-out validation rows.
print('MAE:', mean_absolute_error(y_valid, xgb_preds))

In [None]:
# Disabled hyper-parameter grid search, kept as a string literal so it does not
# run (the cell only echoes the text).
# NOTE(review): `param_X` is not defined in any visible cell, and the
# `best_score_ * 2 - 1` rescaling looks borrowed from a classification example —
# confirm both before re-enabling this search.
'''from sklearn.model_selection import GridSearchCV
params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }

grid = GridSearchCV(xgb_model, param_grid=params, n_jobs=4, cv=5, verbose=3 )
grid.fit(param_X, y)
print('\n All results:')
print(grid.cv_results_)
print('\n Best estimator:')
print(grid.best_estimator_)
print('\n Best score:')
print(grid.best_score_ * 2 - 1)
print('\n Best parameters:')
print(grid.best_params_)'''

In [None]:
# XGBoost regressor with hand-set hyper-parameters, evaluated on the same
# train/validation split as the baselines.
#
# Two arguments from the earlier version were removed:
#   * gpu_id=1 pinned the model to a second GPU even though tree_method='exact'
#     is a CPU algorithm; on a machine without that device it can fail or warn.
#   * monotone_constraints=1 is not a valid per-feature constraint spec for the
#     many columns the preprocessor emits; leaving it unset means "no constraints".
hp_model = XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.6, gamma=0.5,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.02, max_delta_step=0, max_depth=4,
             min_child_weight=1,
             n_estimators=1000, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.8,
             tree_method='exact', validate_parameters=1, verbosity=0)

hp_clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('hp_model', hp_model)
                     ])

# `hp_model__verbose` is forwarded by the pipeline to XGBRegressor.fit.
hp_clf.fit(X_train, y_train, hp_model__verbose=0)

hp_preds = hp_clf.predict(X_valid)

# Mean absolute error on the held-out validation rows.
print('MAE:', mean_absolute_error(y_valid, hp_preds))

In [None]:
# List every predictor column name.
X.columns.to_list()

In [None]:
# Peek at the construction year and the remodel year.
for year_col in ('YearBuilt', 'YearRemodAdd'):
    print(X[year_col].head())

In [None]:
# Peek at the two lot-size columns.
for lot_col in ('LotArea', 'LotFrontage'):
    print(X[lot_col].head())

In [None]:
# Distinct values of the two land-shape columns.
for land_col in ('LandSlope', 'LandContour'):
    print(set(X[land_col]))

In [None]:
# Distinct sale years and sale months.
for sold_col in ('YrSold', 'MoSold'):
    print(set(X[sold_col]))

In [None]:
# Distinct values of the two Condition columns.
for condition_col in ('Condition1', 'Condition2'):
    print(set(X[condition_col]))

In [None]:
# Distinct values of the exterior quality and exterior condition columns.
for exterior_col in ('ExterQual', 'ExterCond'):
    print(set(X[exterior_col]))

In [None]:
# Feature-engineering experiment: add three derived columns, rebuild the
# preprocessor for the widened frame, and validate an XGBoost model on it.
X_feat_eng = X.copy()
# Years between original construction and the last remodel.
X_feat_eng['years_since_update'] = X_feat_eng['YearRemodAdd'] - X_feat_eng['YearBuilt']
# Lot area per unit of frontage; NaN wherever LotFrontage is missing (the
# numeric imputer below fills those).
X_feat_eng['geometry'] = X_feat_eng['LotArea'] / X_feat_eng['LotFrontage']
# Combined slope/contour category, e.g. "<LandSlope>_<LandContour>".
X_feat_eng['land_topology'] = X_feat_eng['LandSlope'] + '_' + X_feat_eng['LandContour']

feature_numerical_cols = [cname for cname in X_feat_eng.columns if 
                X_feat_eng[cname].dtype in ['int64', 'float64']]

# Object columns with fewer than 13 distinct values are one-hot encoded.
feature_categorical_cols = [cname for cname in X_feat_eng.columns if
                    X_feat_eng[cname].nunique() < 13 and 
                    X_feat_eng[cname].dtype == "object"]


feature_numerical_transformer = SimpleImputer(strategy='constant')

feature_categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

feature_preprocessor = ColumnTransformer(
    transformers=[
        ('num', feature_numerical_transformer, feature_numerical_cols),
        ('cat', feature_categorical_transformer, feature_categorical_cols)
])

# gpu_id=1 and monotone_constraints='1' were removed: the first pinned a
# CPU-only tree method ('exact') to a second GPU, and the second is a
# one-element constraint spec that cannot match the many encoded features.
feature_model = XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.6, gamma=0.0,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.02, max_delta_step=0, max_depth=4,
             min_child_weight=0.0,
             n_estimators=1250, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.8,
             tree_method='exact', validate_parameters=1, verbosity=0)
feature_clf = Pipeline(steps=[('feature_preprocessor', feature_preprocessor),
                      ('feature_model', feature_model)
                     ])

# Use the same seed as the baseline split (random_state=1) so this MAE is
# measured on the same validation rows as the earlier models and the numbers
# can be compared directly.
feature_X_train, feature_X_valid, feature_y_train, feature_y_valid = train_test_split(X_feat_eng, y, random_state=1)

feature_clf.fit(feature_X_train, feature_y_train, feature_model__verbose=False)

feature_preds = feature_clf.predict(feature_X_valid)

print('MAE:', mean_absolute_error(feature_y_valid, feature_preds))
# Previously recorded with a random_state=0 split (not comparable to the
# baselines): MAE 15483.647185359589 — re-run to refresh.

In [None]:
# Final model: same engineered features as the experiment above, trained on
# every labelled row (no hold-out) for the submission.
X_final = X.copy()
# Years between original construction and the last remodel.
X_final['years_since_update'] = X_final['YearRemodAdd'] - X_final['YearBuilt']
# Lot area per unit of frontage; NaN wherever LotFrontage is missing (the
# numeric imputer below fills those).
X_final['geometry'] = X_final['LotArea'] / X_final['LotFrontage']
# Combined slope/contour category.
X_final['land_topology'] = X_final['LandSlope'] + '_' + X_final['LandContour']

final_numerical_cols = [cname for cname in X_final.columns if 
                X_final[cname].dtype in ['int64', 'float64']]

# Object columns with fewer than 13 distinct values are one-hot encoded.
final_categorical_cols = [cname for cname in X_final.columns if
                    X_final[cname].nunique() < 13 and 
                    X_final[cname].dtype == "object"]


final_numerical_transformer = SimpleImputer(strategy='constant')

final_categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

final_preprocessor = ColumnTransformer(
    transformers=[
        ('num', final_numerical_transformer, final_numerical_cols),
        ('cat', final_categorical_transformer, final_categorical_cols)
])

# gpu_id=1 and monotone_constraints='1' were removed: the first pinned a
# CPU-only tree method ('exact') to a second GPU, and the second is a
# one-element constraint spec that cannot match the many encoded features.
# NOTE(review): learning_rate is 0.07 here, while the feature-engineered model
# that was validated uses 0.02 with the same n_estimators — confirm which
# value is intended for the submission.
final_model = XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.6, gamma=0.0,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.07, max_delta_step=0, max_depth=4,
             min_child_weight=0,
             n_estimators=1250, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.8,
             tree_method='exact', validate_parameters=1, verbosity=0)

final_clf = Pipeline(steps=[('final_preprocessor', final_preprocessor),
                      ('final_model', final_model)
                     ])

# `final_model__verbose` is forwarded by the pipeline to XGBRegressor.fit.
final_clf.fit(X_final, y, final_model__verbose=0)

In [None]:
# Load the unlabelled test set used for the submission.
# NOTE(review): hard-coded absolute path (with a space in the file name) —
# confirm it matches where the file actually lives in the runtime.
X_test = pd.read_csv('/content/test hp.csv')

In [None]:
# Add the same three derived columns that the final model was trained with.
X_test = X_test.assign(
    years_since_update=X_test['YearRemodAdd'] - X_test['YearBuilt'],
    geometry=X_test['LotArea'] / X_test['LotFrontage'],
    land_topology=X_test['LandSlope'] + '_' + X_test['LandContour'],
)

In [None]:
# Predict sale prices for the test rows and write the two-column submission.
preds = final_clf.predict(X_test)
submission_columns = {'Id': X_test['Id'], 'SalePrice': preds}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)

# **Ans of 4**

Ensemble modeling is a powerful way to improve the performance of your model. It usually pays off to apply ensemble learning over and above various models you might be building. Time and again, people have used ensemble models in competitions like Kaggle and benefited from it.

Ensemble learning is a broad topic, limited only by your own imagination. Here I cover just the basic concepts and ideas of ensemble modeling, which should be enough for you to start building ensembles on your own. As usual, I have tried to keep things as simple as possible.

The difference between Bagging and Boosting:

1) In Bagging the models are built independently of one another, whereas Boosting adds each new model so that it does well where the previous models fail.

2) Only Boosting assigns weights to the training data, tipping the scales in favor of the most difficult cases.

3) The final prediction is an equally weighted average of the models in Bagging and a weighted average in Boosting, where more weight goes to the models that perform better on the training data.

4) Only Boosting tries to reduce bias. Bagging, on the other hand, may reduce over-fitting, while Boosting can increase it.


