In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from geopy.geocoders import Nominatim
from geopy.distance import geodesic
from mlxtend.regressor import StackingCVRegressor

### Main functions

In [None]:
def create_normalize_feature(df, feature, ordered_categories):
    weight = 1/(len(ordered_categories)-1)
    for i in range(len(ordered_categories)):
        df[feature].replace(ordered_categories[i], round(weight*(i),3), 
                            inplace= True)

In [None]:
def merge_categories_by_threshold(df, label, new_col_name, threshold):
    value_counts_label = df.value_counts(df[label])
    mask = (value_counts_label / value_counts_label.sum() * 100).lt(threshold)
    new_df = df.assign(new_label = np.where(df[label].isin(
                          value_counts_label[mask].index), 'Other', df[label]))
    new_df.rename(columns={'new_label': new_col_name}, inplace=True)
    new_df.drop(columns = label, inplace=True)
    return new_df

# Preprocess

In [None]:
def garage_features(df):
    df.loc[df['GarageType'].notnull(), 'GarageType'] = 1
    df.GarageType.fillna(0, inplace=True)
    df.rename(columns={'GarageType':'Garage'}, inplace=True)

def lot_frontage_feature(df):
    df.LotFrontage.fillna(df.LotFrontage.mean(), inplace=True)

def veneer_features(df):
    df.loc[df['MasVnrType'].notnull(), 'MasVnrType'] = 1
    df.MasVnrType.fillna(0, inplace=True)
    df['MasVnrType'][df.MasVnrArea == 0] = 0
    df.rename(columns={'MasVnrType':'MasVnr'}, inplace=True)
    df.MasVnrArea.fillna(0, inplace=True)

def fill_nan_categories(df):
    df.Functional.fillna('Typ', inplace=True)
    df.SaleType.fillna('WD', inplace=True)
    df.Exterior1st.fillna('VinylSd', inplace=True)
    df.Exterior2nd.fillna('VinylSd', inplace=True)

def electrical_feature(df):
    df.Electrical.fillna('SBrkr', inplace=True)
    ordered_categories = ['Mix', 'FuseP', 'FuseF', 'FuseA', 'SBrkr']
    create_normalize_feature(df, 'Electrical', ordered_categories)

def exterior_feature(df):
    df.Exterior1st.fillna('None', inplace=True)
    df.Exterior2nd.fillna('None', inplace=True)
    df['Exterior_combined'] = df.apply(lambda x: x.Exterior1st + '_' + 
                                             x.Exterior2nd, axis=1)
    df.drop(columns=['Exterior1st', 'Exterior2nd'], inplace=True)

In [None]:
def fillna_continues_features(df, continues_features):
    for feature in continues_features:
        df.loc[np.isnan(df[feature]), feature] = 0

def encoding_ordinal_categories(df, categorial_dict):
    for feature, ordered_categories in categorial_dict.items():
        create_normalize_feature(df, feature, ordered_categories)
        df.loc[np.isnan(df[feature]), feature] = 0

def special_categories_treatment(df):
    fill_nan_categories(df)
    garage_features(df)
    lot_frontage_feature(df)
    veneer_features(df)
    electrical_feature(df)
    exterior_feature(df)

def remove_categorical_features(df, columns):
    df_columns = df.columns
    for feature in columns:
        if feature in df_columns:
            df.drop(columns = [feature], inplace=True)

def merge_categories(df, merged_features_dict, threshold):
    for feature, new_feature in merged_features_dict.items():
        df = merge_categories_by_threshold(df, feature, new_feature, threshold)
    return df

def unicode_features(df, unicode_dict):
    for feature, new_unicode_dict in unicode_dict.items():
        df[feature] = df[feature].map(new_unicode_dict) 

def create_onehot_features(df, onehot_columns):
  data = pd.get_dummies(df, prefix=onehot_columns, columns=onehot_columns)
  return data


In [None]:
continues_features = ['GarageArea', 'GarageCars', 'BsmtFinSF1', 'BsmtFinSF2', 
                      'BsmtUnfSF','TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 
                      'GarageYrBlt']

category_ordered = {'PoolQC': ['Na', 'Fa', 'TA', 'Gd', 'Ex'],
                    'Fence': ['Na', 'MnWw', 'GdWo', 'MnPrv', 'GdPrv'],
                    'FireplaceQu': ['Na', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
                    'GarageFinish': ['Na', 'Unf', 'RFn', 'Fin'],
                    'GarageQual': ['Na', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
                    'GarageCond': ['Na', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
                    'BsmtQual': ['NA','Po', 'Fa', 'TA', 'Gd', 'Ex'],
                    'BsmtCond': ['NA','Po', 'Fa', 'TA', 'Gd', 'Ex'],
                    'BsmtExposure': ['NA','No', 'Mn', 'Av', 'Gd'],
                    'BsmtFinType1': ['NA','Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'],
                    'BsmtFinType2': ['NA','Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'],
                    'HeatingQC': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
                    'KitchenQual': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
                    'ExterQual': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
                    'ExterCond': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
                    'LotShape': ['IR3', 'IR2', 'IR1', 'Reg'],
                    'LandContour': ['Low', 'HLS', 'Bnk', 'Lvl'],
                    'LandSlope': ['Sev', 'Mod', 'Gtl'],
                    'BldgType': ['Twnhs', 'TwnhsE', 'Duplex', '2fmCon', '1Fam'],
                    'PavedDrive': ['N', 'P', 'Y'],
                    'Utilities': ['ELO', 'NoSeWa', 'NoSewr', 'AllPub']}
            
merged_features_dict = {'RoofMatl': 'RoofMatl_CompShg', 
                        'Heating': 'Heating_GasA',
                        'Functional': 'Functional_typical',
                        'Foundation': 'Foundation_merged',
                        'SaleCondition': 'SaleCondition_merged',
                        'Exterior_combined': 'Exterior_combined_merged',
                        'Condition2': 'Condition2_norm'}
                                            
unicode_features_dict = {'Street': {'Grvl': 1, 'Pave': 0},
                          'CentralAir': {'Y': 1, 'N': 0},
                          'RoofMatl_CompShg': {'CompShg': 1, 'Other': 0},
                          'Heating_GasA': {'GasA': 1, 'Other': 0},
                          'Functional_typical': {'Typ': 1, 'Other': 0},
                          'Condition2_norm': {'Norm': 1, 'Other': 0}}

onehot_columns = ['SaleCondition_merged']

remove_features_list = ['LotConfig', 'Condition1', 'MoSold', 'MiscVal',
                        'SaleCondition_merged_Other']

In [None]:
def preprocessing(df, continues_features, category_ordered, remove_features_list, 
                  merged_features_dict, unicode_features_dict, onehot_columns):
    fillna_continues_features(df, continues_features)
    encoding_ordinal_categories(df, category_ordered)
    special_categories_treatment(df)
    df = merge_categories(df, merged_features_dict, 5)
    unicode_features(df, unicode_features_dict)
    data = create_onehot_features(df, onehot_columns)
    remove_categorical_features(data, remove_features_list)
    data.set_index(['Id'], inplace=True)
    return data

## Changes in preprocess

We decided to find a better way to preprocess those features:
- Neighborhood 
- HouseStyle 
- RoofStyle 
- SaleType 
- Foundation
- Exterior

On the basic model we used "onehot encoding", it's created a massive number of new features because the features above has more then 2 categories. Now we decided to use "target encoding" which replaces a categorical value with the average value of the output (ie. target) for that value of the feature.
 * On 2 features: Foundation, Exterior we decided first to merge categories that are under 5% of the data because we saw from our EDA that they are not contibute a lot.
 * On the other features we decided not to merge categories under 5% because we saw that thier correlation with Y is pretty high.

We saw that it's improve our results so we add it to our preprocess and remove the original feature.

* The only category that we decided to use as onehot feature is "SaleCondition" that was better as onehot then as avg values. we merge the catefories that are less then 5% from our data and remove 1 onehot column because to represent all the categories is inufe to use num_categories-1 of the columns.

In [None]:
def target_encoding(df, features):
  group_by_avg = {}
  df_columns = df.columns
  for feature in features:
    feature_means = df.groupby(feature)['SalePrice'].mean()
    group_by_avg[feature] = feature_means
    df[feature + "_by_avg"] = df[feature].map(feature_means)
    df.drop(columns = [feature], inplace=True)
  return group_by_avg

In [None]:
test = pd.read_csv('test.csv')
train = pd.read_csv('train.csv')

processed_train = preprocessing(train, continues_features, category_ordered, 
                                remove_features_list, merged_features_dict, 
                                unicode_features_dict, onehot_columns)

features = ['Neighborhood', 'HouseStyle', 'RoofStyle', 'SaleType', 
            'Foundation_merged', 'Exterior_combined_merged']
                    
target_encoding(processed_train, features)

y = processed_train['SalePrice']
y = np.log1p(y)
X = processed_train.copy().loc[:, processed_train.columns != 'SalePrice']

In [None]:
#example:
processed_train['RoofStyle_by_avg'].value_counts()

171483.956179    1141
218876.933566     286
194690.000000      13
148909.090909      11
180568.428571       7
225000.000000       2
Name: RoofStyle_by_avg, dtype: int64

In [None]:
#basic model score
clf = LinearRegression()
scores = cross_val_score(clf, X, y, cv=5)
print(scores)
print(np.mean(scores))

[0.89733983 0.87580673 0.90354124 0.89418906 0.78431095]
0.8710375626886726


# Feature Engineering

## New feature- distance from university

We found out thata the neighborhoods locations are around an university, so we decided to define a new feature that will represent the distance from the university because we saw it's influence on house price.

In [None]:
neighborhoods_dict = {'Blmngtn': 'Bloomington Heights, Ames, IA',
                      'Blueste': 'Bluestem, Ames, IA',
                      'BrDale': 'Briardale, Ames, IA',
                      'BrkSide': 'Brookside, Ames, IA',
                      'ClearCr': 'Clear Creek, Ames, IA',
                      'CollgCr': 'College Creek, Ames, IA',
                      'Crawfor': 'Crawford, Ames, IA',
                      'Edwards': 'Edwards, Ames, IA',
                      'Gilbert': 'Gilbert, IA',
                      'IDOTRR': 'Iowa Department of Transportation, Ames, IA',
                      'MeadowV': 'Meadow Place, Ames, IA',
                      'Mitchel': 'Mitchell, Ames, IA',
                      'NAmes': 'North Ames, Ames, IA',
                      'NoRidge': 'Northridge, Ames, IA',
                      'NPkVill': 'parkview, Ames, IA',
                      'NridgHt': 'Northridge Heights, Ames, IA',
                      'NWAmes': 'Northwest Ames, Ames, IA',
                      'OldTown': 'Old Town, Ames, IA',
                      'SWISU': 'Iowa State University, Ames, IA',
                      'Sawyer': 'Garfield Cir, Ames, IA',
                      'SawyerW': 'Illinois Ave, Ames, IA',
                      'Somerst': 'Somerset, Ames, IA',
                      'StoneBr': 'Stone Brooke, Ames, IA',
                      'Timber': 'Timberland, Ames, IA',
                      'Veenker': 'Veenker, Ames, IA'}

In [None]:

def find_geolocation(neighborhoods_dict):
  location = {}
  geolocator = Nominatim(user_agent='my_request')
  for key in neighborhoods_dict:
      loc = geolocator.geocode(neighborhoods_dict[key])
      location[key] = (loc.latitude, loc.longitude)
  return location

def find_distance_from_university():
  neighborhood_locations = find_geolocation(neighborhoods_dict)
  university = neighborhood_locations['SWISU']
  distance_from_uni = {}
  for key in neighborhood_locations:
      distance_from_uni[key] = geodesic(neighborhood_locations[key], university).km
  return distance_from_uni

In [None]:
test = pd.read_csv('test.csv')
train = pd.read_csv('train.csv')

distance_from_uni = find_distance_from_university()
train['dist_from_uni'] = train.Neighborhood.map(distance_from_uni)

processed_train = preprocessing(train, continues_features, category_ordered, 
                                remove_features_list, merged_features_dict, 
                                unicode_features_dict, onehot_columns)

target_encoding(processed_train, features)

y = processed_train['SalePrice']
y = np.log1p(y)
X = processed_train.copy().loc[:, processed_train.columns != 'SalePrice']

In [None]:
#basic model score
clf = LinearRegression()
scores = cross_val_score(clf, X, y, cv=5)
print(scores)
print(np.mean(scores))

[0.89698064 0.87699995 0.90219945 0.89508086 0.78523665]
0.8712995084305417


## Features Interaction
We want to emphasize the differences of components in houses, to do that we are using features interactions.

* **Quality * Size**- when we combine the quality with the area we might get some differences between components because the combination creates new "score" that combine the 2 parameters and helps to evaluation.

In [None]:
X_copy = X.copy()

#quality*size basement finish1
X_copy.loc[X_copy['BsmtFinSF1'] > 0, "BsmtFinSF1"] = (X_copy["BsmtFinSF1"] * 
                                                      X_copy["BsmtFinType1"])
#quality*size basement finish2 
X_copy.loc[X_copy['BsmtFinSF2'] > 0, "BsmtFinSF2"] = (X_copy["BsmtFinSF2"] *
                                                      X_copy["BsmtFinType2"])
#basement quality*size 
X_copy.loc[X_copy['TotalBsmtSF'] > 0, "TotalBsmtSF"] = (X_copy["TotalBsmtSF"] *
                                                        X_copy["BsmtCond"])

#vaneer quality*size 
X_copy.loc[X_copy['MasVnrArea'] > 0, "MasVnrArea"] = (X_copy["MasVnrArea"] *
                                                  X_copy["MasVnr"])

#Exterior quality*size
X_copy.loc[X_copy['ExterQual'] > 0, "ExterQual"] = (X_copy["ExterQual"] *
                                                   X_copy["ExterCond"])

#Garage quality*size
X_copy.loc[X_copy['GarageArea'] > 0, "GarageArea"] = (X_copy["GarageArea"] *
                                                   X_copy["GarageCond"])

#Fireplace quality*size
X_copy.loc[X_copy['Fireplaces'] > 0, "Fireplaces"] = (X_copy["Fireplaces"] *
                                                   X_copy["FireplaceQu"])

X_copy.rename(columns={'TotalBsmtSF':'BsmtSizeQual', 'ExterQual': 'Exterior'}, 
              inplace=True)

X_copy.drop(columns = ["BsmtFinType1", "BsmtFinType2", "BsmtCond",
                       "MasVnr", 'ExterCond', "FireplaceQu"], inplace=True)

* **Percent of total size**- when we show the percent of parameter from the hole we can see the bigger picture and compare propertly between diffrent records.

In [None]:
#percent finish1 basement
X_copy.loc[X_copy['BsmtSizeQual'] > 0, "BsmtFinSF1"] = (X_copy["BsmtFinSF1"]/
                                                        X_copy["BsmtSizeQual"])
#percent finish2 basement 
X_copy.loc[X_copy['BsmtSizeQual'] > 0, "BsmtFinSF2"] = (X_copy["BsmtFinSF2"]/
                                                        X_copy["BsmtSizeQual"])
#percent unfinished basement 
X_copy.loc[X_copy['BsmtSizeQual'] > 0, "BsmtUnfSF"] = (X_copy["BsmtUnfSF"]/
                                                       X_copy["BsmtSizeQual"])

#percent bedrooms above grade from all rooms above grade  
X_copy.loc[X_copy['TotRmsAbvGrd'] > 0, "BedroomAbvGr"] = (X_copy["BedroomAbvGr"]/
                                                       X_copy["TotRmsAbvGrd"])

#percent kitchens above grade from all rooms above grade  
X_copy.loc[X_copy['TotRmsAbvGrd'] > 0, "KitchenAbvGr"] = (X_copy["KitchenAbvGr"]/
                                                       X_copy["TotRmsAbvGrd"])

**Proportion by feature**- we tried to score the half bath in a diffrent way from the baths to united them to one feature that show the number of baths but more evaluation full baths then half baths 

In [None]:
X_copy["BsmtBaths"] = (X_copy["BsmtHalfBath"]*0.5) + X_copy["BsmtFullBath"]
X_copy.drop(columns = ["BsmtHalfBath", "BsmtFullBath"], inplace=True)

In [None]:
#basic model score
clf = LinearRegression()
scores = cross_val_score(clf, X_copy, y, cv=5)
print(scores)
print(np.mean(scores))

[0.8963146  0.88179761 0.90460539 0.90038839 0.79956168]
0.8765335333213002


## Normalization

### Years features

We want to normalize the data but first we need to deal with the various years categories. We do it by measuring the difference from the maximal year so we get a measure of how recent is this year. Then we can apply MinMax normalization as usual.


In [None]:
years_features = ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'YrSold']
max_year = X_copy[years_features].max()
X_copy[years_features] = max_year - X_copy[years_features]

### MinMax normalization

In [None]:
scaler = MinMaxScaler()
features_to_normalize = ['OverallQual', 'OverallCond', "LotArea", "LotFrontage",
                         'YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'YrSold', 
                         'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 
                         '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 
                         'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 
                         'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
                         'Neighborhood_by_avg', 'HouseStyle_by_avg', 
                         'BsmtSizeQual','RoofStyle_by_avg', 'SaleType_by_avg',
                         'Foundation_merged_by_avg', 'Exterior_combined_merged_by_avg']

scaler.fit(X_copy[features_to_normalize])
X_copy[features_to_normalize] = scaler.transform(X_copy[features_to_normalize])

In [None]:
#basic model score
clf = LinearRegression()
scores = cross_val_score(clf, X_copy, y, cv=5)
print(scores)
print(np.mean(scores))

[0.8963146  0.88179761 0.90460539 0.90038839 0.79956168]
0.8765335333218779


# Feature engineering A-Z

In [None]:
def find_geolocation(neighborhoods_dict):
  location = {}
  geolocator = Nominatim(user_agent='my_request')
  for key in neighborhoods_dict:
      loc = geolocator.geocode(neighborhoods_dict[key])
      location[key] = (loc.latitude, loc.longitude)
  return location

def find_distance_from_university(neighborhoods_dict):
  neighborhood_locations = find_geolocation(neighborhoods_dict)
  university = neighborhood_locations['SWISU']
  distance_from_uni = {}
  for key in neighborhood_locations:
      distance_from_uni[key] = geodesic(neighborhood_locations[key], 
                                        university).km
  return distance_from_uni

def quality_size_mult(df, features):
  for paired_features in features:
    size_feature = paired_features[0]
    quality_feature = paired_features[1]
    df.loc[df[size_feature] > 0, size_feature] = (df[size_feature] * 
                                                      df[quality_feature])
    
def size_from_total_divition(df, features):
  for paired_features in features:
    size_feature = paired_features[0]
    total_feature = paired_features[1]
    df.loc[df[total_feature] > 0, size_feature] = (df[size_feature]/
                                                        df[total_feature])
def rename_columns(df, names_dict):
  df.rename(columns= names_dict, inplace=True)

def remove_features(df, columns):
  df_columns = df.columns
  for feature in columns:
      if feature in df_columns:
          df.drop(columns = [feature], inplace=True)

def bsmt_bath(df):
  df["BsmtBaths"] = (df["BsmtHalfBath"]*0.5) + df["BsmtFullBath"]

def year_feature_by_max(df):
  years_features = ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'YrSold']
  max_year = df[years_features].max()
  df[years_features] = max_year - df[years_features]

def target_encoding(df, features):
  group_by_avg = {}
  df_columns = df.columns
  for feature in features:
    feature_means = df.groupby(feature)['SalePrice'].mean()
    group_by_avg[feature] = feature_means
    df[feature + "_by_avg"] = df[feature].map(feature_means)
    df.drop(columns = [feature], inplace=True)
  return group_by_avg

In [None]:
def feature_engineering(df, neighborhoods_dict, qual_size_pairs,
                        size_total_pairs, remove_of_feature_engineering, 
                        names_dict):
  quality_size_mult(df, qual_size_pairs)
  rename_columns(df, names_dict)
  size_from_total_divition(df, size_total_pairs)
  bsmt_bath(df)
  remove_features(df, remove_of_feature_engineering)
  year_feature_by_max(df)

In [None]:
neighborhoods_dict = {'Blmngtn': 'Bloomington Heights, Ames, IA',
                      'Blueste': 'Bluestem, Ames, IA',
                      'BrDale': 'Briardale, Ames, IA',
                      'BrkSide': 'Brookside, Ames, IA',
                      'ClearCr': 'Clear Creek, Ames, IA',
                      'CollgCr': 'College Creek, Ames, IA',
                      'Crawfor': 'Crawford, Ames, IA',
                      'Edwards': 'Edwards, Ames, IA',
                      'Gilbert': 'Gilbert, IA',
                      'IDOTRR': 'Iowa Department of Transportation, Ames, IA',
                      'MeadowV': 'Meadow Place, Ames, IA',
                      'Mitchel': 'Mitchell, Ames, IA',
                      'NAmes': 'North Ames, Ames, IA',
                      'NoRidge': 'Northridge, Ames, IA',
                      'NPkVill': 'parkview, Ames, IA',
                      'NridgHt': 'Northridge Heights, Ames, IA',
                      'NWAmes': 'Northwest Ames, Ames, IA',
                      'OldTown': 'Old Town, Ames, IA',
                      'SWISU': 'Iowa State University, Ames, IA',
                      'Sawyer': 'Garfield Cir, Ames, IA',
                      'SawyerW': 'Illinois Ave, Ames, IA',
                      'Somerst': 'Somerset, Ames, IA',
                      'StoneBr': 'Stone Brooke, Ames, IA',
                      'Timber': 'Timberland, Ames, IA',
                      'Veenker': 'Veenker, Ames, IA'}

qual_size_pairs = [('BsmtFinSF1', "BsmtFinType1"), ('BsmtFinSF2', "BsmtFinSF2"), 
                   ("TotalBsmtSF","BsmtCond"), ("MasVnrArea", "MasVnr"), 
                   ("GarageArea", "GarageCond"), ("Fireplaces", "FireplaceQu")]

size_total_pairs = [("BsmtFinSF1", "BsmtSizeQual"), 
                    ("BsmtFinSF2", "BsmtSizeQual"),
                    ("BedroomAbvGr", "TotRmsAbvGrd"), 
                    ("KitchenAbvGr", "TotRmsAbvGrd")]
                    
remove_of_feature_engineering = ["BsmtFinType1", "BsmtFinType2", "BsmtCond",
                  "MasVnr", 'ExterCond', "FireplaceQu", "BsmtHalfBath",
                  "BsmtFullBath"]

names_dict = {'TotalBsmtSF':'BsmtSizeQual', 'ExterQual': 'Exterior'}

features_to_normalize = ['OverallQual', 'OverallCond', "LotArea", "LotFrontage",
                         'YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'YrSold', 
                         'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 
                         '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 
                         'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 
                         'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
                         'Neighborhood_by_avg', 'HouseStyle_by_avg', 
                         'BsmtSizeQual','RoofStyle_by_avg', 'SaleType_by_avg',
                         'Foundation_merged_by_avg', 'Exterior_combined_merged_by_avg']

target_encoding_features = ['Neighborhood', 'HouseStyle', 'RoofStyle', 'SaleType', 
                            'Foundation_merged', 'Exterior_combined_merged']

In [None]:
#train fit
train = pd.read_csv('train.csv')

distance_from_uni = find_distance_from_university(neighborhoods_dict)
train['dist_from_uni'] = train.Neighborhood.map(distance_from_uni)

processed_train = preprocessing(train, continues_features, category_ordered, 
                                remove_features_list, merged_features_dict, 
                                unicode_features_dict, onehot_columns)

target_encoding_dict = target_encoding(processed_train, target_encoding_features)

feature_engineering(processed_train, neighborhoods_dict, 
                                       qual_size_pairs, size_total_pairs, 
                                       remove_of_feature_engineering, names_dict)

y_train = processed_train['SalePrice']
y_train = np.log1p(y_train)
X_train = processed_train.copy().loc[:, processed_train.columns != 'SalePrice']

scaler = MinMaxScaler()
scaler.fit(X_train[features_to_normalize])
X_train[features_to_normalize] = scaler.transform(X_train[features_to_normalize])

In [None]:
#basic model score
clf = LinearRegression()
scores = cross_val_score(clf, X_train, y_train, cv=5)
print(scores)
print(np.mean(scores))

[0.89888269 0.88190907 0.90615813 0.89892889 0.80120257]
0.8774162697059292


# Models evaluation

We check some models long the way. To find the best hyper parameters we used gridSearchCV. The models we tested:
- Ridge regression
- XGboost
- Random forest
- Gradient boost

In [None]:
clf = Ridge(random_state=0)
scores = cross_val_score(clf, X_train, y_train, cv=5)
print(scores)
print(np.mean(scores))

[0.90187645 0.8843094  0.90340694 0.89791707 0.80950845]
0.8794036631740383


In [None]:

clf = RandomForestRegressor(random_state=0)
scores = cross_val_score(clf, X_train, y_train, cv=5)
print(scores)
print(np.mean(scores))

[0.88315445 0.87105533 0.88000267 0.8803987  0.8536289 ]
0.8736480091522525


In [None]:

clf = XGBRegressor(objective ='reg:squarederror', random_state=0)
scores = cross_val_score(clf, X_train, y_train, cv=5)
print(scores)
print(np.mean(scores))

[0.91534426 0.89607287 0.8981973  0.9011251  0.884733  ]
0.8990945051697059


In [None]:

clf = GradientBoostingRegressor(random_state=0)
scores = cross_val_score(clf, X_train, y_train, cv=5)
print(scores)
print(np.mean(scores))

[0.91303509 0.88609917 0.89770736 0.9067837  0.88757866]
0.8982407957581428


# Improve models
We saw that the gradient boosting and the XGBoost are our best models and close with thier scores, so we decided to try to improve them both.

In [None]:
#train fit
train = pd.read_csv('train.csv')

distance_from_uni = find_distance_from_university(neighborhoods_dict)
train['dist_from_uni'] = train.Neighborhood.map(distance_from_uni)

processed_train = preprocessing(train, continues_features, category_ordered, 
                                remove_features_list, merged_features_dict, 
                                unicode_features_dict, onehot_columns)

target_encoding_dict = target_encoding(processed_train, target_encoding_features)

feature_engineering(processed_train, neighborhoods_dict, 
                                       qual_size_pairs, size_total_pairs, 
                                       remove_of_feature_engineering, names_dict)

y_train = processed_train['SalePrice']
y_train = np.log1p(y_train)
X_train = processed_train.copy().loc[:, processed_train.columns != 'SalePrice']

scaler = MinMaxScaler()
scaler.fit(X_train[features_to_normalize])
X_train[features_to_normalize] = scaler.transform(X_train[features_to_normalize])

In [None]:
#test fit
test = pd.read_csv('test.csv')

test['dist_from_uni'] = test.Neighborhood.map(distance_from_uni)

X_test = preprocessing(test, continues_features, category_ordered, 
                                remove_features_list, merged_features_dict, 
                                unicode_features_dict, onehot_columns)

for feature, feature_values in target_encoding_dict.items():
    X_test[feature + "_by_avg"] = X_test[feature].map(feature_values)
    X_test.drop(columns = [feature], inplace=True)

feature_engineering(X_test, neighborhoods_dict, qual_size_pairs, 
                    size_total_pairs, remove_of_feature_engineering, names_dict)

X_test[features_to_normalize] = scaler.transform(X_test[features_to_normalize])

### Multiply new features 
- We noticed that there are some feature engineering that contribute to those models specific so we decided to add them and normalize:
 
 $Lot = LotArea \cdot LotFrontage$

 $Pool = PoolArea \cdot PoolQC$

 $Overall = OverallQual \cdot OverallCond$

In [None]:
def multiply_new_features(df):
  df["Lot"] = (df["LotArea"] * df["LotFrontage"])
  df["Pool"] = (df["PoolArea"] * df["PoolQC"])
  df["Overall"] = (df["OverallQual"] * df["OverallCond"])
  df.drop(columns = ["LotArea", "LotFrontage", "PoolArea", "PoolQC", 
                        "OverallQual", "OverallCond"], inplace=True)

In [None]:
multiply_new_features(X_train)

scaler2 = MinMaxScaler()
normalize_list = ["Overall", "Lot", "Pool"]
scaler2.fit(X_train[normalize_list])
X_train[normalize_list] = scaler2.transform(X_train[normalize_list])

multiply_new_features(X_test)
X_test[normalize_list] = scaler2.transform(X_test[normalize_list])

### Drop more features
We decided to drop Utilities feature because it's gives no added information about the test data as all the test data has the same utility.

In [None]:
X_train.drop(columns=['Utilities', 'MSSubClass'], inplace=True)
X_test.drop(columns=[ 'Utilities', 'MSSubClass'], inplace=True)

# GridSearch - Note: This takes a long time to run!

In order to tune the hyperparameters of the models we chose, we decided to go with sklearn's GridSearchCV.
Of course, there are a lot of hyperparameters but we decided to focus on some of them:

* max_depth
* n_estimators
* learning_rate
* colsample_bytree / max_features
* subsample

We decided to first do a rough grid search and then fine-tune the search near the best values.

In [None]:
params_xgb = {'max_depth': [3,6,10],
                   'n_estimators': [100, 500, 1000],
                   'learning_rate': [0.01, 0.05, 0.1],
                   'colsample_bytree': [0.3, 0.7, 1.0],
              'subsample': [0.5, 0.7, 1.0]}

params_gb = {'max_depth': [3,6,10],
                   'n_estimators': [100, 500, 1000],
                   'learning_rate': [0.01, 0.05, 0.1],
                   'max_features': [0.3, 0.7, 1.0],
             'subsample': [0.5, 0.7, 1.0]}


In [None]:
xgb = XGBRegressor(objective ='reg:squarederror', random_state=0)
gb = GradientBoostingRegressor(random_state=0)

clf_gb = GridSearchCV(estimator=gb, param_grid=params_gb)
clf_xgb = GridSearchCV(estimator=xgb, param_grid=params_xgb, scoring='neg_mean_squared_error')

clf_xgb.fit(X_train, y_train)
clf_gb.fit(X_train, y_train)

print('Best parameters for XGBoost: ', clf_xgb.best_params_)
print('Best parameters for GradientBoosting: ', clf_gb.best_params_)

Best parameters for XGBoost:  {'colsample_bytree': 0.3, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 500, 'subsample': 0.7}
Best parameters for GradientBoosting:  {'learning_rate': 0.01, 'max_depth': 6, 'max_features': 0.3, 'n_estimators': 1000, 'subsample': 0.5}


## Fine-tuning the parameters

In [None]:
params_xgb = {'max_depth': [2,3,4],
                   'n_estimators': [400, 500, 600],
                   'learning_rate': [0.04, 0.05, 0.6],
                   'colsample_bytree': [0.2, 0.3, 0.4],
              'subsample': [0.6, 0.7, 0.8]}

params_gb = {'max_depth': [5,6,7],
                   'n_estimators': [900, 1000, 1100],
                   'learning_rate': [0.05, 0.1, 0.15],
                   'max_features': [0.2, 0.3, 0.4],
             'subsample': [0.4, 0.5, 0.6]}


In [None]:
xgb = XGBRegressor(objective ='reg:squarederror', random_state=0)
gb = GradientBoostingRegressor(random_state=0)

clf_gb = GridSearchCV(estimator=gb, param_grid=params_gb)
clf_xgb = GridSearchCV(estimator=xgb, param_grid=params_xgb, scoring='neg_mean_squared_error')

clf_xgb.fit(X_train, y_train)
clf_gb.fit(X_train, y_train)

print('Best parameters for XGBoost: ', clf_xgb.best_params_)
print('Best parameters for GradientBoosting: ', clf_gb.best_params_)

Best parameters for XGBoost:  {'colsample_bytree': 0.3, 'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 500, 'subsample': 0.6}
Best parameters for GradientBoosting:  {'learning_rate': 0.05, 'max_depth': 5, 'max_features': 0.3, 'n_estimators': 900, 'subsample': 0.6}


So these are the final hyperparameters for our models

Best parameters for XGBoost:  {'colsample_bytree': 0.3, 'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 500, 'subsample': 0.6}


Best parameters for GradientBoosting:  {'learning_rate': 0.05, 'max_depth': 5, 'max_features': 0.3, 'n_estimators': 900, 'subsample': 0.6}

In [None]:
clf = XGBRegressor(objective ='reg:squarederror', random_state=0, 
                   colsample_bytree= 0.3, learning_rate= 0.05, 
                   max_depth= 4, n_estimators= 500, subsample= 0.6)
scores = cross_val_score(clf, X_train, y_train, cv=5)
print(scores)
print(np.mean(scores))

[0.91939999 0.89756753 0.91275871 0.92082549 0.91456167]
0.9130226764471491


In [None]:
clf = GradientBoostingRegressor(random_state=0, learning_rate= 0.05, 
                                max_depth= 5, max_features= 0.3, 
                                n_estimators= 900, subsample= 0.6)
scores = cross_val_score(clf, X_train, y_train, cv=5)
print(scores)
print(np.mean(scores))

[0.9223826  0.90103631 0.90207787 0.92086175 0.91563298]
0.9123983009838776


# Final model predictions- stack model

After the grid we saw that we get almost the same results with XGBoost and Gradient Boosting. We decided to use our models with stack ensemble method, we did a basic fine-tuning to each model which is not our main model, and used our results from the grid search. 

The models for stacking:

In [40]:
from sklearn.ensemble import StackingRegressor

estimators = [('lr', LinearRegression(n_jobs = -1)),
              ('rd', Ridge(random_state=0, alpha = 4.84)),
              ('rf', RandomForestRegressor(random_state=0)),
              ('gb', GradientBoostingRegressor(random_state=0, learning_rate= 0.05, 
                               max_depth= 5, max_features= 0.3,
                               n_estimators= 900, subsample= 0.6)),
             ('xgb', XGBRegressor(objective ='reg:squarederror', random_state=0, 
                   colsample_bytree= 0.3, learning_rate= 0.05, 
                   max_depth= 4, n_estimators= 500, subsample= 0.6))]

model = StackingRegressor(estimators=estimators, final_estimator=Ridge(random_state=0, alpha = 4.84), cv=5)
model.fit(X_train, y_train)
score = model.score(X_train, y_train)
print(score)

0.9875979387935075


The score we got shows us that we are overfitting so we decided to remove:

- Random forest- because we learned that it's tend to overfitting
- XGBoost- because we suspec that with the gradient boosting they contribute to the overfitting of our model so we decided to stay with one of them
- On gradient boost we used the default params to avoid from overfitting.

In [41]:
estimators = [('lr', LinearRegression(n_jobs = -1)),
              ('rd', Ridge(random_state=0, alpha = 4.84)),
              ('gb', GradientBoostingRegressor(random_state=0))]

model = StackingRegressor(estimators=estimators, final_estimator=Ridge(random_state=0, alpha = 4.84), cv=5)
model.fit(X_train, y_train)
score = model.score(X_train, y_train)
print(score)

0.9435115209382899


In [42]:
y_pred = model.predict(X_test)
y_pred_final = np.expm1(y_pred)
submission = pd.DataFrame({'Id':test.Id, 'Predicted': y_pred_final})
submission.to_csv('submission.csv', index=False)