# <u>**Kaggle Competitions: House Price - Feature Engineering**</u>

In [1]:
def feature_engineering(df):
    """Fill domain-specific missing values and add derived features.

    The column names used here are those of the Kaggle House Prices
    dataset named in this notebook's title.

    The frame is modified in place (columns are overwritten / added) and
    the same object is returned for convenience.

    Args:
        df: pandas DataFrame holding the raw house-price columns
            referenced below (e.g. 'Alley', 'GarageYrBlt', 'MoSold').

    Returns:
        The same DataFrame, with missing categorical values replaced by
        explicit "absent" labels and the engineered columns appended.

    Raises:
        KeyError: if any of the required columns is missing from ``df``.
    """
    # Domain-based replacement: for these categorical columns a missing
    # value is treated as "the house does not have this feature", so it
    # becomes its own explicit category instead of staying NaN.
    absent_labels = {
        'Alley': 'NoAlley',
        'FireplaceQu': 'NoFirePlace',
        'GarageType': 'NoGarage',
        'GarageFinish': 'NoGarage',
        'GarageQual': 'NoGarage',
        'GarageCond': 'NoGarage',
        'PoolQC': 'NoPool',
        'Fence': 'NoFence',
        'MiscFeature': 'NoFeature',
    }
    for column, label in absent_labels.items():
        df[column] = df[column].fillna(label)

    # GarageYrBlt: if there is no garage, fall back to the year the house
    # itself was built.
    df['GarageYrBlt'] = df['GarageYrBlt'].fillna(df['YearBuilt'])

    # Style_Type: BldgType (type of dwelling) combined with HouseStyle.
    # Per the original author's note, each shows little statistical
    # significance on its own, which motivated combining them.
    df['Style_Type'] = df['BldgType'] + '_' + df['HouseStyle']

    # Roof_Style_Mat: both columns describe the roof, so they are joined
    # into one feature covering style and material.
    df['Roof_Style_Mat'] = df['RoofStyle'] + '_' + df['RoofMatl']

    # is_MasVnr: 1 when the house has any masonry veneer area.
    # Uses `> 0` (like the other presence flags below) so that a missing
    # area is NOT counted as having veneer; `!= 0` is True for NaN.
    df['is_MasVnr'] = (df['MasVnrArea'] > 0).astype(int)

    # Year_Month_Sold: per-row "<YrSold>-<MoSold>" label.
    # The conversion must be element-wise (`astype(str)`); calling the
    # builtin `str()` on a Series yields the repr of the whole column.
    df['Year_Month_Sold'] = (
        df['YrSold'].astype(str) + '-' + df['MoSold'].astype(str)
    )

    # Season_Sold: month of sale bucketed into the four seasons.
    season_by_month = {
        12: 'winter', 1: 'winter', 2: 'winter',
        3: 'spring', 4: 'spring', 5: 'spring',
        6: 'summer', 7: 'summer', 8: 'summer',
        9: 'fall', 10: 'fall', 11: 'fall',
    }
    # Any value outside the table (including NaN) falls back to 'fall',
    # matching the catch-all branch of the previous row-wise version.
    df['Season_Sold'] = df['MoSold'].map(season_by_month).fillna('fall')

    # TotalFlrSF: first-floor plus second-floor square footage.
    df['TotalFlrSF'] = df['1stFlrSF'] + df['2ndFlrSF']

    # Total_Porch_Area: sum of all four porch area columns.
    df['Total_Porch_Area'] = (
        df['OpenPorchSF'] + df['EnclosedPorch']
        + df['3SsnPorch'] + df['ScreenPorch']
    )

    # Year_Avg: midpoint between construction year and remodel year.
    df['Year_Avg'] = (df['YearBuilt'] + df['YearRemodAdd']) / 2

    # ExteriorSame: 1 when only one exterior material is recorded.
    df['ExteriorSame'] = (df['Exterior1st'] == df['Exterior2nd']).astype(int)

    # Presence flags: 1 when the corresponding area column is positive.
    area_flags = {
        'is_LowQualFinSF': 'LowQualFinSF',
        'Has_Open_Porch': 'OpenPorchSF',
        'Has_Wooden_Deck': 'WoodDeckSF',
        'Has_Enclose_Porch': 'EnclosedPorch',
        'Has_3Sn_Porch': '3SsnPorch',
        'Has_Screen_Porch': 'ScreenPorch',
        'Has_Pool': 'PoolArea',
    }
    for flag, area_column in area_flags.items():
        df[flag] = (df[area_column] > 0).astype(int)

    return df