# Exercise

In [7]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostRegressor
from sklearn.model_selection import cross_val_score

# Set Matplotlib defaults
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)

default_params = {
    'iterations': 1000, 
    'learning_rate': 0.05, 
    'depth': 6, 
    'loss_function': 'RMSE', 
    'verbose': 100, 
    'random_seed': 42,
    'early_stopping_rounds': 100
}

In [8]:
# Load data
df = pd.read_csv("data/ames.csv")

In [25]:
def score_dataset(X, y, model=CatBoostRegressor(**default_params)):
    # Label encoding for categoricals
    for colname in X.select_dtypes(["category", "object"]):
        X[colname], _ = X[colname].factorize()
    # Metric for Housing competition is RMSLE (Root Mean Squared Log Error)
    score = cross_val_score(
        model, X, y, cv=5, scoring="neg_mean_squared_log_error",
    )
    score = -1 * score.mean()
    return score

In [10]:
# Prepare data
X = df.copy()
y = X.pop("SalePrice")

In [11]:
X.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YearSold,SaleType,SaleCondition
0,One_Story_1946_and_Newer_All_Styles,Residential_Low_Density,141.0,31770.0,Pave,No_Alley_Access,Slightly_Irregular,Lvl,AllPub,Corner,...,0.0,0.0,No_Pool,No_Fence,,0.0,5,2010,WD,Normal
1,One_Story_1946_and_Newer_All_Styles,Residential_High_Density,80.0,11622.0,Pave,No_Alley_Access,Regular,Lvl,AllPub,Inside,...,120.0,0.0,No_Pool,Minimum_Privacy,,0.0,6,2010,WD,Normal
2,One_Story_1946_and_Newer_All_Styles,Residential_Low_Density,81.0,14267.0,Pave,No_Alley_Access,Slightly_Irregular,Lvl,AllPub,Corner,...,0.0,0.0,No_Pool,No_Fence,Gar2,12500.0,6,2010,WD,Normal
3,One_Story_1946_and_Newer_All_Styles,Residential_Low_Density,93.0,11160.0,Pave,No_Alley_Access,Regular,Lvl,AllPub,Corner,...,0.0,0.0,No_Pool,No_Fence,,0.0,4,2010,WD,Normal
4,Two_Story_1946_and_Newer,Residential_Low_Density,74.0,13830.0,Pave,No_Alley_Access,Slightly_Irregular,Lvl,AllPub,Inside,...,0.0,0.0,No_Pool,Minimum_Privacy,,0.0,3,2010,WD,Normal


Let's start with a few mathematical combinations. We'll focus on features describing areas -- having the same units (square-feet) makes it easy to combine them in sensible ways. Since we're using XGBoost (a tree-based model), we'll focus on ratios and sums.

1) Create Mathematical Transforms
Create the following features:

- LivLotRatio: the ratio of GrLivArea to LotArea
- Spaciousness: the sum of FirstFlrSF and SecondFlrSF divided by TotRmsAbvGrd
- TotalOutsideSF: the sum of WoodDeckSF, OpenPorchSF, EnclosedPorch, Threeseasonporch, and ScreenPorch

In [13]:
X_1 = pd.DataFrame()  # dataframe to hold new features

X_1["LivLotRatio"] = X["GrLivArea"] / X["LotArea"]
X_1["Spaciousness"] = (X['FirstFlrSF'] + X['SecondFlrSF']) / X['TotRmsAbvGrd']
X_1["TotalOutsideSF"] = X["WoodDeckSF"] + X["OpenPorchSF"] + X["EnclosedPorch"] + X["Threeseasonporch"] + X["ScreenPorch"]

If you've discovered an interaction effect between a numeric feature and a categorical feature, you might want to model it explicitly using a one-hot encoding, like so:

##### One-hot encode Categorical feature, adding a column prefix "Cat"
X_new = pd.get_dummies(df.Categorical, prefix="Cat")

##### Multiply row-by-row
X_new = X_new.mul(df.Continuous, axis=0)

##### Join the new features to the feature set
X = X.join(X_new)


2) Interaction with a Categorical
We discovered an interaction between BldgType and GrLivArea in Exercise 2. Now create their interaction features.


In [14]:
# One-hot encode BldgType. Use `prefix="Bldg"` in `get_dummies`
X_2 = pd.get_dummies(X["BldgType"], prefix="Bldg")
# Multiply
X_2 = X_2.mul(X["GrLivArea"], axis=0)


3) Count Feature
Let's try creating a feature that describes how many kinds of outdoor areas a dwelling has. Create a feature PorchTypes that counts how many of the following are greater than 0.0:

WoodDeckSF
OpenPorchSF
EnclosedPorch
Threeseasonporch
ScreenPorch

In [16]:
X_3 = pd.DataFrame()
X_3["PorchTypes"] = X[["WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "Threeseasonporch", "ScreenPorch"]].gt(0).sum(axis=1)
X_3.head()

Unnamed: 0,PorchTypes
0,2
1,2
2,2
3,0
4,2


4) Break Down a Categorical Feature
MSSubClass describes the type of a dwelling:

In [17]:
df.MSSubClass.unique()

array(['One_Story_1946_and_Newer_All_Styles', 'Two_Story_1946_and_Newer',
       'One_Story_PUD_1946_and_Newer',
       'One_and_Half_Story_Finished_All_Ages', 'Split_Foyer',
       'Two_Story_PUD_1946_and_Newer', 'Split_or_Multilevel',
       'One_Story_1945_and_Older', 'Duplex_All_Styles_and_Ages',
       'Two_Family_conversion_All_Styles_and_Ages',
       'One_and_Half_Story_Unfinished_All_Ages',
       'Two_Story_1945_and_Older', 'Two_and_Half_Story_All_Ages',
       'One_Story_with_Finished_Attic_All_Ages',
       'PUD_Multilevel_Split_Level_Foyer',
       'One_and_Half_Story_PUD_All_Ages'], dtype=object)

In [20]:
# You can see that there is a more general categorization described (roughly) by the first word of each category. Create a feature containing only these first words by splitting MSSubClass at the first underscore _. (Hint: In the split method use an argument n=1.)

X_4 = pd.DataFrame()
X_4["MSClass"] = X["MSSubClass"].str.split("_", n=1).str[0]
X_4.head()

Unnamed: 0,MSClass
0,One
1,One
2,One
3,One
4,Two


5) Use a Grouped Transform
The value of a home often depends on how it compares to typical homes in its neighborhood. Create a feature MedNhbdArea that describes the median of GrLivArea grouped on Neighborhood.

In [23]:
X_5 = pd.DataFrame()

X_5["MedNhbdArea"] = X.groupby("Neighborhood")["GrLivArea"].transform("median")
X_5.head()

Unnamed: 0,MedNhbdArea
0,1200.0
1,1200.0
2,1200.0
3,1200.0
4,1560.0


In [26]:
X_new = X.join([X_1, X_2, X_3, X_4, X_5])
score_dataset(X_new, y)

0:	learn: 74994.0385704	total: 2.82ms	remaining: 2.82s
100:	learn: 19732.7030204	total: 125ms	remaining: 1.11s
200:	learn: 15683.3959519	total: 243ms	remaining: 967ms
300:	learn: 13066.7687051	total: 364ms	remaining: 846ms
400:	learn: 11447.5085376	total: 502ms	remaining: 750ms
500:	learn: 10075.7080447	total: 791ms	remaining: 788ms
600:	learn: 8939.7215065	total: 1.04s	remaining: 693ms
700:	learn: 8066.4296868	total: 1.18s	remaining: 505ms
800:	learn: 7304.0864564	total: 1.46s	remaining: 362ms
900:	learn: 6680.0261657	total: 1.66s	remaining: 182ms
999:	learn: 6133.5002918	total: 1.78s	remaining: 0us
0:	learn: 78362.8880376	total: 1.53ms	remaining: 1.53s
100:	learn: 20399.3936616	total: 175ms	remaining: 1.56s
200:	learn: 16335.7215275	total: 302ms	remaining: 1.2s
300:	learn: 13839.4206924	total: 451ms	remaining: 1.05s
400:	learn: 12099.7829495	total: 773ms	remaining: 1.15s
500:	learn: 10746.6217367	total: 980ms	remaining: 976ms
600:	learn: 9656.2226838	total: 1.12s	remaining: 746ms
700

0.015127231910563893