In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor

In [7]:
def apply_pca(X, standardize = True):
    """Fit a PCA on ``X`` and return the model, the scores and the loadings.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric feature frame; its column labels become the row index of
        the loadings table.
    standardize : bool, default True
        When true, every column is centred on its mean and divided by its
        standard deviation before the PCA is fitted.

    Returns
    -------
    pca
        The fitted ``PCA`` object.
    X_pca
        Whatever ``pca.fit_transform`` returns for the (optionally
        standardized) data; it has one column per component.
    loading : pandas.DataFrame
        ``pca.components_`` transposed: one row per original feature and
        one column per component, labelled ``"PC1"``, ``"PC2"``, ...
    """
    if standardize:
        scale = X.std(axis = 0)
        # A constant column has zero spread; dividing by it would turn the
        # whole column into NaN before the PCA fit. Dividing by 1 instead
        # leaves the centred column at all zeros.
        scale[scale == 0] = 1.0
        X = (X - X.mean(axis = 0)) / scale
    pca = PCA()
    X_pca = pca.fit_transform(X)
    compo_name = [f"PC{i+1}" for i in range(X_pca.shape[1])]

    loading = pd.DataFrame(
        pca.components_.T,
        columns=compo_name,
        index=X.columns,
    )
    return pca,X_pca,loading
def make_mi_score(X,y):
    """Rank the columns of ``X`` by mutual information with ``y``.

    Works on a copy of ``X``. Columns of dtype ``object`` or ``category``
    are replaced by their integer factor codes, and every column that ends
    up with an integer dtype is flagged as discrete for
    ``mutual_info_regression`` (called with ``random_state=0``).

    Returns a pandas Series indexed by column name, sorted from the
    highest score to the lowest.
    """
    encoded = X.copy()
    categorical_names = encoded.select_dtypes(["object", "category"]).columns
    for name in categorical_names:
        # factorize() yields (codes, uniques); only the codes are kept.
        encoded[name] = encoded[name].factorize()[0]

    is_discrete = [
        pd.api.types.is_integer_dtype(dtype) for dtype in encoded.dtypes
    ]
    raw_scores = mutual_info_regression(
        encoded,
        y,
        discrete_features=is_discrete,
        random_state=0,
    )
    ranked = pd.Series(raw_scores, name="MI socres", index=encoded.columns)
    return ranked.sort_values(ascending=False)

def score_dataset(X, y, model=None):
    """Return the 5-fold cross-validated RMSLE of ``model`` on ``(X, y)``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame. Columns of dtype ``category``/``object`` are
        label-encoded on an internal copy, so the caller's frame is left
        untouched.
    y : array-like
        Target values passed straight to ``cross_val_score``.
    model : estimator, optional
        Estimator to evaluate. When omitted, a new ``XGBRegressor()`` is
        created for this call rather than once at definition time.

    Returns
    -------
    float
        Square root of the mean squared log error averaged over the folds.
    """
    if model is None:
        model = XGBRegressor()

    # Encode on a copy: assigning into the argument would silently rewrite
    # the caller's categorical columns as integer codes.
    X = X.copy()
    # Label encoding for categoricals
    for colname in X.select_dtypes(["category", "object"]):
        X[colname], _ = X[colname].factorize()

    # Metric for Housing competition is RMSLE (Root Mean Squared Log Error).
    # The scorer reports the negated MSLE, hence the sign flip before the root.
    fold_scores = cross_val_score(
        model, X, y, cv=5, scoring="neg_mean_squared_log_error",
    )
    return np.sqrt(-1 * fold_scores.mean())


In [8]:
# Read the Ames housing table from the working directory and preview
# its first five rows.
df = pd.read_csv(filepath_or_buffer="./ames.csv")
df.head(n=5)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,FirstFlrSF,SecondFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,Threeseasonporch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YearSold,SaleType,SaleCondition,SalePrice
0,One_Story_1946_and_Newer_All_Styles,Residential_Low_Density,141.0,31770.0,Pave,No_Alley_Access,Slightly_Irregular,Lvl,AllPub,Corner,Gtl,North_Ames,Norm,Norm,OneFam,One_Story,Above_Average,Average,1960,1960,Hip,CompShg,BrkFace,Plywood,Stone,112.0,Typical,Typical,CBlock,Typical,Good,Gd,BLQ,2.0,Unf,0.0,441.0,1080.0,GasA,Fair,Y,SBrkr,1656.0,0.0,0.0,1656.0,1,0,1,0,3,1,Typical,7,Typ,2,Good,Attchd,Fin,2,528.0,Typical,Typical,Partial_Pavement,210.0,62.0,0.0,0.0,0.0,0.0,No_Pool,No_Fence,,0.0,5,2010,WD,Normal,215000
1,One_Story_1946_and_Newer_All_Styles,Residential_High_Density,80.0,11622.0,Pave,No_Alley_Access,Regular,Lvl,AllPub,Inside,Gtl,North_Ames,Feedr,Norm,OneFam,One_Story,Average,Above_Average,1961,1961,Gable,CompShg,VinylSd,VinylSd,,0.0,Typical,Typical,CBlock,Typical,Typical,No,Rec,6.0,LwQ,144.0,270.0,882.0,GasA,Typical,Y,SBrkr,896.0,0.0,0.0,896.0,0,0,1,0,2,1,Typical,5,Typ,0,No_Fireplace,Attchd,Unf,1,730.0,Typical,Typical,Paved,140.0,0.0,0.0,0.0,120.0,0.0,No_Pool,Minimum_Privacy,,0.0,6,2010,WD,Normal,105000
2,One_Story_1946_and_Newer_All_Styles,Residential_Low_Density,81.0,14267.0,Pave,No_Alley_Access,Slightly_Irregular,Lvl,AllPub,Corner,Gtl,North_Ames,Norm,Norm,OneFam,One_Story,Above_Average,Above_Average,1958,1958,Hip,CompShg,Wd Sdng,Wd Sdng,BrkFace,108.0,Typical,Typical,CBlock,Typical,Typical,No,ALQ,1.0,Unf,0.0,406.0,1329.0,GasA,Typical,Y,SBrkr,1329.0,0.0,0.0,1329.0,0,0,1,1,3,1,Good,6,Typ,0,No_Fireplace,Attchd,Unf,1,312.0,Typical,Typical,Paved,393.0,36.0,0.0,0.0,0.0,0.0,No_Pool,No_Fence,Gar2,12500.0,6,2010,WD,Normal,172000
3,One_Story_1946_and_Newer_All_Styles,Residential_Low_Density,93.0,11160.0,Pave,No_Alley_Access,Regular,Lvl,AllPub,Corner,Gtl,North_Ames,Norm,Norm,OneFam,One_Story,Good,Average,1968,1968,Hip,CompShg,BrkFace,BrkFace,,0.0,Good,Typical,CBlock,Typical,Typical,No,ALQ,1.0,Unf,0.0,1045.0,2110.0,GasA,Excellent,Y,SBrkr,2110.0,0.0,0.0,2110.0,1,0,2,1,3,1,Excellent,8,Typ,2,Typical,Attchd,Fin,2,522.0,Typical,Typical,Paved,0.0,0.0,0.0,0.0,0.0,0.0,No_Pool,No_Fence,,0.0,4,2010,WD,Normal,244000
4,Two_Story_1946_and_Newer,Residential_Low_Density,74.0,13830.0,Pave,No_Alley_Access,Slightly_Irregular,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,OneFam,Two_Story,Average,Average,1997,1998,Gable,CompShg,VinylSd,VinylSd,,0.0,Typical,Typical,PConc,Good,Typical,No,GLQ,3.0,Unf,0.0,137.0,928.0,GasA,Good,Y,SBrkr,928.0,701.0,0.0,1629.0,0,0,2,1,3,1,Typical,6,Typ,1,Typical,Attchd,Fin,2,482.0,Typical,Typical,Paved,212.0,34.0,0.0,0.0,0.0,0.0,No_Pool,Minimum_Privacy,,0.0,3,2010,WD,Normal,189900


In [9]:
# Four numeric columns to examine, with their linear correlation to the
# sale price.
features = ["GarageArea", "YearRemodAdd", "TotalBsmtSF", "GrLivArea"]

print(
    "Correlation with SalePrice:\n",
    df[features].corrwith(df["SalePrice"]),
    sep="\n",
)

Correlation with SalePrice:

GarageArea      0.640138
YearRemodAdd    0.532974
TotalBsmtSF     0.632529
GrLivArea       0.706780
dtype: float64


In [10]:
# Split the target off and keep only the selected feature columns.
y = df["SalePrice"].copy()
X = df[features].copy()

# `apply_pca`, defined above, reproduces the code from the tutorial
pca, X_pca, loadings = apply_pca(X)
print(loadings)

                   PC1       PC2       PC3       PC4
GarageArea    0.541229 -0.102375 -0.038470  0.833733
YearRemodAdd  0.427077  0.886612 -0.049062 -0.170639
TotalBsmtSF   0.510076 -0.360778 -0.666836 -0.406192
GrLivArea     0.514294 -0.270700  0.742592 -0.332837


In [13]:
# Target on one side, every other column on the other.
y = df["SalePrice"].copy()

# Two engineered columns appended after the originals:
# Feature1 sums living area and basement area, Feature2 multiplies the
# remodel year by basement area.
X = df.loc[:, df.columns != "SalePrice"].assign(
    Feature1=lambda frame: frame["GrLivArea"] + frame["TotalBsmtSF"],
    Feature2=lambda frame: frame["YearRemodAdd"] * frame["TotalBsmtSF"],
)

score = score_dataset(X, y)
print(f"Your score: {score:.5f} RMSLE")

Your score: 0.13792 RMSLE


In [18]:
# Mutual-information ranking of every column against the sale price.
y = df["SalePrice"].copy()
X = df.loc[:, df.columns != "SalePrice"].copy()

score = make_mi_score(X, y)
print(f" socre : {score}")

 socre : OverallQual         0.581262
Neighborhood        0.569813
GrLivArea           0.496909
YearBuilt           0.437939
GarageArea          0.415014
                      ...   
Utilities           0.000291
Threeseasonporch    0.000000
BsmtFinSF2          0.000000
MoSold              0.000000
LandSlope           0.000000
Name: MI socres, Length: 78, dtype: float64
