# Housing Feature Selection

### Load and Review Raw Data

In [1]:
# Load Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load data sets
dfTrain = pd.read_csv("data/train.csv")
dfTest = pd.read_csv("data/test.csv")

# Get # of rows and columns of data set
print("Shape of training data set:", dfTrain.shape)
print("Shape of testing data set: ", dfTest.shape)

Shape of training data set: (1460, 81)
Shape of testing data set:  (1459, 80)


In [3]:
# Combine data sets for Data Analysis
dfBoth = pd.concat([dfTrain, dfTest], keys=['train', 'test'], names=['dataSet', 'index'])
print("Shape of combined data set:", dfBoth.shape)

Shape of combined data set: (2919, 81)


In [4]:
def fixNullValues(df):
    # Replace NA with 'None' or '0' where applicable based on variable
    nullIsNone = ["Alley", "BsmtQual", "BsmtCond", "BsmtExposure",  "PoolQC",
                  "BsmtFinType1", "BsmtFinType2", "FireplaceQu", "GarageType",
                  "GarageFinish", "GarageQual", "GarageCond", "Fence", 
                  "MasVnrType", "MiscFeature"]
    for i in nullIsNone:
        df[i].fillna('None', inplace=True)
    
    nullIsZero = ["GarageArea", "GarageCars", "GarageYrBlt", "TotalBsmtSF", 
                  "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "BsmtHalfBath", 
                  "BsmtFullBath", "MasVnrArea", "SalePrice"]
    for i in nullIsZero:
        df[i].fillna(0, inplace=True)
        
    # Impute any remaining numeric Null values with the mean
    df.fillna(df.mean(numeric_only=True), inplace=True)
    
    # Impute any remaining Null values with the mode
    for column in df.columns:
        df[column].fillna(df[column].mode()[0], inplace=True)

In [5]:
# Clean data frame of null values
fixNullValues(dfBoth)

In [6]:
# Get a sorted list of the numbers of Null values
missingVals = dfBoth.isnull().sum()
missingVals = missingVals[missingVals > 0]
missingVals.sort_values()

Series([], dtype: int64)

In [7]:
# Create a list of the categorical variables
catColList = dfBoth.select_dtypes(include=['object']).columns.tolist()

### Encode Ordinal Categorical Variables

In [8]:
# Load libraries
from sklearn.preprocessing import OrdinalEncoder

In [9]:
# Set up lists of variables and order of their categories
lotRegVar = ['LotShape']
lotRegCat = ['None', 'Reg', 'IR1', 'IR2', 'IR3']

slopeVar = ['LandSlope']
slopeCat = ['None', 'Gtl', 'Mod', 'Sev']

qualVar = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC', 
           'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC']
qualCat = ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex']

bsmExpVar = ['BsmtExposure']
bsmExpCat = ['None', 'No', 'Mn', 'Av', 'Gd']

bsmFinVar = ['BsmtFinType1', 'BsmtFinType2']
bsmFinCat = ['None', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ']

funcVar = ['Functional']
funcCat = ['None', 'Sal', 'Sev', 'Maj2', 'Maj1', 'Mod', 'Min2', 'Min1', 'Typ']

garFinVar = ['GarageFinish']
garFinCat = ['None', 'Unf', 'RFn', 'Fin']

pavDrvVar = ['PavedDrive']
pavDrvCat = ['None', 'N', 'P', 'Y']

In [10]:
# Encode Lot Irregularity
lotRegEnc = OrdinalEncoder(categories=[lotRegCat])
for i in lotRegVar:
    dfBoth[i] = lotRegEnc.fit_transform(dfBoth[[i]])

# Encode Slope of Lot
slopeEnc = OrdinalEncoder(categories=[slopeCat])
for i in slopeVar:
    dfBoth[i] = slopeEnc.fit_transform(dfBoth[[i]])

# Encode Quality
qualEnc = OrdinalEncoder(categories=[qualCat])
for i in qualVar:
    dfBoth[i] = qualEnc.fit_transform(dfBoth[[i]])

# Encode Basement Exposure
bsmExpEnc = OrdinalEncoder(categories=[bsmExpCat])
for i in bsmExpVar:
    dfBoth[i] = bsmExpEnc.fit_transform(dfBoth[[i]])

# Encode Basement Finish
bsmFinEnc = OrdinalEncoder(categories=[bsmFinCat])
for i in bsmFinVar:
    dfBoth[i] = bsmFinEnc.fit_transform(dfBoth[[i]])

# Encode Home Functionality
funcEnc = OrdinalEncoder(categories=[funcCat])
for i in funcVar:
    dfBoth[i] = funcEnc.fit_transform(dfBoth[[i]])

# Encode Garage Finish
garFinEnc = OrdinalEncoder(categories=[garFinCat])
for i in garFinVar:
    dfBoth[i] = garFinEnc.fit_transform(dfBoth[[i]])

# Encode Driveway Surface
pavDrvEnc = OrdinalEncoder(categories=[pavDrvCat])
for i in pavDrvVar:
    dfBoth[i] = pavDrvEnc.fit_transform(dfBoth[[i]])

In [11]:
## removing test set from data since it does not contain the saleprice and we need to look at categorical values
dfTrainC = dfBoth.loc['train']
# dfTrainC.to_csv('data/dfTrainC.csv',index=False)

### Create DF with Dummy Variables

In [12]:
def getDummies(df):
    '''Creates dummy variables from categorical variables, adds them
    to the dataframe and returns the new dataframe with dummy vars'''
    dummyDF = df.copy()
    catList = dummyDF.select_dtypes(include=['object']).columns.tolist()
    for i in catList:
        dummies = pd.get_dummies(dummyDF[i], prefix=i)
        dummyDF = pd.concat([dummyDF, dummies], axis=1)
    return dummyDF

In [13]:
# Add dummy variable columns to dataframes
dfBoth_dummies = getDummies(dfBoth)
dfTrainC_dummies = getDummies(dfTrainC)

### Data Analysis

In [14]:
# Separate data so as to only analyze training data, which contains SalePrice
dfAnalysis = dfBoth_dummies.loc['train'].copy()
dfAnalysis['SalePrice'] = pd.to_numeric(dfAnalysis['SalePrice'])
# Save file
# dfAnalysis.to_csv('data/dfAnalysis.csv',index=False)

In [15]:
# Move 'SalePrice' column to front
salePrice = dfAnalysis['SalePrice']
dfAnalysis.drop(labels=['SalePrice'], axis=1, inplace=True)
dfAnalysis.insert(0, 'SalePrice', salePrice)

#### Get a list of best variables using Feature Selection

In [16]:
# Load libraries
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

Split data for feature selection:

In [17]:
dfAnalysis.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 0 to 1459
Columns: 257 entries, SalePrice to SaleCondition_Partial
dtypes: float64(30), int64(26), object(25), uint8(176)
memory usage: 1.2+ MB


In [18]:
numDF = dfAnalysis.select_dtypes(exclude=['object'])

In [19]:
numDF.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 0 to 1459
Columns: 232 entries, SalePrice to SaleCondition_Partial
dtypes: float64(30), int64(26), uint8(176)
memory usage: 901.1 KB


In [20]:
# Set up features target sets
X = numDF[numDF.columns[1:]]
y = numDF.SalePrice

# Encode the target variables
le = LabelEncoder()
y = le.fit_transform(y)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size =0.3, 
                                                    random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train) 
X_test  = scaler.transform(X_test)

#### Feature Selection

In [21]:
%%time
# Feature selection using F-statistic
from sklearn.feature_selection import f_classif
f = f_classif(X_train, y_train)[0]

Wall time: 305 ms


  f = msb / msw
  f = msb / msw


In [22]:
%%time
# Feature selection using mutual information
from sklearn.feature_selection import mutual_info_classif
mi = mutual_info_classif(X_train, y_train)

Wall time: 18.1 s


In [23]:
%%time
# Feature selection using logistic regression
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(max_iter=500).fit(X_train, y_train)

Wall time: 15.9 s


In [24]:
%%time
# Feature selection using light gbm
from lightgbm import LGBMClassifier
lgbm = LGBMClassifier(
    objective = 'multiclass',
    metric = 'multi_logloss',
    importance_type = 'gain'
).fit(X_train, y_train)

Wall time: 35 s


Store Results

In [25]:
%%time
# Create DF to store feature ranking info
ranking = pd.DataFrame(index = range(X_train.shape[1]))

# Store ranking info for each feature from each method
ranking['feat'] = X.columns
ranking['f'] = pd.Series(f, index = ranking.index).fillna(0).rank(ascending = False)
ranking['mi'] = pd.Series(mi, index = ranking.index).fillna(0).rank(ascending = False)
ranking['logreg'] = pd.Series(np.abs(logreg.coef_).mean(axis = 0), index = ranking.index).rank(ascending = False)
ranking['lightgbm'] = pd.Series(lgbm.feature_importances_, index = ranking.index).rank(ascending = False)
# ranking['mrmr'] = pd.Series(list(range(1, len(mrmr) + 1)) + [X_train.shape[1]] * (X_train.shape[1] - len(mrmr)), index = mrmr + list(set(ranking.index) - set(mrmr))).sort_index()

# ranking = ranking.replace(to_replace = ranking.max(), value = X_train.shape[1]) # Not sure why this was in here
ranking.to_csv('data/ranking.csv', index = False)

Wall time: 33.9 ms


In [26]:
ranking.sort_values('logreg')

Unnamed: 0,feat,f,mi,logreg,lightgbm
0,Id,137.0,229.0,1.0,1.0
54,YrSold,96.0,214.0,2.0,21.0
53,MoSold,121.0,224.0,3.0,14.0
4,LotShape,48.0,192.0,4.0,40.0
16,BsmtFinType1,62.0,211.0,5.0,28.0
...,...,...,...,...,...
197,Electrical_Mix,229.0,229.0,229.0,186.0
113,Condition2_PosA,229.0,229.0,229.0,186.0
152,Exterior1st_ImStucc,229.0,229.0,229.0,186.0
117,Condition2_RRNn,229.0,225.0,229.0,186.0


Compare

In [27]:
# Sum the rankings across methods
ranking['total'] = ranking.sum(axis=1)
# Sort by sum to get an overall idea of the most/least useful features
ranking.sort_values('total')

  ranking['total'] = ranking.sum(axis=1)


Unnamed: 0,feat,f,mi,logreg,lightgbm,total
98,Neighborhood_StoneBr,30.0,18.0,120.0,27.0,195.0
66,LandContour_HLS,23.0,13.0,112.0,50.0,198.0
91,Neighborhood_NoRidge,9.0,73.0,101.0,41.0,224.0
222,SaleType_New,13.0,55.0,63.0,95.0,226.0
92,Neighborhood_NridgHt,10.0,90.0,67.0,63.0,230.0
...,...,...,...,...,...,...
117,Condition2_RRNn,229.0,225.0,229.0,186.0,869.0
139,RoofMatl_Membran,229.0,226.0,229.0,186.0,870.0
113,Condition2_PosA,229.0,229.0,229.0,186.0,873.0
197,Electrical_Mix,229.0,229.0,229.0,186.0,873.0


In [28]:
ranked = ranking.sort_values('total')
ranked.head(50)

Unnamed: 0,feat,f,mi,logreg,lightgbm,total
98,Neighborhood_StoneBr,30.0,18.0,120.0,27.0,195.0
66,LandContour_HLS,23.0,13.0,112.0,50.0,198.0
91,Neighborhood_NoRidge,9.0,73.0,101.0,41.0,224.0
222,SaleType_New,13.0,55.0,63.0,95.0,226.0
92,Neighborhood_NridgHt,10.0,90.0,67.0,63.0,230.0
81,Neighborhood_CollgCr,45.0,144.0,10.0,31.0,230.0
10,MasVnrArea,27.0,202.0,23.0,10.0,262.0
41,GarageArea,22.0,212.0,27.0,6.0,267.0
84,Neighborhood_Gilbert,122.0,50.0,59.0,39.0,270.0
6,OverallQual,8.0,204.0,37.0,22.0,271.0


In [29]:
# ranked.feat[:20]

#### Save Data Frames with the top ~~30(?)~~ 20 features by each method

In [30]:
# Set the number of features to keep
numFeat = 20

In [31]:
## F-Statistic
# Get a list of the top features
fStat = ranking.sort_values('f')
fStat = fStat.feat[:numFeat].tolist()
# Generate a DF of just those features
fStatDF = numDF[fStat]
# Add target variable to DF
fStatDF.insert(0, 'SalePrice', salePrice)
# Save DF to CSV file
fStatDF.to_csv('data/featFstatistic.csv', index=False)

In [32]:
## Mutual Information
# Get a list of the top features
miFeat = ranking.sort_values('mi')
miFeat = miFeat.feat[:numFeat].tolist()
# Generate a DF of just those features
miFeatDF = numDF[miFeat]
# Add target variable to DF
miFeatDF.insert(0, 'SalePrice', salePrice)
# Save DF to CSV file
miFeatDF.to_csv('data/featMutualInformation.csv', index=False)

In [33]:
## Logistic Regression
# Get a list of the top features
lrFeat = ranking.sort_values('logreg')
lrFeat = lrFeat.feat[:numFeat].tolist()
# Generate a DF of just those features
lrFeatDF = numDF[lrFeat]
# Add target variable to DF
lrFeatDF.insert(0, 'SalePrice', salePrice)
# Save DF to CSV file
lrFeatDF.to_csv('data/featLogisticRegression.csv', index=False)

In [34]:
## Logistic Regression
# Get a list of the top features
featLBGM = ranking.sort_values('lightgbm')
featLBGM = featLBGM.feat[:numFeat].tolist()
# Generate a DF of just those features
featLBGMDF = numDF[featLBGM]
# Add target variable to DF
featLBGMDF.insert(0, 'SalePrice', salePrice)
# Save DF to CSV file
featLBGMDF.to_csv('data/featLightBGM.csv', index=False)

In [35]:
## Combined Overall
# Get a list of the top features
overall = ranking.sort_values('total')
overall = overall.feat[:numFeat].tolist()
# Generate a DF of just those features
overallDF = numDF[overall]
# Add target variable to DF
overallDF.insert(0, 'SalePrice', salePrice)
# Save DF to CSV file
overallDF.to_csv('data/featOverall.csv', index=False)