# Predicting Housing Prices
**Authors:** Scott Breitbach, Pushkar Chougule, & Rachel Nelson  
**Class:** DSC630-T301 Predictive Analytics (2221-1)

### Load and Review Raw Data

In [1]:
# Load Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load data sets
dfTrain = pd.read_csv("train.csv")
dfTest = pd.read_csv("test.csv")

# Get # of rows and columns of data set
print("Shape of training data set:", dfTrain.shape)
print("Shape of testing data set: ", dfTest.shape)

Shape of training data set: (1460, 81)
Shape of testing data set:  (1459, 80)


In [3]:
# Combine data sets for Data Analysis
dfBoth = pd.concat([dfTrain, dfTest], keys=['train', 'test'], names=['dataSet', 'index'])
print("Shape of combined data set:", dfBoth.shape)

Shape of combined data set: (2919, 81)


In [4]:
def fixNullValues(df):
    # Replace NA with 'None' or '0' where applicable based on variable
    nullIsNone = ["Alley", "BsmtQual", "BsmtCond", "BsmtExposure",  "PoolQC",
                  "BsmtFinType1", "BsmtFinType2", "FireplaceQu", "GarageType",
                  "GarageFinish", "GarageQual", "GarageCond", "Fence", 
                  "MasVnrType", "MiscFeature"]
    for i in nullIsNone:
        df[i].fillna('None', inplace=True)
    
    nullIsZero = ["GarageArea", "GarageCars", "GarageYrBlt", "TotalBsmtSF", 
                  "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "BsmtHalfBath", 
                  "BsmtFullBath", "MasVnrArea", "SalePrice"]
    for i in nullIsZero:
        df[i].fillna(0, inplace=True)
        
    # Impute any remaining numeric Null values with the mean
    df.fillna(df.mean(numeric_only=True), inplace=True)
    
    # Impute any remaining Null values with the mode
    for column in df.columns:
        df[column].fillna(df[column].mode()[0], inplace=True)

In [5]:
# Clean data frame of null values
fixNullValues(dfBoth)

In [6]:
# Get a sorted list of the numbers of Null values
missingVals = dfBoth.isnull().sum()
missingVals = missingVals[missingVals > 0]
missingVals.sort_values()

Series([], dtype: int64)

In [7]:
## removing test set from data since it does not contain the saleprice and we need to look at categorical values
dfTrainC = dfBoth.loc['train']
# dfTrainC.to_csv('dfTrainC.csv',index=False)

In [8]:
# Create a list of the categorical variables
catColList = dfBoth.select_dtypes(include=['object']).columns.tolist()

### Create DF with Dummy Variables

In [9]:
def getDummies(df):
    '''Creates dummy variables from categorical variables, adds them
    to the dataframe and returns the new dataframe with dummy vars'''
    dummyDF = df.copy()
    catList = dummyDF.select_dtypes(include=['object']).columns.tolist()
    for i in catList:
        dummies = pd.get_dummies(dummyDF[i], prefix=i)
        dummyDF = pd.concat([dummyDF, dummies], axis=1)
    return dummyDF

In [10]:
# Add dummy variable columns to dataframes
dfBoth_dummies = getDummies(dfBoth)
dfTrainC_dummies = getDummies(dfTrainC)

### Data Analysis

In [11]:
# Separate data so as to only analyze training data, which contains SalePrice
dfAnalysis = dfBoth_dummies.loc['train'].copy()
dfAnalysis['SalePrice'] = pd.to_numeric(dfAnalysis['SalePrice'])
# Save file
# dfAnalysis.to_csv('dfAnalysis.csv',index=False)

In [12]:
# Move 'SalePrice' column to front
salePrice = dfAnalysis['SalePrice']
dfAnalysis.drop(labels=['SalePrice'], axis=1, inplace=True)
dfAnalysis.insert(0, 'SalePrice', salePrice)

#### Get a list of best variables using Feature Selection

In [13]:
# Load libraries
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

Split data for feature selection:

In [14]:
dfAnalysis.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 0 to 1459
Columns: 347 entries, SalePrice to SaleCondition_Partial
dtypes: float64(12), int64(26), object(43), uint8(266)
memory usage: 1.3+ MB


In [15]:
numDF = dfAnalysis.select_dtypes(exclude=['object'])

In [16]:
numDF.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 0 to 1459
Columns: 304 entries, SalePrice to SaleCondition_Partial
dtypes: float64(12), int64(26), uint8(266)
memory usage: 824.1 KB


In [17]:
# Set up features target sets
X = numDF[numDF.columns[1:]]
y = numDF.SalePrice

# Encode the target variables
le = LabelEncoder()
y = le.fit_transform(y)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size =0.3, 
                                                    random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train) 
X_test  = scaler.transform(X_test)

#### Feature Selection

In [18]:
%%time
# Feature selection using F-statistic
from sklearn.feature_selection import f_classif
f = f_classif(X_train, y_train)[0]

Wall time: 187 ms


  f = msb / msw
  f = msb / msw


In [19]:
%%time
# Feature selection using mutual information
from sklearn.feature_selection import mutual_info_classif
mi = mutual_info_classif(X_train, y_train)

Wall time: 25.8 s


In [20]:
%%time
# Feature selection using logistic regression
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(max_iter=500).fit(X_train, y_train)

Wall time: 17.4 s


In [21]:
%%time
# Feature selection using light gbm
from lightgbm import LGBMClassifier
lgbm = LGBMClassifier(
    objective = 'multiclass',
    metric = 'multi_logloss',
    importance_type = 'gain'
).fit(X_train, y_train)

Wall time: 26.1 s


Store Results

In [22]:
%%time
# Create DF to store feature ranking info
ranking = pd.DataFrame(index = range(X_train.shape[1]))

# Store ranking info for each feature from each method
ranking['feat'] = X.columns
ranking['f'] = pd.Series(f, index = ranking.index).fillna(0).rank(ascending = False)
ranking['mi'] = pd.Series(mi, index = ranking.index).fillna(0).rank(ascending = False)
ranking['logreg'] = pd.Series(np.abs(logreg.coef_).mean(axis = 0), index = ranking.index).rank(ascending = False)
ranking['lightgbm'] = pd.Series(lgbm.feature_importances_, index = ranking.index).rank(ascending = False)
# ranking['mrmr'] = pd.Series(list(range(1, len(mrmr) + 1)) + [X_train.shape[1]] * (X_train.shape[1] - len(mrmr)), index = mrmr + list(set(ranking.index) - set(mrmr))).sort_index()

# ranking = ranking.replace(to_replace = ranking.max(), value = X_train.shape[1]) # Not sure why this was in here
ranking.to_csv('ranking.csv', index = False)

Wall time: 18.3 ms


In [23]:
ranking.sort_values('logreg')

Unnamed: 0,feat,f,mi,logreg,lightgbm
0,Id,184.0,301.0,1.0,1.0
36,YrSold,130.0,284.0,2.0,25.0
35,MoSold,164.0,296.0,3.0,15.0
257,GarageFinish_RFn,79.0,251.0,4.0,152.0
47,LotShape_IR1,101.0,268.0,5.0,98.0
...,...,...,...,...,...
141,Exterior1st_ImStucc,301.0,298.0,301.0,242.0
102,Condition2_PosA,301.0,301.0,301.0,242.0
106,Condition2_RRNn,301.0,297.0,301.0,242.0
229,Electrical_Mix,301.0,301.0,301.0,242.0


Compare

In [24]:
# Sum the rankings across methods
ranking['total'] = ranking.sum(axis=1)
# Sort by sum to get an overall idea of the most/least useful features
ranking.sort_values('total')

  ranking['total'] = ranking.sum(axis=1)


Unnamed: 0,feat,f,mi,logreg,lightgbm,total
80,Neighborhood_NoRidge,12.0,31.0,113.0,31.0,187.0
294,SaleType_New,18.0,7.0,66.0,97.0,188.0
87,Neighborhood_StoneBr,34.0,8.0,152.0,27.0,221.0
169,ExterQual_Ex,15.0,1.0,133.0,82.0,231.0
38,MSZoning_FV,35.0,41.0,104.0,69.0,249.0
...,...,...,...,...,...,...
106,Condition2_RRNn,301.0,297.0,301.0,242.0,1141.0
141,Exterior1st_ImStucc,301.0,298.0,301.0,242.0,1142.0
229,Electrical_Mix,301.0,301.0,301.0,242.0,1145.0
102,Condition2_PosA,301.0,301.0,301.0,242.0,1145.0


In [25]:
ranked = ranking.sort_values('total')
ranked.head(50)

Unnamed: 0,feat,f,mi,logreg,lightgbm,total
80,Neighborhood_NoRidge,12.0,31.0,113.0,31.0,187.0
294,SaleType_New,18.0,7.0,66.0,97.0,188.0
87,Neighborhood_StoneBr,34.0,8.0,152.0,27.0,221.0
169,ExterQual_Ex,15.0,1.0,133.0,82.0,231.0
38,MSZoning_FV,35.0,41.0,104.0,69.0,249.0
70,Neighborhood_CollgCr,58.0,165.0,12.0,28.0,263.0
184,BsmtQual_Ex,19.0,25.0,64.0,164.0,272.0
231,KitchenQual_Ex,11.0,72.0,87.0,110.0,280.0
251,GarageType_BuiltIn,51.0,83.0,56.0,99.0,289.0
52,LandContour_HLS,28.0,100.0,137.0,40.0,305.0


In [26]:
# ranked.feat[:20]

#### Save Data Frames with the top 30(?) features by each method

In [27]:
# Set the number of features to keep
numFeat = 30

In [28]:
## F-Statistic
# Get a list of the top features
fStat = ranking.sort_values('f')
fStat = fStat.feat[:numFeat].tolist()
# Generate a DF of just those features
fStatDF = numDF[fStat]
# Add target variable to DF
fStatDF.insert(0, 'SalePrice', salePrice)
# Save DF to CSV file
fStatDF.to_csv('featFstatistic.csv', index=False)

In [29]:
## Mutual Information
# Get a list of the top features
miFeat = ranking.sort_values('mi')
miFeat = miFeat.feat[:numFeat].tolist()
# Generate a DF of just those features
miFeatDF = numDF[miFeat]
# Add target variable to DF
miFeatDF.insert(0, 'SalePrice', salePrice)
# Save DF to CSV file
miFeatDF.to_csv('featMutualInformation.csv', index=False)

In [30]:
## Logistic Regression
# Get a list of the top features
lrFeat = ranking.sort_values('logreg')
lrFeat = lrFeat.feat[:numFeat].tolist()
# Generate a DF of just those features
lrFeatDF = numDF[lrFeat]
# Add target variable to DF
lrFeatDF.insert(0, 'SalePrice', salePrice)
# Save DF to CSV file
lrFeatDF.to_csv('featLogisticRegression.csv', index=False)

In [31]:
## Logistic Regression
# Get a list of the top features
featLBGM = ranking.sort_values('lightgbm')
featLBGM = featLBGM.feat[:numFeat].tolist()
# Generate a DF of just those features
featLBGMDF = numDF[featLBGM]
# Add target variable to DF
featLBGMDF.insert(0, 'SalePrice', salePrice)
# Save DF to CSV file
featLBGMDF.to_csv('featLightBGM.csv', index=False)

In [33]:
## Combined Overall
# Get a list of the top features
overall = ranking.sort_values('total')
overall = overall.feat[:numFeat].tolist()
# Generate a DF of just those features
overallDF = numDF[overall]
# Add target variable to DF
overallDF.insert(0, 'SalePrice', salePrice)
# Save DF to CSV file
overallDF.to_csv('featOverall.csv', index=False)