In [0]:
import pandas as pd 
from numpy import array
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler 
import operator

In [0]:
data = pd.read_csv('train_data.csv')

**The cell below is to get the number of empty values in the columns and filtered to only display columns which have empty values**

In [0]:
null_counts = list(data.isna().sum())
null_dict = {}
cols = list(data.columns)
for key in cols:
  null_dict[key] = null_counts.pop(0)

null_dict_2 = dict(filter(lambda elem: elem[1] != 0, null_dict.items()))
null_dict_2

In [0]:
data['Alley'].fillna('None', inplace=True)
data['BsmtCond'].fillna('None', inplace=True)
data['BsmtExposure'].fillna('None', inplace=True)
data['BsmtFinType1'].fillna('None', inplace=True)
data['BsmtFinType2'].fillna('None', inplace=True)
data['BsmtQual'].fillna('None', inplace=True)

#data = data[data.Electrical.notnull()]  #Drop row with empty value for this attribute since it is only one row
data['Electrical'].fillna('None', inplace=True) #Empty value represents no electricity

data['Fence'].fillna('None', inplace=True)
data['FireplaceQu'].fillna('None', inplace=True)
data['GarageCond'].fillna('None', inplace=True)
data['GarageFinish'].fillna('None', inplace=True)
data['GarageQual'].fillna('None', inplace=True)
data['GarageType'].fillna('None', inplace=True)
data['GarageYrBlt'].fillna(0, inplace=True) #The empty values are for no garage. Chose Year 0 to represent it 

data['LotFrontage'].fillna(0, inplace=True) #For empty values, assume no Street connected to property

data['MasVnrArea'].fillna(0, inplace=True) 
data['MasVnrType'].fillna('None', inplace=True)  
data['MiscFeature'].fillna('None', inplace=True)
data['PoolQC'].fillna('None', inplace=True)

In [0]:
nominal = ['Neighborhood','Condition1' , 'Condition2', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd']
nominal.extend(['MasVnrType', 'Electrical', 'PavedDrive', 'Foundation', 'Heating', 'GarageType', 'MiscFeature'])
nominal.extend(['SaleType', 'SaleCondition'])

ordinal = ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope']
ordinal.extend(['BldgType', 'HouseStyle', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure'])
ordinal.extend(['BsmtFinType1', 'BsmtFinType2', 'HeatingQC', 'CentralAir', 'KitchenQual', 'Functional'])
ordinal.extend(['FireplaceQu', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence'])

numerical = ['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd']
numerical.extend(['MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF'])
numerical.extend(['LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath'])
numerical.extend(['BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea'])
numerical.extend(['WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea'])
numerical.extend(['MiscVal', 'MoSold', 'YrSold'])

target = ['SalePrice']

In [0]:
categorical_data = data.select_dtypes(include=['object']).copy()

In [0]:
column_names=list(categorical_data.columns)
for feature in column_names:
  categorical_data[feature]=categorical_data[feature].astype('category')
for feature in ordinal:
  categorical_data[feature]=categorical_data[feature].cat.codes
#categorical_data.head()

In [0]:
data_copy=categorical_data.copy()
nominal_encoded = pd.get_dummies(data_copy[nominal])
data_copy = pd.concat([nominal_encoded,data_copy],axis=1) 
data_copy.drop(nominal,axis=1,inplace=True)
new_data = pd.concat([data_copy,data[numerical],data[target]],axis=1)

In [0]:
all_features = list(new_data.columns)
X = new_data.loc[:,all_features].values
y = new_data.loc[:,["SalePrice"]].values

In [0]:
X = StandardScaler().fit_transform(X)

In [0]:
pca = PCA(n_components=2)
p_components = pca.fit_transform(X)
pca_df = pd.DataFrame(p_components, columns=['PC1', 'PC2'])
pca.explained_variance_ratio_

PC1 Contains 7.2131% of information of the dataset

---

PC2 Contains 3.1260% of information of the dataset

In [71]:
pca_var = PCA(.9)
p_components = pca_var.fit_transform(X)
#len(pca_var.explained_variance_ratio_)
pca.n_components_

117

To get 90% variance you would need 117 principal components

In [0]:
final_data = pd.concat([pca_df,data['target']], axis=1)