In [88]:
import numpy as np
import pandas as pd
import itertools
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.decomposition import KernelPCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
seed=862#for getting reproduceble results

In [89]:
# Load data set
data = pd.read_csv('train.csv')
data = data.drop('Id', axis = 1)

# Remove columns that have too many missing values
data = data.drop(data.columns[data.isnull().sum() > 30], axis = 1)

# Remove missing values
data.dropna(inplace = True)

In [90]:
data.head()

Unnamed: 0,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,20,RL,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,Veenker,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,60,RL,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,70,RL,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,60,RL,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,...,0,0,0,0,0,12,2008,WD,Normal,250000


In [91]:
#Baseline Regression on Original Dataset
X = data.drop(['SalePrice'], axis=1)
y = data['SalePrice'].values

In [92]:
# First split the training and testing set
X_train_valid, X_test, y_train_valid, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
X_train, X_valid, y_train, y_valid = train_test_split(X_train_valid, y_train_valid, test_size=0.25, random_state=seed)

print('Shapes')
print('X_train_valid : ',X_train_valid.shape)
print('X_test        : ',X_test.shape)
print('y_train_valid : ',y_train_valid.shape)
print('y_test        : ',y_test.shape)
print('X_train       : ',X_train.shape)
print('X_valid       : ',X_valid.shape)
print('y_train       : ',y_train.shape)
print('y_valid       : ',y_valid.shape)

Shapes
X_train_valid :  (1160, 63)
X_test        :  (291, 63)
y_train_valid :  (1160,)
y_test        :  (291,)
X_train       :  (870, 63)
X_valid       :  (290, 63)
y_train       :  (870,)
y_valid       :  (290,)


In [93]:
#Splitting numeric and categorical columns

X_train_numeric_col = X_train.select_dtypes(include=[np.number])#PCA
X_train_categorical_col = X_train.select_dtypes(exclude=[np.number])#MCA

X_valid_numeric_col = X_valid.select_dtypes(include=[np.number])#PCA
X_valid_categorical_col = X_valid.select_dtypes(exclude=[np.number])#MCA

X_test_numeric_col = X_test.select_dtypes(include=[np.number])#PCA
X_test_categorical_col = X_test.select_dtypes(exclude=[np.number])#MCA

print('X_train_numeric_col         : ',X_train_numeric_col.shape)
print('X_train_categorical_col     : ',X_train_categorical_col.shape)
print('X_valid_numeric_col         : ',X_valid_numeric_col.shape)
print('X_valid_categorical_col     : ',X_valid_categorical_col.shape)
print('X_test_numeric_col          : ',X_test_numeric_col.shape)
print('X_test_categorical_col      : ',X_test_categorical_col.shape)

X_train_numeric_col         :  (870, 34)
X_train_categorical_col     :  (870, 29)
X_valid_numeric_col         :  (290, 34)
X_valid_categorical_col     :  (290, 29)
X_test_numeric_col          :  (291, 34)
X_test_categorical_col      :  (291, 29)


In [94]:
'''
This is the script you can use to check the categorical features between training and testing sets.
You can apply this code after you can done the train/test split, as well as separation of numerical
and categorical features.
'''

# Categorical features in training set is called X_train_cat.
# Categorical features in testing set is called X_test_set.
# Make sure the training feature and testing feature has same number of levels
keep = X_train_categorical_col.nunique() == X_test_categorical_col.nunique()
X_train_categorical_col = X_train_categorical_col[X_train_categorical_col.columns[keep]]
X_test_categorical_col = X_test_categorical_col[X_test_categorical_col.columns[keep]]

# For categorical features that have same levels, make sure the classes are the same
keep = []
for i in range(X_train_categorical_col.shape[1]):
    keep.append(all(np.sort(X_train_categorical_col.iloc[:,i].unique()) == np.sort(X_test_categorical_col.iloc[:,i].unique())))
X_train_categorical_col = X_train_categorical_col[X_train_categorical_col.columns[keep]]
X_test_categorical_col = X_test_categorical_col[X_test_categorical_col.columns[keep]]

print('X_train_categorical_col shape : ',X_train_categorical_col.shape)
print('X_test_categorical_col  shape : ', X_test_categorical_col.shape)

X_train_categorical_col shape :  (870, 14)
X_test_categorical_col  shape :  (291, 14)


In [103]:
# Scaling data
# Numeric Columns for PCA
# Categorical Columns for MCA
scaler = StandardScaler()

scaler.fit(X_train_numeric_col)

X_train_numeric_col_scaled = pd.DataFrame(scaler.transform(X_train_numeric_col))
X_train_numeric_col_scaled.columns = X_train_numeric_col.columns.values

X_valid_numeric_col_scaled = pd.DataFrame(scaler.transform(X_valid_numeric_col))
X_valid_numeric_col_scaled.columns = X_valid_numeric_col.columns.values

In [104]:
pipeline = Pipeline([
    ('pca', KernelPCA(random_state = 862)),
    ('clf', RandomForestClassifier()) # The RandomForestClassifier() is just a placeholder
])
parameters = [
      {
        'pca__kernel': ['linear', 'rbf', 'poly'],
        'pca__n_components': range(2, 30),
        'clf': [RandomForestClassifier()],
        'clf__n_estimators': [50,100,150]
    }, {
        'pca__kernel': ['linear', 'rbf', 'poly'],
        'pca__n_components': range(2, 30),
        'clf': [KNeighborsClassifier()],
        'clf__n_neighbors': range(5,16)
    }
]
GS_kPCA = GridSearchCV(pipeline, parameters, n_jobs = -1, cv = 5)

In [105]:
GS_kPCA.fit(X_train_numeric_col_scaled, y_train)



GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('pca', KernelPCA(random_state=862)),
                                       ('clf', RandomForestClassifier())]),
             n_jobs=-1,
             param_grid=[{'clf': [RandomForestClassifier()],
                          'clf__n_estimators': [50, 100, 150],
                          'pca__kernel': ['linear', 'rbf', 'poly'],
                          'pca__n_components': range(2, 30)},
                         {'clf': [KNeighborsClassifier()],
                          'clf__n_neighbors': range(5, 16),
                          'pca__kernel': ['linear', 'rbf', 'poly'],
                          'pca__n_components': range(2, 30)}])

In [107]:
GS_kPCA.best_params_

{'clf': RandomForestClassifier(),
 'clf__n_estimators': 100,
 'pca__kernel': 'poly',
 'pca__n_components': 18}

In [111]:
kpca= KernelPCA(kernel='poly', n_components=18, random_state=862)
X_kpca_reduced = kpca.fit_transform(X_train_numeric_col_scaled)#reduced features

In [112]:
X_kpca_reduced

array([[-0.14873294, -0.23804113, -0.09342185, ...,  0.03743906,
         0.05029464, -0.43764423],
       [-0.15270217, -0.05056568, -0.07588408, ..., -1.36622267,
         0.56139965,  0.27933719],
       [-0.14787777, -0.2501955 , -0.11206482, ...,  0.1549703 ,
        -0.55347854,  0.41312724],
       ...,
       [-0.14143635, -0.24188367, -0.08820206, ..., -0.03370817,
        -0.01230496, -0.01840397],
       [-0.14740068, -0.1530348 , -0.05027048, ...,  0.33960732,
        -0.21846359,  0.02482281],
       [-0.14026309, -0.24685554, -0.09107302, ..., -0.34373894,
         0.15122542, -0.31685357]])

# Ridge Regression 

In [50]:
# Creating dummy variables
numeric_col = data.select_dtypes(include=[np.number])#PCA
categorical_col = data.select_dtypes(exclude=[np.number])#MCA
cat_dummy= pd.get_dummies(data = categorical_col, columns =categorical_col.columns , drop_first = True)
cat_dummy.head()

Unnamed: 0,MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Street_Pave,LotShape_IR2,LotShape_IR3,LotShape_Reg,LandContour_HLS,LandContour_Low,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,0,0,1,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
1,0,0,1,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
2,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
3,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0


In [51]:
data=numeric_col.merge(cat_dummy,left_index=True,right_index=True)

In [52]:
#Baseline Regression on Original Dataset
X = data.drop(['SalePrice'], axis=1)
y = data['SalePrice'].values

In [53]:
# First split the training and testing set
X_train_valid, X_test, y_train_valid, y_test = train_test_split(X, y, test_size = 0.2, random_state = seed)

In [54]:
# Now let's scale our data
scaler = StandardScaler() # Instantiate
scaler.fit(X_train_valid) # First fit the data, i.e. learn the mean and sd
X_train_valid = pd.DataFrame(scaler.transform(X_train_valid)) # Then transform the data. We can also use fit_transform
X_test = pd.DataFrame(scaler.transform(X_test)) # Transform the testing set
X_train_valid.columns = X.columns.values
X_test.columns = X.columns.values
print(X_train_valid.mean())
print(X_train_valid.std())
# Remember, the steps are fit, then transform

MSSubClass               5.390803e-17
LotArea                  6.345499e-17
OverallQual              8.728650e-17
OverallCond             -6.010518e-17
YearBuilt               -1.351218e-15
                             ...     
SaleCondition_AdjLand   -1.290874e-16
SaleCondition_Alloca     1.358827e-16
SaleCondition_Family     1.040834e-16
SaleCondition_Normal     2.388894e-16
SaleCondition_Partial    5.056300e-16
Length: 196, dtype: float64
MSSubClass               1.000431
LotArea                  1.000431
OverallQual              1.000431
OverallCond              1.000431
YearBuilt                1.000431
                           ...   
SaleCondition_AdjLand    1.000431
SaleCondition_Alloca     1.000431
SaleCondition_Family     1.000431
SaleCondition_Normal     1.000431
SaleCondition_Partial    1.000431
Length: 196, dtype: float64


In [55]:
# Now we can split the training and validation set
X_train, X_valid, y_train, y_valid = train_test_split(X_train_valid, y_train_valid, test_size = 0.25, random_state = seed)

In [63]:
# Setting up lambda values 

alphas = np.logspace(-10,5,10)

Validation_Scores = []
for a in alphas:
    lm = Ridge(alpha=a)
    lm.fit(X_train, y_train) # Fit model on training set
    Validation_Scores.append(mean_squared_error(lm.predict(X_valid), y_valid)) # Evaluate model on validation set

In [64]:
# Find minimum validation error and the corresponding lambda value
min(Validation_Scores)
alphas[np.argmin(Validation_Scores)]

1.0

In [65]:
# Scale data (use the train_valid set to fit, then transform both the train_valid set and test set)
scaler.fit(X_train_valid) 
X_train_valid = pd.DataFrame(scaler.transform(X_train_valid)) 
X_test = pd.DataFrame(scaler.transform(X_test)) 

In [66]:
lm = Ridge(alpha=alphas[np.argmin(Validation_Scores)])
lm.fit(X_train_valid, y_train_valid)
print(pd.DataFrame(zip(lm.coef_,X.columns)))
print("The prediction error on the testing set is", np.sqrt(mean_squared_error(lm.predict(X_test), y_test)))

                0                      1
0      -95.310533             MSSubClass
1     7006.088456                LotArea
2    10660.383685            OverallQual
3     6217.025980            OverallCond
4     8069.401859              YearBuilt
..            ...                    ...
191    644.758312  SaleCondition_AdjLand
192   1019.753257   SaleCondition_Alloca
193   -157.956312   SaleCondition_Family
194   2151.355901   SaleCondition_Normal
195   6465.452339  SaleCondition_Partial

[196 rows x 2 columns]
The prediction error on the testing set is 44831.90977866733


In [78]:
from sklearn.linear_model import LinearRegression
linear = LinearRegression()
linear.fit(X_train,y_train)

LinearRegression()

In [79]:
y_pred = linear.predict(X_test)

In [80]:
from sklearn import metrics

In [81]:
metrics.r2_score(y_test, y_pred)

0.8386263937150922