In [2]:
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, VarianceThreshold
from sklearn.impute import SimpleImputer 
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, accuracy_score, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.svm import SVC

# reading
data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/housing_prices.csv") 
data = data.set_index('Id')
data.isna().sum()
data

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,8,2007,WD,Normal,175000
1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,Inside,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,Inside,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,4,2010,WD,Normal,142125


In [3]:
#data.columns
data['MSZoning'].apply(str)


Id
1       RL
2       RL
3       RL
4       RL
5       RL
        ..
1456    RL
1457    RL
1458    RL
1459    RL
1460    RL
Name: MSZoning, Length: 1460, dtype: object

In [4]:
data.shape

(1460, 80)

In [5]:
from sklearn.model_selection import train_test_split
X = data.drop(columns=['SalePrice'])
y = data['SalePrice']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123000)
X_train.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
74,20,RL,85.0,10200,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,GdWo,,0,5,2010,WD,Normal
461,60,FV,75.0,8004,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,12,2009,New,Partial
483,70,RM,50.0,2500,Pave,Pave,Reg,Lvl,AllPub,Corner,...,0,0,,,,0,6,2009,WD,Normal
114,20,RL,,21000,Pave,,Reg,Bnk,AllPub,Corner,...,0,0,,MnPrv,,0,10,2007,COD,Abnorml
110,20,RL,105.0,11751,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,MnPrv,,0,1,2010,COD,Normal


Pipelines can contain many different steps inside. I would divide them into 2 groups: Preprocessing pipelines and Modelling pipelines. A Modelling pipeline has a model as their last step, whereas a preprocessing pipeline doesn't.

- Preprocessing pipelines: Those pipelines only transform the predictor features (the X) by filling NAs, encoding categorical features, scaling, etc. You always have to fit them with X_train. Then, you can call the .transform() method to transform both the X_train and the X_test. (Sometimes, you fit and transform X_train in a single step, by using the .fit_transform() method, but you're still performing these 2 separate steps). Any time that you call transform() you get as an output the transformed data, X_train or X_test.

In [7]:
# select categorical and numerical column names
X_cat_columns = X.select_dtypes(exclude="number").copy().columns
X_num_columns = X.select_dtypes(include="number").copy().columns

# create numerical pipeline, only with the SimpleImputer(strategy="median") and data scaling
scaler = MinMaxScaler()
numeric_pipe = make_pipeline(scaler,
                             SimpleImputer(strategy="median"))
                             
 
 # create categorical pipeline, with the SimpleImputer(fill_value="N_A") and the OneHotEncoder
OHEncoder = OneHotEncoder(handle_unknown = 'ignore', sparse=False)

categoric_pipe = make_pipeline(
    SimpleImputer(strategy="most_frequent"), #    "constant", fill_value="N_A"
    OHEncoder
)

In [8]:
# display pipeline
from sklearn import set_config
set_config(display = 'diagram')

In [9]:
from sklearn.compose import ColumnTransformer  #make_column_Transformer then dont need to mention names

preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipe", numeric_pipe, X_num_columns),
        ("cat_pipe", categoric_pipe, X_cat_columns)
      
    ]
)
preprocessor

In [10]:
full_pipeline = make_pipeline(preprocessor)
#full_pipeline.named_steps

In [11]:
full_pipeline.fit(X_train, y_train)


1. Create your preprocessing pipeline, let's say it's full_pipeline.
2. Fit it to the train set: full_pipeline.fit(X_train)
3. Transform the train set: X_train_preprocessed = full_pipeline.transform(X_train)
4. Transform the test set: X_test_preprocessed = full_pipeline.transform(X_test)
Note that so far we have not needed neither the y_train nor the y_test. This is because we are not modelling yet. We are just preprocessing data (cleaning, transforming, wrangling, preparing, imputing, encoding... only the predictors! not the target!)
5. Fit a model to the preprocessed train set:

In [None]:
X_train_cleaned = full_pipeline.transform(X_train)
X_train_cleaned.shape

In [47]:
X_test_cleaned = full_pipeline.transform(X_test)


In [14]:
X_train_cleaned_df = pd.DataFrame(X_train_cleaned) #X_train_cleaned_df.columns.to_list()
X_train_cleaned_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,273,274,275,276,277,278,279,280,281,282
0,0.000000,0.219178,0.040806,0.444444,0.750,0.594203,0.883333,0.065000,0.056697,0.323214,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.235294,0.184932,0.030533,0.777778,0.500,0.992754,0.983333,0.068750,0.096386,0.000000,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.294118,0.099315,0.004786,0.666667,0.875,0.311594,0.916667,0.000000,0.052977,0.000000,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.000000,0.164384,0.091328,0.555556,0.500,0.586957,0.050000,0.115000,0.006201,0.775893,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.287671,0.048061,0.555556,0.625,0.760870,0.450000,0.300000,0.124911,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1163,0.411765,0.219178,0.046770,0.444444,0.500,0.623188,0.133333,0.059375,0.000000,0.000000,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1164,0.235294,0.332192,0.160375,1.000000,0.500,0.891304,0.766667,0.861250,0.245748,0.000000,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1165,0.058824,0.133562,0.026772,0.444444,0.750,0.492754,0.700000,0.183750,0.090361,0.000000,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
1166,0.000000,0.164384,0.057085,0.222222,0.500,0.601449,0.083333,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [15]:
X_test_cleaned_df = pd.DataFrame(X_test_cleaned) 
X_test_cleaned_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,273,274,275,276,277,278,279,280,281,282
0,0.176471,0.133562,0.026772,0.444444,0.375,0.565217,0.533333,0.000000,0.031892,0.314286,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.411765,0.208904,0.026473,0.555556,0.625,0.536232,0.000000,0.264375,0.085755,0.000000,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.000000,0.263699,0.068093,0.888889,0.500,0.956522,0.916667,0.236250,0.223423,0.000000,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.823529,0.010274,0.004888,0.666667,0.500,0.956522,0.900000,0.031250,0.000000,0.000000,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.000000,0.219178,0.041985,0.555556,0.500,0.586957,0.050000,0.135000,0.088590,0.000000,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
287,0.294118,0.099315,0.035192,0.666667,1.000,0.347826,0.633333,0.000000,0.110560,0.000000,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
288,0.235294,0.150685,0.031099,0.666667,0.500,0.971014,0.950000,0.062500,0.143870,0.000000,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
289,0.235294,0.222603,0.061380,0.555556,0.500,0.978261,0.950000,0.046250,0.000000,0.000000,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
290,0.176471,0.133562,0.021158,0.555556,0.625,0.500000,0.000000,0.000000,0.066442,0.000000,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [16]:
#colnames = my_onehot.get_feature_names_out(X_cat_imputed.columns)
#df.columns = colnames
#df.head()

In [17]:
#full_pipeline['columntransformer'].transformers_[1][1]['onehotencoder'].get_feature_names_out(X_cat_columns)

5. Fit a model to the preprocessed train set:
my_tree = DesicionTreeRegressor()
my_tree.fit(X_train_preprocessed)

Here I'm not using Grid Search Cross Validation for simplicity.
6. Use the model to make predictions on either the train or the test sets. The model is only capable of making good predictions if you give it data that looks exactly like the data it was fitted on. This is why we preprocessed the test set on step 4. Now you can do: test_pred = my_tree.predict(X_test_preprocessed) and you'll have predictions as your output.
7. Compute performance metrics.

### Quick baseline models (baseline for performance)

In [18]:

# Decision tree
from sklearn.tree import DecisionTreeRegressor
tree = DecisionTreeRegressor() # initialize
tree.fit(X_train_cleaned, y_train)

# K Nearest neighbors
from sklearn.neighbors import KNeighborsRegressor
neigh = KNeighborsRegressor(n_neighbors=1) # initialize with parameters
neigh.fit(X_train_cleaned, y_train) # fit to train set

#### Performance on the test set

Let's check the R Squared of the model

In [19]:
from sklearn.metrics import r2_score

tree_pred = tree.predict(X = X_test_cleaned) # Decision tree test data
knn_pred = neigh.predict(X = X_test_cleaned) # K Nearest neighbors


performances = {}

performances["baseline_tree"]= r2_score(y_test, tree_pred)
performances["baseline_knn"] = r2_score(y_test, knn_pred)

performances


{'baseline_knn': 0.5896397369019726, 'baseline_tree': 0.81215958795593}

- Modelling pipelines:  When the last step of the pipeline is a model (e.g. a decision tree), you will do the whole process in fewer steps. Specifically, you will never be creating the X_train_preprocessed and the X_test_preprocessed from our previous examples. When you fit such a pipeline, you are fitting your transformers (scaler, imputer...), preprocessing your data (transforming it), passing the transformed data directly to the model and fitting the model all at once. Here the steps would be:
1. Create your modelling pipeline, let's call it modelling_pipe 
2. Fit it to the train set: modelling_pipe.fit(X_train)
3. Use the pipeline directly to make predictions. This pipeline can take "raw" data as input and give you predictions as the output, since it contains all preprocessing and modelling steps inside: test_pred = modelling_pipe.predict(X_test)

You can already see how a Decision Tree handles a noisy dataset much better than a knn. In a Decision Tree, only the "best" features take part of the algorithm, whereas in a KNN all features matter the same. It does not mean that the Decision Tree will always be the better algorithm: after preprocessing, it's possible that the KNN works better.

###. Variance Threshold

This is the first feature selection method we will use. It's extremely simple.

Features with a low variance have a low amount of information. With this transformer we can drop the features with the lowest variance. We will set a threshold and all features with a variance below that number will be dropped. 

Notice first that the variance of a column also depends on its scale. In our dataset, most of the columns have a really small range, except for some features:

In [20]:
X.describe().loc[["min", "max"]].T

Unnamed: 0,min,max
MSSubClass,20.0,190.0
LotFrontage,21.0,313.0
LotArea,1300.0,215245.0
OverallQual,1.0,10.0
OverallCond,1.0,9.0
YearBuilt,1872.0,2010.0
YearRemodAdd,1950.0,2010.0
MasVnrArea,0.0,1600.0
BsmtFinSF1,0.0,5644.0
BsmtFinSF2,0.0,1474.0


In [21]:
X.var().sort_values()

  """Entry point for launching an IPython kernel.


KitchenAbvGr     4.854892e-02
BsmtHalfBath     5.700283e-02
HalfBath         2.528937e-01
BsmtFullBath     2.692682e-01
FullBath         3.035082e-01
Fireplaces       4.155947e-01
GarageCars       5.584797e-01
BedroomAbvGr     6.654938e-01
OverallCond      1.238322e+00
YrSold           1.763837e+00
OverallQual      1.912679e+00
TotRmsAbvGrd     2.641903e+00
MoSold           7.309595e+00
YearRemodAdd     4.262328e+02
LotFrontage      5.897492e+02
GarageYrBlt      6.095825e+02
3SsnPorch        8.595059e+02
YearBuilt        9.122154e+02
PoolArea         1.614216e+03
MSSubClass       1.789338e+03
LowQualFinSF     2.364204e+03
ScreenPorch      3.108889e+03
EnclosedPorch    3.735550e+03
OpenPorchSF      4.389861e+03
WoodDeckSF       1.570981e+04
BsmtFinSF2       2.602391e+04
MasVnrArea       3.278497e+04
GarageArea       4.571251e+04
1stFlrSF         1.494501e+05
2ndFlrSF         1.905571e+05
TotalBsmtSF      1.924624e+05
BsmtUnfSF        1.952464e+05
BsmtFinSF1       2.080255e+05
MiscVal   

#### Data Scaling
Therefore, it's a good idea to scale the features before removing those with a smaller variance. Some scaling processes (e.g. standardization) transform features in such a way that they all end up having the same variance. It's important to pick a scaler that does not do that. We will use min-max scaling:

In [22]:
from sklearn.preprocessing import MinMaxScaler

my_scaler = MinMaxScaler()

X_train_scaled = my_scaler.fit_transform(X_train_cleaned_df)

X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train_cleaned_df.columns)
X_train_scaled


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,273,274,275,276,277,278,279,280,281,282
0,0.000000,0.219178,0.040806,0.444444,0.750,0.594203,0.883333,0.065000,0.056697,0.323214,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.235294,0.184932,0.030533,0.777778,0.500,0.992754,0.983333,0.068750,0.096386,0.000000,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.294118,0.099315,0.004786,0.666667,0.875,0.311594,0.916667,0.000000,0.052977,0.000000,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.000000,0.164384,0.091328,0.555556,0.500,0.586957,0.050000,0.115000,0.006201,0.775893,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.287671,0.048061,0.555556,0.625,0.760870,0.450000,0.300000,0.124911,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1163,0.411765,0.219178,0.046770,0.444444,0.500,0.623188,0.133333,0.059375,0.000000,0.000000,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1164,0.235294,0.332192,0.160375,1.000000,0.500,0.891304,0.766667,0.861250,0.245748,0.000000,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1165,0.058824,0.133562,0.026772,0.444444,0.750,0.492754,0.700000,0.183750,0.090361,0.000000,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
1166,0.000000,0.164384,0.057085,0.222222,0.500,0.601449,0.083333,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [23]:
X_train_scaled.var().sort_values().to_list()

[0.0008561643835616435,
 0.0008561643835616435,
 0.0008561643835616435,
 0.0008561643835616435,
 0.0008561643835616435,
 0.0008561643835616435,
 0.0008561643835616436,
 0.0008561643835616436,
 0.0008561643835616436,
 0.0008561643835616436,
 0.0008561643835616436,
 0.0008561643835616436,
 0.0008561643835616436,
 0.0008561643835616437,
 0.0008561643835616437,
 0.0008561643835616437,
 0.0008561643835616438,
 0.0008561643835616438,
 0.0008561643835616438,
 0.0008561643835616438,
 0.0008561643835616438,
 0.0008561643835616439,
 0.0010042262355785436,
 0.0017108614759775095,
 0.0017108614759775095,
 0.0017108614759775095,
 0.0017108614759775095,
 0.0017108614759775095,
 0.0017108614759775097,
 0.0017108614759775097,
 0.0017108614759775097,
 0.0017108614759775097,
 0.0017108614759775097,
 0.0017108614759775097,
 0.0017108614759775097,
 0.0017108614759775097,
 0.0017108614759775097,
 0.0017108614759775097,
 0.0017108614759775097,
 0.0017108614759775097,
 0.0022006880129144283,
 0.0025640912772

In [24]:
from sklearn.feature_selection import VarianceThreshold

selector = VarianceThreshold(threshold=0.0009)
train_X_prep = selector.fit_transform(X_train_scaled)

In [25]:
print("shape before:", X_train_scaled.shape)
print("shape after:", train_X_prep.shape)

shape before: (1168, 283)
shape after: (1168, 261)


In [26]:
colnames = X_train_cleaned_df.columns[selector.get_support(indices=True)]
train_X_prep = pd.DataFrame(train_X_prep, columns=colnames)
train_X_prep.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,273,274,275,276,277,278,279,280,281,282
0,0.0,0.219178,0.040806,0.444444,0.75,0.594203,0.883333,0.065,0.056697,0.323214,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.235294,0.184932,0.030533,0.777778,0.5,0.992754,0.983333,0.06875,0.096386,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.294118,0.099315,0.004786,0.666667,0.875,0.311594,0.916667,0.0,0.052977,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [27]:
test_set_X_scaled = my_scaler.transform(X_test_cleaned_df) #X_test_cleaned
test_set_X_prep = selector.transform(test_set_X_scaled)

In [28]:
# Decision tree
tree = DecisionTreeRegressor() # initialize
tree.fit(train_X_prep, y_train)
tree_pred = tree.predict(X = test_set_X_prep)

# K Nearest neighbors
neigh = KNeighborsRegressor(n_neighbors=1) # initialize with parameters
neigh.fit(train_X_prep, y_train) # fit to train set
knn_pred = neigh.predict(X = test_set_X_prep)


performances["varThreshold_tree"]= r2_score(y_test, tree_pred)
performances["varThreshold_knn"] = r2_score(y_test, knn_pred)

performances

{'baseline_knn': 0.5896397369019726,
 'baseline_tree': 0.81215958795593,
 'varThreshold_knn': 0.5896397369019726,
 'varThreshold_tree': 0.800288739025085}

In [29]:
selector2 = VarianceThreshold(threshold=0)
train_X_prep = selector2.fit_transform(X_train_scaled)

print("shape before:", X_train_scaled.shape)
print("shape after:", train_X_prep.shape)

shape before: (1168, 283)
shape after: (1168, 283)


In [30]:
test_set_X_prep2 = selector2.transform(test_set_X_scaled)

In [31]:
# Decision tree
tree = DecisionTreeRegressor() # initialize
tree.fit(train_X_prep, y_train)
tree_pred = tree.predict(X = test_set_X_prep2)

# K Nearest neighbors
neigh = KNeighborsRegressor(n_neighbors=1) # initialize with parameters
neigh.fit(train_X_prep, y_train) # fit to train set
knn_pred = neigh.predict(X = test_set_X_prep2)


performances["varThreshold_2_tree"]= r2_score(y_test, tree_pred)
performances["varThreshold_2_knn"] = r2_score(y_test, knn_pred)

performances

{'baseline_knn': 0.5896397369019726,
 'baseline_tree': 0.81215958795593,
 'varThreshold_2_knn': 0.5896397369019726,
 'varThreshold_2_tree': 0.7957779994923654,
 'varThreshold_knn': 0.5896397369019726,
 'varThreshold_tree': 0.800288739025085}

In [32]:
import seaborn as sn
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(30,18))
corrMatrix = X_train_cleaned_df.corr().abs()
sn.heatmap(corrMatrix, annot=True);

Output hidden; open in https://colab.research.google.com to view.

In [33]:
corrMatrix = X_train_cleaned_df.corr().abs()
corrMatrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,273,274,275,276,277,278,279,280,281,282
0,1.000000,0.347392,0.181689,0.028990,0.067517,0.012059,0.029661,0.041959,0.074940,0.055525,...,0.014274,0.044930,0.036077,0.021893,0.003126,0.016836,0.024535,0.007577,0.025855,0.052037
1,0.347392,1.000000,0.323564,0.229961,0.041472,0.103371,0.078255,0.177681,0.237865,0.046173,...,0.055619,0.120790,0.005146,0.080268,0.010423,0.039633,0.020571,0.017408,0.075574,0.118984
2,0.181689,0.323564,1.000000,0.126189,0.011423,0.021431,0.023085,0.117168,0.225901,0.121899,...,0.017113,0.014558,0.002233,0.002862,0.031164,0.015032,0.006673,0.012769,0.012045,0.017686
3,0.028990,0.229961,0.126189,1.000000,0.102594,0.584837,0.542097,0.388574,0.236946,0.051998,...,0.024457,0.328633,0.048795,0.233761,0.111399,0.047611,0.055069,0.014982,0.141150,0.323874
4,0.067517,0.041472,0.011423,0.102594,1.000000,0.365514,0.054347,0.130147,0.030814,0.044440,...,0.022564,0.166597,0.021727,0.170630,0.041401,0.043978,0.031979,0.012146,0.163941,0.161220
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278,0.016836,0.039633,0.015032,0.047611,0.043978,0.052269,0.046300,0.013223,0.017766,0.017105,...,0.003844,0.018616,0.002428,0.023776,0.016527,1.000000,0.005448,0.007124,0.119354,0.018902
279,0.024535,0.020571,0.006673,0.055069,0.031979,0.001576,0.025692,0.006344,0.008635,0.027115,...,0.006093,0.029511,0.003849,0.037691,0.026199,0.005448,1.000000,0.011294,0.189203,0.029964
280,0.007577,0.017408,0.012769,0.014982,0.012146,0.018863,0.025386,0.000503,0.005916,0.002754,...,0.007969,0.038594,0.005033,0.028767,0.034263,0.007124,0.011294,1.000000,0.247439,0.039187
281,0.025855,0.075574,0.012045,0.141150,0.163941,0.163051,0.130199,0.072950,0.021088,0.036822,...,0.032204,0.646571,0.084323,0.645645,0.574007,0.119354,0.189203,0.247439,1.000000,0.656501


In [34]:
# Select upper triangle of correlation matrix
upper = corrMatrix.where(np.triu(np.ones(corrMatrix.shape), k=1).astype(bool))

# Find index of feature columns with correlation greater than 0.95
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
to_drop

[42, 44, 54, 149, 150, 153, 157, 213, 282]

In [35]:
#drop cols train set
X_train_selected = X_train_cleaned_df.drop(columns=to_drop)

#drop cols test set
X_test_selected = X_test_cleaned_df.drop(columns=to_drop)

In [36]:
# Decision tree
tree = DecisionTreeRegressor() # initialize
tree.fit(X_train_selected, y_train)
tree_pred = tree.predict(X = X_test_selected)

# K Nearest neighbors
neigh = KNeighborsRegressor(n_neighbors=1) # initialize with parameters
neigh.fit(X_train_selected, y_train) # fit to train set
knn_pred = neigh.predict(X = X_test_selected)

performances["collinearity_tree"]= r2_score(y_test, tree_pred)
performances["collinearity_knn"] = r2_score(y_test, knn_pred)

performances

{'baseline_knn': 0.5896397369019726,
 'baseline_tree': 0.81215958795593,
 'collinearity_knn': 0.5580422915738912,
 'collinearity_tree': 0.8100576179330363,
 'varThreshold_2_knn': 0.5896397369019726,
 'varThreshold_2_tree': 0.7957779994923654,
 'varThreshold_knn': 0.5896397369019726,
 'varThreshold_tree': 0.800288739025085}

Select K Best

In [37]:
from sklearn.feature_selection import SelectKBest, f_regression

KBest = SelectKBest(score_func=f_regression, k=100)

In [38]:
# transform train set
train_X_KBest = KBest.fit_transform(X_train_cleaned, y_train)

# transform test set
test_X_KBest = KBest.transform(X_test_cleaned)

In [39]:
train_X_KBest.shape

(1168, 100)

In [40]:
# Decision tree
tree = DecisionTreeRegressor() # initialize
tree.fit(train_X_KBest, y_train)
tree_pred = tree.predict(X = test_X_KBest)

# K Nearest neighbors
neigh = KNeighborsRegressor(n_neighbors=1) # initialize with parameters
neigh.fit(train_X_KBest, y_train) # fit to train set
knn_pred = neigh.predict(X = test_X_KBest)

performances["KBest_tree"]= r2_score(y_test, tree_pred)
performances["KBest_knn"] = r2_score(y_test, knn_pred)

performances

{'KBest_knn': 0.5164985276435017,
 'KBest_tree': 0.7953915530076328,
 'baseline_knn': 0.5896397369019726,
 'baseline_tree': 0.81215958795593,
 'collinearity_knn': 0.5580422915738912,
 'collinearity_tree': 0.8100576179330363,
 'varThreshold_2_knn': 0.5896397369019726,
 'varThreshold_2_tree': 0.7957779994923654,
 'varThreshold_knn': 0.5896397369019726,
 'varThreshold_tree': 0.800288739025085}

RFE

In [41]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFECV

In [42]:
RFE_selector_tree = RFECV(DecisionTreeRegressor())

In [43]:
# Decision tree
RFE_selector_tree.fit(X_train_cleaned, y_train) # train_X_KBest
tree_pred = RFE_selector_tree.predict(X =X_test_cleaned ) # test_X_KBest

performances["RFE_tree"]= r2_score(y_test, tree_pred)

performances

{'KBest_knn': 0.5164985276435017,
 'KBest_tree': 0.7953915530076328,
 'RFE_tree': 0.7899963855910879,
 'baseline_knn': 0.5896397369019726,
 'baseline_tree': 0.81215958795593,
 'collinearity_knn': 0.5580422915738912,
 'collinearity_tree': 0.8100576179330363,
 'varThreshold_2_knn': 0.5896397369019726,
 'varThreshold_2_tree': 0.7957779994923654,
 'varThreshold_knn': 0.5896397369019726,
 'varThreshold_tree': 0.800288739025085}

Select from model

In [44]:
from sklearn.feature_selection import SelectFromModel

select_model_tree = SelectFromModel(DecisionTreeRegressor(), threshold=None)

train_X_selected_model_tree = select_model_tree.fit_transform(X_train_cleaned, y_train)

test_X_selected_model_tree = select_model_tree.transform(X_test_cleaned)


In [45]:
train_X_selected_model_tree.shape

(1168, 17)

In [46]:
# Decision tree
tree = DecisionTreeRegressor() # initialize
tree.fit(train_X_selected_model_tree, y_train)
tree_pred = tree.predict(X = test_X_selected_model_tree)

# K Nearest neighbors
neigh = KNeighborsRegressor(n_neighbors=1) # initialize with parameters
neigh.fit(train_X_selected_model_tree, y_train) # fit to train set
knn_pred = neigh.predict(X = test_X_selected_model_tree)

performances["model_selected_tree"]= r2_score(y_test, tree_pred)
performances["model_selected_knn"] = r2_score(y_test, knn_pred)

performances

{'KBest_knn': 0.5164985276435017,
 'KBest_tree': 0.7953915530076328,
 'RFE_tree': 0.7899963855910879,
 'baseline_knn': 0.5896397369019726,
 'baseline_tree': 0.81215958795593,
 'collinearity_knn': 0.5580422915738912,
 'collinearity_tree': 0.8100576179330363,
 'model_selected_knn': 0.6665824558731592,
 'model_selected_tree': 0.7823940758095103,
 'varThreshold_2_knn': 0.5896397369019726,
 'varThreshold_2_tree': 0.7957779994923654,
 'varThreshold_knn': 0.5896397369019726,
 'varThreshold_tree': 0.800288739025085}