# Supervised Machine Learning: feature selection
Feature selection

---
## 1.&nbsp;Import libraries 💾

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_regression, RFECV, SelectFromModel

---
# Creating the Dataset for Feature Selection


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

url = "https://drive.google.com/file/d/1-PrhRxZgo-UOFKTWV7AAE7poam6pJ2wv/view?usp=drive_link" # regression_model
path = 'https://drive.google.com/uc?export=download&id='+url.split('/')[-2]
data = pd.read_csv(path)


data.head(2)
#data.columns

X = data.copy()

In [None]:
y = data.pop("SalePrice") # SalePrice is what we want to predict

In [None]:
data = data.drop("Id", axis=1)

---
# Train-test split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.2, random_state=42)

In [None]:
X_train.isnull().sum()

MSSubClass         0
MSZoning           0
LotFrontage      217
LotArea            0
Street             0
                ... 
MiscVal            0
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
Length: 79, dtype: int64

In [None]:
#X_train.info()

# Create a Dummy Model

In [None]:
# Dummy Model
from sklearn.metrics import r2_score

# Create a dummy model that always predicts SalePrice > $150,000
def dummy_model(X):
    return [1 if x >= 150000 else 0 for x in X]

# Make predictions using the dummy model on the training data
dummy_y_train_pred = dummy_model(y_train)

# Calculate the R-squared score for the dummy model on the training data
dummy_r2 = r2_score(y_true=y_train, y_pred=dummy_y_train_pred)

# Print or use 'dummy_r2_train' to evaluate the dummy model's performance on the training data
print("Dummy R-squared (R2) Score on Training Data:", dummy_r2)

Dummy R-squared (R2) Score on Training Data: -5.519404800266533


# Create a Baseline Model

In [None]:
baseline_tree = DecisionTreeRegressor()


In [None]:
# building the pipeline
X_cat = X_train.select_dtypes(exclude="number").copy() # will have all the string-columns
X_num = X_train.select_dtypes(include="number").copy() # will have all number columns


In [None]:
numeric_pipe = make_pipeline(
    SimpleImputer(strategy="mean"))


In [None]:
from sklearn.pipeline import make_pipeline, Pipeline

categoric_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="N_A"),
    OneHotEncoder(handle_unknown="ignore")
)


In [None]:
# Implement the  preprocessor

preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipe", numeric_pipe, X_num.columns),
        ("cat_pipe", categoric_pipe, X_cat.columns),
    ]
)

In [None]:
# Create the pipeline

tree_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', baseline_tree)
])

#fit the  Pipeline
tree_pipeline.fit(X_train, y_train)


# Define KNeighborsRegressor

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# Define your KNeighborsRegressor
baseline_knn = KNeighborsRegressor(n_neighbors=1)

# Split X_train into numeric and categorical columns
X_cat = X_train.select_dtypes(exclude="number").copy() # Categorical columns
X_num = X_train.select_dtypes(include="number").copy() # Numeric columns

# Define the pipeline for numeric data
numeric_pipe = make_pipeline(SimpleImputer(strategy="mean"))

# Define the pipeline for categorical data
categorical_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="N_A"),
    OneHotEncoder(handle_unknown="ignore")
)

# Combine the pipelines into a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipe", numeric_pipe, X_num.columns),
        ("cat_pipe", categorical_pipe, X_cat.columns),
    ]
)

# Create the final pipeline including the preprocessor and your KNeighborsRegressor
knn_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', baseline_knn)
])

# Train the model
knn_pipeline.fit(X_train, y_train)



# Making Prediction

In [None]:
# Make predictions on the test data with both models
from sklearn.metrics import mean_squared_error

knn_predictions = knn_pipeline.predict(X_test)
tree_predictions = tree_pipeline.predict(X_test)

# Evaluate the predictions
knn_mse = mean_squared_error(y_test, knn_predictions)
tree_mse = mean_squared_error(y_test, tree_predictions)

print("KNN Mean Squared Error:", knn_mse)
print("Decision Tree Mean Squared Error:", tree_mse)



KNN Mean Squared Error: 2792946663.907534
Decision Tree Mean Squared Error: 1813269841.291096


In [None]:
# r2 score: baseline vs decision_tree	knn

baseline_tree_r2 = r2_score(y_test, knn_predictions)
baseline_knn_r2 = r2_score(y_test, tree_predictions)

performances = pd.DataFrame({'decision_tree': baseline_tree_r2,
                             'knn': baseline_knn_r2},
                            index=['baseline'])

performances

Unnamed: 0,decision_tree,knn
baseline,0.635876,0.763599


---
# Feature selection based only on features

# Variance Threeshold

In [None]:
# Select only numeric columns from X_train
X_train_numeric = X_train.select_dtypes(include='number')




In [None]:
# Now calculate range and variance
range_var_df = pd.DataFrame({
    'Range': X_train_numeric.max() - X_train_numeric.min(),
    'Variance': X_train_numeric.var()
}).sort_values(by='Variance')

print(range_var_df)

                  Range      Variance
KitchenAbvGr        3.0  5.082697e-02
BsmtHalfBath        2.0  5.583263e-02
HalfBath            2.0  2.499530e-01
BsmtFullBath        3.0  2.703682e-01
FullBath            3.0  2.991132e-01
Fireplaces          3.0  4.161913e-01
GarageCars          4.0  5.478388e-01
BedroomAbvGr        8.0  6.477914e-01
OverallCond         8.0  1.245594e+00
YrSold              4.0  1.749375e+00
OverallQual         9.0  1.870382e+00
TotRmsAbvGrd       12.0  2.623446e+00
MoSold             11.0  7.132678e+00
YearRemodAdd       60.0  4.298969e+02
GarageYrBlt       110.0  6.115204e+02
LotFrontage       292.0  6.198616e+02
YearBuilt         138.0  9.409860e+02
3SsnPorch         508.0  9.934892e+02
PoolArea          738.0  1.734598e+03
MSSubClass        170.0  1.808959e+03
LowQualFinSF      572.0  2.293175e+03
ScreenPorch       480.0  3.122703e+03
EnclosedPorch     552.0  3.854327e+03
OpenPorchSF       547.0  4.821022e+03
WoodDeckSF        857.0  1.681844e+04
BsmtFinSF2  

In [None]:
range_var_df.head()

Unnamed: 0,Range,Variance
KitchenAbvGr,3.0,0.050827
BsmtHalfBath,2.0,0.055833
HalfBath,2.0,0.249953
BsmtFullBath,3.0,0.270368
FullBath,3.0,0.299113


In [None]:
range_var_df.tail()

Unnamed: 0,Range,Variance
BsmtUnfSF,2336.0,199241.3
BsmtFinSF1,5644.0,210746.2
GrLivArea,5308.0,275029.6
MiscVal,15500.0,305852.9
LotArea,213945.0,115764000.0


# Scaling the data

In [None]:

# Select only numerical columns for scaling
X_train_numeric = X_train.select_dtypes(include=['number'])

# Initialize the scaler
scaler = MinMaxScaler().set_output(transform="pandas")

# Fit and transform the numerical training data
X_train_scaled_numeric = scaler.fit_transform(X_train_numeric)

# Convert it back to DataFrame if needed
X_train_scaled_numeric_df = pd.DataFrame(X_train_scaled_numeric, index=X_train_numeric.index, columns=X_train_numeric.columns)



In [None]:
(
  pd.DataFrame({
  'Range': X_train_scaled_numeric.max() - X_train_scaled_numeric.min(),
  'Variance': X_train_scaled_numeric.var()})
  .sort_values(by='Variance')
)

Unnamed: 0,Range,Variance
MiscVal,1.0,0.001273
LotArea,1.0,0.002529
PoolArea,1.0,0.003185
3SsnPorch,1.0,0.00385
TotalBsmtSF,1.0,0.005202
KitchenAbvGr,1.0,0.005647
BsmtFinSF1,1.0,0.006616
LowQualFinSF,1.0,0.007009
LotFrontage,1.0,0.00727
1stFlrSF,1.0,0.007873


# Apply the VarianceThreshold transformation.

In [None]:
#selector = VarianceThreshold(threshold=0.02)
# based on my variance data 0.01

selector = VarianceThreshold(threshold=0.01)

X_train_var = selector.fit_transform(X_train_scaled_numeric)

In [None]:
#print("shape before:", X_train_scaled_numeric.shape)
#print("shape after:", X_train_var.shape)

In [None]:
# To scale the numerical part of the test data, use only transform
X_test_numeric = X_test.select_dtypes(include=['number'])
X_test_scaled_numeric = scaler.transform(X_test_numeric)

# Convert it back to DataFrame if needed
X_test_scaled_numeric_df = pd.DataFrame(X_test_scaled_numeric, index=X_test_numeric.index, columns=X_test_numeric.columns)

selector = VarianceThreshold(threshold=0.01)

X_train_var = selector.fit_transform(X_test_scaled_numeric)


In [None]:
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold

# Select only numeric columns
X_train_numeric = X_train.select_dtypes(include=['number'])
X_test_numeric = X_test.select_dtypes(include=['number'])

# Impute missing values
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train_numeric)
X_test_imputed = imputer.transform(X_test_numeric)

# Apply VarianceThreshold
selector = VarianceThreshold(threshold=0.01)
X_train_var = selector.fit_transform(X_train_imputed)
X_test_var = selector.transform(X_test_imputed)

# Now fit your models
var_tree = DecisionTreeRegressor()
var_tree.fit(X_train_var, y_train)
var_tree_pred = var_tree.predict(X_test_var)

var_knn = KNeighborsRegressor(n_neighbors=1)
var_knn.fit(X_train_var, y_train)
var_knn_pred = var_knn.predict(X_test_var)

performances.loc["varThreshold_0_01", "decision_tree"] = r2_score(y_test, var_tree_pred)
performances.loc["varThreshold_0_01", "knn"] = r2_score(y_test, var_knn_pred)

performances

Unnamed: 0,decision_tree,knn
baseline,0.635876,0.763599
varThreshold_0_01,0.795512,0.635966


# Scaling the data: 2nd iteration


In [None]:
selector2 = VarianceThreshold(threshold=0.005).set_output(transform="pandas")
X_train_var2 = selector2.fit_transform(X_train_scaled_numeric)

print("shape before:", X_train_scaled_numeric.shape)
print("shape after:", X_train_var2.shape)

shape before: (1168, 36)
shape after: (1168, 32)


In [None]:
X_test_var2 = selector2.transform(X_test_scaled_numeric)

In [None]:
# Decision tree.

# Impute missing values in your data
imputer = SimpleImputer(strategy='mean')  # or 'median', or 'most_frequent'

X_train_imputed = imputer.fit_transform(X_train_var2)
X_test_imputed = imputer.transform(X_test_var2)

# Now fit your model
var2_tree = DecisionTreeRegressor()
var2_tree.fit(X_train_imputed, y_train)
var2_tree_pred = var2_tree.predict(X_test_imputed)


# K-Nearest Neighbors.
var2_knn = KNeighborsRegressor(n_neighbors=1)
var2_knn.fit(X_train_imputed, y_train)
var2_knn_pred = var2_knn.predict(X_test_imputed)


That's made an improvement in both models!

# Collinearity

In [None]:
# Calculate the absolute correlation matrix for the feature matrix X_train_var2

corrMatrix = X_train_var2.corr().abs()

# Set the size of the heatmap to be 18x18 inches
#plt.figure(figsize=(18, 18))

#sns.heatmap(corrMatrix,
#            annot=True,
#            cmap='coolwarm',
#            linewidths=0.5);

In [None]:
#corrMatrix.columns
num_features = len(corrMatrix.columns)
num_features

32

In [None]:
# Set the correlation threshold to consider columns as highly correlated
correlation_threshold = 0.88

# Initialise an empty list to store the pairs of highly correlated columns
highly_correlated_columns = []

# Get the number of features (columns) in the correlation matrix
num_features = len(corrMatrix.columns)

# Loop through the upper triangle of the correlation matrix to find highly correlated columns
# Note: We start from i+1 to avoid redundancy as correlation_matrix is symmetric
for i in range(num_features):
    for j in range(i + 1, num_features):
        # Check if the correlation value between columns i and j is greater than or equal to the threshold
        if corrMatrix.iloc[i, j] >= correlation_threshold:
            # Append the tuple (column_i, column_j) to the list of highly correlated columns
            highly_correlated_columns.append((corrMatrix.columns[i], corrMatrix.columns[j], f"correlation = {round(corrMatrix.iloc[i, j], 2)}"))

# Print the pairs of highly correlated columns
print("Highly correlated columns:", highly_correlated_columns)

Highly correlated columns: [('GarageCars', 'GarageArea', 'correlation = 0.88')]


In [None]:
to_drop = [element_a for element_a, element_b, element_c in highly_correlated_columns]
to_drop

['GarageCars']

In [None]:
# Drop the columns from the train set.
X_train_corr = X_train.drop(columns=to_drop).copy()

# Drop the columns from the test set.
X_test_corr = X_test.drop(columns=to_drop).copy()

In [None]:
# Select only numeric columns
X_train_corr_numeric = X_train_corr.select_dtypes(include=['number'])
X_test_corr_numeric = X_test_corr.select_dtypes(include=['number'])

# Impute missing values in the numeric columns
X_train_corr_imputed = imputer.fit_transform(X_train_corr_numeric)
X_test_corr_imputed = imputer.transform(X_test_corr_numeric)


In [None]:
# Decision tree.
corr_tree = DecisionTreeRegressor()
corr_tree.fit(X_train_corr_imputed, y_train)
corr_tree_pred = corr_tree.predict(X_test_corr_imputed)

# K-Nearest Neighbors.
corr_knn = KNeighborsRegressor(n_neighbors=1)
corr_knn.fit(X_train_corr_imputed, y_train)
corr_knn_pred = corr_knn.predict(X_test_corr_imputed)

# Calculate and store the R-squared scores in the 'performances' DataFrame
performances.loc["collinearity", "decision_tree"] = r2_score(y_test, corr_tree_pred)
performances.loc["collinearity", "knn"] = r2_score(y_test, corr_knn_pred)

# Display the performances DataFrame
performances

Unnamed: 0,decision_tree,knn
baseline,0.635876,0.763599
varThreshold_0_01,0.795512,0.635966
collinearity,0.790468,0.635966


---
# Feature selection based on features and labels

# K Best



In [None]:
# Run the univariate (= one feature at a time) linear regression tests.
# Split your data into numerical and categorical features as you did before
X_num = X_train.select_dtypes(include="number").copy()
X_cat = X_train.select_dtypes(exclude="number").copy()

# Impute missing values in numerical features using mean imputation
imputer = SimpleImputer(strategy="mean")
X_num_imputed = imputer.fit_transform(X_num)

# Calculate F-statistics and p-values for numerical features
f_stat, _ = f_regression(X_num_imputed, y_train)

# Create a DataFrame to store F-statistics and feature names
f_test_results = pd.DataFrame({"Feature": X_num.columns, "F_statistic": f_stat})

# Sort the features by F-statistic in descending order
f_test_results = f_test_results.sort_values(by="F_statistic", ascending=False)

# Print the sorted results
#print(f_test_results)

f_test_results.head(3)

Unnamed: 0,Feature,F_statistic
3,OverallQual,1879.151818
15,GrLivArea,1093.390361
25,GarageCars,813.186816


In [None]:
# Initialise KBest, using the f-regression and setting k=10.
KBest = SelectKBest(score_func=f_regression, k=10)

KBest

As always, after initialising the transformer we fit it to the train set and transform both the train and the test set:

In [None]:
# Separate numeric and categorical columns in your training set
X_train_numeric = X_train.select_dtypes(include=['number'])
X_train_categorical = X_train.select_dtypes(exclude=['number'])

# Impute missing values for numeric columns
numeric_imputer = SimpleImputer(strategy='mean')
X_train_numeric_imputed = numeric_imputer.fit_transform(X_train_numeric)

# Encode categorical columns using one-hot encoding
categorical_encoder = OneHotEncoder(handle_unknown='ignore')
X_train_categorical_encoded = categorical_encoder.fit_transform(X_train_categorical)

# Get the column names for one-hot encoded categorical features
categorical_feature_names = categorical_encoder.get_feature_names_out(input_features=X_train_categorical.columns)

# Combine numeric and one-hot encoded categorical columns after imputation
X_train_imputed = pd.concat([pd.DataFrame(X_train_numeric_imputed, columns=X_train_numeric.columns), pd.DataFrame(X_train_categorical_encoded.toarray(), columns=categorical_feature_names)], axis=1)

# Transform the training set with feature selection
X_train_KBest = KBest.fit_transform(X_train_imputed, y_train).copy()


In [None]:
# Separate numeric and categorical columns in your test set
X_test_numeric = X_test.select_dtypes(include=['number'])
X_test_categorical = X_test.select_dtypes(exclude=['number'])

# Impute missing values for numeric columns in the test set
X_test_numeric_imputed = numeric_imputer.transform(X_test_numeric)

# Encode categorical columns using one-hot encoding for the test set
X_test_categorical_encoded = categorical_encoder.transform(X_test_categorical)

# Combine numeric and one-hot encoded categorical columns after imputation for the test set
X_test_imputed = pd.concat([pd.DataFrame(X_test_numeric_imputed, columns=X_test_numeric.columns), pd.DataFrame(X_test_categorical_encoded.toarray(), columns=categorical_feature_names)], axis=1)

# Transform the test set with feature selection
X_test_KBest = KBest.transform(X_test_imputed).copy()


In [None]:
X_train_KBest.shape, X_test_KBest.shape

((1168, 10), (292, 10))

Let's see how the models perform with these 10 "best" features:

In [None]:
# Decision tree.
k10_tree = DecisionTreeRegressor()
k10_tree.fit(X_train_KBest, y_train)
k10_tree_pred = k10_tree.predict(X_test_KBest)

# K-Nearest Neighbors.
k10_knn = KNeighborsRegressor(n_neighbors=1)
k10_knn.fit(X_train_KBest, y_train)
k10_knn_pred = k10_knn.predict(X_test_KBest)

performances.loc["KBest_10", "decision_tree"]= r2_score(y_test, k10_tree_pred)
performances.loc["KBest_10", "knn"] = r2_score(y_test, k10_knn_pred)

performances

Unnamed: 0,decision_tree,knn
baseline,0.635876,0.763599
varThreshold_0_01,0.795512,0.635966
collinearity,0.790468,0.635966
KBest_10,0.806005,0.698444


# Recursive Feature Elimination

In [None]:
# Separate numeric and categorical columns in your training set
X_train_numeric = X_train.select_dtypes(include=['number'])
X_train_categorical = X_train.select_dtypes(exclude=['number'])

# Impute missing values for numeric columns
numeric_imputer = SimpleImputer(strategy='mean')
X_train_numeric_imputed = numeric_imputer.fit_transform(X_train_numeric)

# Separate numeric and categorical columns in your test set
X_test_numeric = X_test.select_dtypes(include=['number'])

# Impute missing values for numeric columns in the test set
X_test_numeric_imputed = numeric_imputer.transform(X_test_numeric)

rfe_tree = RFECV(DecisionTreeRegressor())
rfe_tree.fit(X_train_numeric_imputed, y_train)
rfe_tree_pred = rfe_tree.predict(X_test_numeric_imputed)

performances.loc["RFE", "decision_tree"] = r2_score(y_test, rfe_tree_pred)

performances.head(20)

Unnamed: 0,decision_tree,knn
baseline,0.635876,0.763599
varThreshold_0_01,0.795512,0.635966
collinearity,0.790468,0.635966
KBest_10,0.806005,0.698444
RFE,0.795008,


In [None]:
# Transform the training and test sets
#X_train_preprocessed = preprocessor.fit_transform(X_train)
#X_test_preprocessed = preprocessor.transform(X_test)

# Create RFECV with KNeighborsRegressor and use neg_mean_squared_error as the scoring method
#rfe_knn = RFECV(KNeighborsRegressor(), scoring='neg_mean_squared_error')

# Fit RFECV on the preprocessed training data
#rfe_knn.fit(X_train_preprocessed, y_train)

# Predict using the selected features on the preprocessed test data
#rfe_knn_pred = rfe_knn.predict(X_test_preprocessed)

# Calculate R-squared score based on the mean squared error
#mse = mean_squared_error(y_test, rfe_knn_pred)
#r2 = 1 - (mse / np.var(y_test))

# Update the performances DataFrame with the R-squared score
#performances.loc["RFE", "knn"] = r2

# Transform the training and test sets
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Create RFECV with DecisionTreeRegressor and use neg_mean_squared_error as the scoring method
rfe_dtree = RFECV(DecisionTreeRegressor(), scoring='neg_mean_squared_error')

# Fit RFECV on the preprocessed training data
rfe_dtree.fit(X_train_preprocessed, y_train)

# Get the selected features
selected_features = rfe_dtree.support_

# Apply feature selection to the training and test sets
X_train_selected = X_train_preprocessed[:, selected_features]
X_test_selected = X_test_preprocessed[:, selected_features]

# Fit KNeighborsRegressor on the selected features
knn_regressor = KNeighborsRegressor()
knn_regressor.fit(X_train_selected, y_train)

# Predict using the selected features on the preprocessed test data
knn_pred = knn_regressor.predict(X_test_selected)

# Calculate R-squared score
r2 = r2_score(y_test, knn_pred)

# Update the performances DataFrame with the R-squared score
performances.loc["RFE", "knn"] = r2




performances

Unnamed: 0,decision_tree,knn
baseline,0.635876,0.763599
varThreshold_0_01,0.795512,0.635966
collinearity,0.790468,0.635966
KBest_10,0.806005,0.698444
RFE,0.795008,0.703515


Great news! The model's performance has increased once more!

To extract the names of the features selected by RFE, you can refer to the `rfe_tree` object.

In [None]:
rfe_tree.get_feature_names_out()

array(['x0', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10',
       'x11', 'x12', 'x13', 'x14', 'x15', 'x16', 'x17', 'x18', 'x19',
       'x20', 'x21', 'x22', 'x23', 'x24', 'x25', 'x26', 'x27', 'x28',
       'x29', 'x30', 'x31', 'x32', 'x33', 'x34', 'x35'], dtype=object)

# Select from model


In [None]:

# Define numeric and categorical feature columns
numeric_features = X_train.select_dtypes(include=['number']).columns
categorical_features = X_train.select_dtypes(exclude=['number']).columns

# Create a ColumnTransformer to handle numeric and categorical features separately
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values with mean
    ('scaler', StandardScaler())  # Standardize numeric features
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values with mode
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # Encode categorical features
])

# Combine transformers using a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Create the SelectFromModel with DecisionTreeRegressor
select_model_tree = SelectFromModel(DecisionTreeRegressor(), threshold=None)

# Create a pipeline that includes feature selection and preprocessing
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('select_model_dtree_reg', select_model_tree)
])

# Fit the pipeline on the training set
X_train_selected_model_tree = pipeline.fit_transform(X_train, y_train)

# Transform the test set using the same pipeline
X_test_selected_model_tree = pipeline.transform(X_test)

# Convert the transformed data back to a DataFrame if needed
#X_train_selected_model_tree = pd.DataFrame(X_train_selected_model_tree)
#X_test_selected_model_tree = pd.DataFrame(X_test_selected_model_tree)


In [None]:
# Show shape of the train and test dataset to check for the number of features kept.
print(X_train_selected_model_tree.shape, X_test_selected_model_tree.shape)

(1168, 19) (292, 19)


Once again, only three features were retained. When different methods converge towards the same output, it instills confidence in the selection, indicating a robust choice.

In [None]:
select_model_tree.get_feature_names_out()

array(['x1', 'x2', 'x3', 'x4', 'x5', 'x8', 'x11', 'x12', 'x13', 'x15',
       'x24', 'x25', 'x26', 'x27', 'x34', 'x192', 'x206', 'x217', 'x243'],
      dtype=object)

We can now try how both the Decision Tree and the KNN perform with these three features:

In [None]:
# Decision tree.
select_model_tree = DecisionTreeRegressor()
select_model_tree.fit(X_train_selected_model_tree, y_train)
select_model_tree_pred = select_model_tree.predict(X_test_selected_model_tree)

# K-Nearest Neighbors.
select_model_knn = KNeighborsRegressor(n_neighbors=1)
select_model_knn.fit(X_train_selected_model_tree, y_train)
select_model_knn_pred = select_model_knn.predict(X_test_selected_model_tree)

performances.loc["model_selected", "decision_tree"] = r2_score(y_test, select_model_tree_pred)
performances.loc["model_selected", "knn"] = r2_score(y_test, select_model_knn_pred)

performances

Unnamed: 0,decision_tree,knn
baseline,0.635876,0.763599
varThreshold_0_01,0.795512,0.635966
collinearity,0.790468,0.635966
KBest_10,0.806005,0.698444
RFE,0.795008,0.703515
model_selected,0.781723,0.866113


# Regression Models

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV



In [1]:
# Define numeric and categorical feature columns

numeric_features = X_train.select_dtypes(include=['number']).columns
categorical_features = X_train.select_dtypes(exclude=['number']).columns

# Define transformers for numeric and categorical columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))  # You can use 'median' or other strategies as needed
])

#categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # You can choose a different strategy
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Create a preprocessor that applies the transformers to appropriate columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),  # Replace 'numeric_features' with your numeric feature names
        ('cat', categorical_transformer, categorical_features)  # Replace 'categorical_features' with your categorical feature names
    ])

# Create the final pipeline with preprocessing and the KNeighborsRegressor

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', KNeighborsRegressor())])

# Define hyperparameters for GridSearchCV

param_grid = {
    "regressor__n_neighbors": range(2, 14, 2),  # Adjust the range as needed
    "regressor__weights": ["uniform", "distance"]
}

# Note: 'greater_is_better=False' because lower MSLE is better

msle_scorer = make_scorer(mean_squared_log_error, greater_is_better=False)

# Create GridSearchCV

grid_search = GridSearchCV(pipeline, param_grid,msle_scorer, cv=5) #scoring="r2"#scoring=msle_scorer

# Fit the model to the training data

grid_search.fit(X_train, y_train)

# Predict on the test set

select_model_knn_pred = grid_search.predict(X_test)

# Calculate R-squared score for KNeighborsRegressor

performance_knn = r2_score(y_test, select_model_knn_pred)

# Add the performance metric to your 'performances' DataFrame

performances.loc["model_selected_GridSearchCV", "knn"] = performance_knn

# Display the performance metrics

performances


In [None]:
# Define numeric and categorical feature columns

#numeric_features = X_train.select_dtypes(include=['number']).columns
#categorical_features = X_train.select_dtypes(exclude=['number']).columns

# Define transformers for numeric and categorical columns
#numeric_transformer = Pipeline(steps=[
 #   ('imputer', SimpleImputer(strategy='mean'))  # You can use 'median' or other strategies as needed
#])

#categorical_transformer = Pipeline(steps=[
 #   ('imputer', SimpleImputer(strategy='most_frequent')),  # You can choose a different strategy
 #   ('onehot', OneHotEncoder(handle_unknown='ignore'))
#])

# Create a preprocessor that applies the transformers to appropriate columns
#preprocessor = ColumnTransformer(
 #   transformers=[
  #      ('num', numeric_transformer, numeric_features),  # Replace 'numeric_features' with your numeric feature names
  #      ('cat', categorical_transformer, categorical_features)  # Replace 'categorical_features' with your categorical feature names
  #  ])

# Create the final pipeline with preprocessing and the DecisionTreeRegressor
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', DecisionTreeRegressor())])

# Define hyperparameters for GridSearchCV
#param_grid = {
 #   'regressor__max_depth': [None, 10, 20],  # You can adjust other hyperparameters as well
#}
param_grid = {
    "regressor__max_depth": range(2, 14, 2),
    "regressor__min_samples_leaf": range(3, 12, 2)  # Modify min_samples_leaf range
}

msle_scorer = make_scorer(mean_squared_log_error, greater_is_better=False)

# Create GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid,scoring=msle_scorer, cv=5)

# Fit the model to the training data
grid_search.fit(X_train, y_train)

# Predict on the test set
select_model_tree_pred = grid_search.predict(X_test)

# Calculate R-squared score for DecisionTreeRegressor
performance_tree = r2_score(y_test, select_model_tree_pred)

# Add the performance metric to your 'performances' DataFrame
performances.loc["model_selected_GridSearchCV", "decision_tree"] = performance_tree


performances

In [None]:
# Now let's use mean squared log error

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import make_scorer, mean_squared_log_error

# Assuming 'preprocessor' is already defined and includes your preprocessing steps
pipeline = make_pipeline(
    preprocessor,
    DecisionTreeRegressor()
)

# Define the parameter grid
param_grid = {
    "decisiontreeregressor__max_depth": range(2, 14, 2),
    "decisiontreeregressor__min_samples_leaf": range(3, 12, 2)
}

# Create a scorer for mean squared log error
# Note: 'greater_is_better=False' because lower MSLE is better
msle_scorer = make_scorer(mean_squared_log_error, greater_is_better=False)

# Initialize GridSearchCV with the MSLE scorer
search = GridSearchCV(
    pipeline, param_grid, cv=5, verbose=1, scoring=msle_scorer
)

# Fit the GridSearchCV to your training data
search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = search.best_params_
print("Best Hyperparameters:", best_params)


In [None]:
# 2. fit the search
search.fit(X_train, y_train)



In [None]:
# 3. predict with search
y_train_pred = search.predict(X_train)

In [None]:
from sklearn.metrics import r2_score

# Calculate R-squared (R2) score
dt_r2 = r2_score(y_true=y_train, y_pred=y_train_pred)

dt_r2

In [None]:
# Identify numerical and categorical columns
numerical_cols = X_train.select_dtypes(include=['number']).columns
categorical_cols = X_train.select_dtypes(exclude=['number']).columns

# Create the preprocessing pipelines for both numerical and categorical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])


In [None]:
# Use the  Linear Regression Model with mean squared log error
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import make_scorer, mean_squared_log_error, r2_score

# Assuming 'preprocessor' is defined as per your dataset requirements
lr_pipeline = make_pipeline(
    preprocessor,
    LinearRegression()
)

# As LinearRegression has no hyperparameters for tuning, we use an empty param_grid
param_grid = {}

# Create a scorer for mean squared error (MSE)
#mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)

# Initialize GridSearchCV with the MSE scorer
lr_search = GridSearchCV(lr_pipeline, param_grid={}, cv=3, verbose=1)#scoring=mse_scorer,

# Create a scorer for mean squared log error
#msle_scorer = make_scorer(mean_squared_log_error, greater_is_better=False)

# Initialize GridSearchCV with the MSLE scorer
#lr_search = GridSearchCV(lr_pipeline, param_grid, cv=5, verbose=1, scoring=msle_scorer)

# Fit the search
lr_search.fit(X_train, y_train)

# Predict with the best model from the search
y_train_pred = lr_search.predict(X_train)

# Calculate R-squared (R2) score
lr_r2 = r2_score(y_train, y_train_pred)

print("R-squared (R2) Score:", lr_r2)


In [None]:
# Try Random Forest
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# Update the pipeline to use RandomForestRegressor
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', RandomForestRegressor())])

# Define hyperparameters for GridSearchCV (for RandomForestRegressor)
param_grid = {
    "regressor__n_estimators": [100, 200],  # Number of trees in the forest
    "regressor__max_depth": [None, 10, 20],  # Maximum depth of the trees
}

# Create GridSearchCV
grid_search_rf = GridSearchCV(pipeline, param_grid, scoring='neg_mean_squared_error', cv=5)

# Fit the model to the training data
grid_search_rf.fit(X_train, y_train)


from sklearn.metrics import r2_score

# Fit the GridSearchCV to your training data
grid_search_rf.fit(X_train, y_train)

# Predict with the best model from the search on the training data
y_train_pred_rf = grid_search_rf.predict(X_train)

# Calculate R-squared (R2) score for RandomForestRegressor
rf_r2 = r2_score(y_train, y_train_pred_rf)

print("R-squared (R2) Score:", rf_r2)


R-squared (R2) Score: 0.9789432511366413


# Prepare data for Kaggle Data Platform

Read the Kaggle Paltform instructions https://www.kaggle.com/

In [None]:
#import pandas as pd
#from sklearn.model_selection import train_test_split

# Load your data
#url = "https://drive.google.com/file/d/1-PrhRxZgo-UOFKTWV7AAE7poam6pJ2wv/view?usp=drive_link" # regression_model
#path = 'https://drive.google.com/uc?export=download&id='+url.split('/')[-2]
#data = pd.read_csv(path)

# Split data into features and target
#y = data.pop("SalePrice") # SalePrice is what we want to predict
#X = data
# Split the data into training and testing sets
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create new DataFrames for train and test sets with only 'Id' and 'SalePrice'
#train_df = pd.DataFrame({'Id': X_train['Id'], 'SalePrice': y_train})
#test_df = pd.DataFrame({'Id': X_test['Id'], 'SalePrice': y_test})

# Export to CSV
#train_df.to_csv('train.csv', index=False)
#test_df.to_csv('test.csv', index=False)


In [None]:
# Load the  train.csv and  test.csv files

#train = pd.read_csv('/content/train.csv', index_col="Id")

#test = pd.read_csv('/content/test.csv', index_col="Id")


In [None]:
test_url = "https://drive.google.com/file/d/1MZnPvWoGQtBHij32Rti26C2T0KT1xGBc/view?usp=drive_link"
test_path = 'https://drive.google.com/uc?export=download&id='+test_url.split('/')[-2]
test = pd.read_csv(test_path)

In [None]:
#test_preprocessed = preprocessor.transform(test)
#test_selected = test_preprocessed[:, selected_features]
#test_predictions = knn_regressor.predict(test_selected)
#test['SalePrice'] = test_predictions

In [None]:
#test["SalePrice"] = select_model_tree.predict(test)
test_predictions = grid_search_rf.predict(test)
# Add the predictions to the test DataFrame
test['SalePrice'] = test_predictions


In [None]:
test = test.reset_index()

In [None]:
test[["Id", "SalePrice"]].to_csv("./submission_10.csv", index=False)