In [19]:
# Install the third-party packages this notebook relies on into the kernel's environment:
# boruta (BorutaPy feature selection), pmdarima (auto_arima) and openpyxl (needed for the .xlsx export).
# NOTE(review): versions are not pinned, so a fresh environment may resolve different releases — consider pinning.
%pip install boruta pmdarima openpyxl

Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.9/250.9 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hDownloading et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5
Note: you may need to restart the kernel to use updated packages.


In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.formula.api import ols
import scipy.stats as stats
from sklearn.preprocessing import OneHotEncoder
from sklearn import tree
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import f_regression
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import cross_val_score
from itertools import product
from sklearn.ensemble import RandomForestRegressor
from boruta import BorutaPy as bp
import os
from pmdarima.arima import auto_arima
from sklearn.linear_model import Lasso, ElasticNet
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV, ElasticNetCV
from sklearn.metrics import mean_squared_error, r2_score

# Notebook-wide configuration.

# Let wide frames show up to 500 columns before pandas truncates the display.
pd.options.display.max_columns = 500

# Seed NumPy's legacy global RNG so anything drawing from it repeats across runs.
# The seed is ACTIVE here; comment this line out if run-to-run randomness is wanted.
np.random.seed(seed=2137)

In [6]:
# Log-target variant: the model is trained on `log_price`.
# Read the pre-split tables and discard two columns that must not be used as predictors:
#   'Unnamed: 0' — presumably the row index written out by an earlier to_csv call (TODO confirm)
#   'price_z'    — presumably another transform of the price, i.e. target leakage (TODO confirm)
df_train = pd.read_csv('train.csv').drop(columns=['Unnamed: 0', 'price_z'])
df_test = pd.read_csv('test.csv').drop(columns=['Unnamed: 0', 'price_z'])

# Predictors: every remaining column except the target.
X_train = df_train.drop(columns=['log_price'])
X_test = df_test.drop(columns=['log_price'])

# Target vectors.
y_train = df_train['log_price']
y_test = df_test['log_price']

In [7]:
# Standardize the predictors. The scaler learns its statistics from the training
# split only and the same fitted transform is then applied to both splits, so no
# test-set information leaks into the scaling.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [8]:
# Lasso with the regularization strength picked by 5-fold cross-validation,
# fitted on the standardized training predictors.
lasso = LassoCV(random_state=420, cv=5)
lasso.fit(X=X_train_scaled, y=y_train)

In [9]:
# Elastic Net with the regularization strength picked by 5-fold cross-validation,
# fitted on the standardized training predictors.
elastic = ElasticNetCV(random_state=420, cv=5)
elastic.fit(X=X_train_scaled, y=y_train)

In [10]:
# Test-set evaluation of the two cross-validated linear models.
def _print_test_metrics(label, y_true, y_pred):
    """Print RMSE and R^2 for one model's predictions.

    Parameters
    ----------
    label : str
        Model name used as the prefix of both printed lines.
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.
    """
    # RMSE is taken as sqrt(MSE) rather than via `squared=False`: that keyword was
    # deprecated in scikit-learn 1.4 and removed in 1.6, where it raises TypeError.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    print(f"{label} RMSE:", rmse)
    print(f"{label} R2:", r2_score(y_true, y_pred))


# Lasso predictions & evaluation
y_pred_lasso = lasso.predict(X_test_scaled)
_print_test_metrics("Lasso", y_test, y_pred_lasso)

# Elastic Net predictions & evaluation
y_pred_elastic = elastic.predict(X_test_scaled)
_print_test_metrics("Elastic Net", y_test, y_pred_elastic)

Lasso RMSE: 0.16112945115147603
Lasso R2: 0.8979471137965765
Elastic Net RMSE: 0.1611245148504549
Elastic Net R2: 0.8979533666081665




In [14]:
# Imports used by this cell, gathered in one place (most repeat the notebook's import cell).
import numpy as np
import pandas as pd
from boruta import BorutaPy
from scipy.stats import pearsonr
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import f_regression, mutual_info_regression
from sklearn.linear_model import ElasticNetCV
from sklearn.preprocessing import StandardScaler

# Inputs: X_train (DataFrame) and y_train (Series) must already exist; the scorers
# below assume every column is numeric.
feature_names = X_train.columns

# (1) Mutual information between each feature and the target.
mi_scores = mutual_info_regression(X_train, y_train, random_state=420)

# (2) Univariate F-test of each feature against the target: statistic and p-value.
f_scores, f_pvalues = f_regression(X_train, y_train)

# (3) Significance flag for the F-test: 1 where p < 0.05, otherwise 0.
sign_fscore_0_1 = np.asarray(f_pvalues < 0.05, dtype=int)

# (4) Pearson correlation of each feature with the target (coefficient only, p-value dropped).
corrs = [pearsonr(X_train[name], y_train)[0] for name in feature_names]

# (5) Elastic Net coefficients. The model is fitted on standardized predictors so the
#     coefficient magnitudes are comparable across features.
# NOTE(review): this rebinds `scaler`, `X_train_scaled` and `elastic` created by earlier
# cells (and uses random_state=42 where the earlier model used 420).
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

elastic = ElasticNetCV(random_state=42, cv=5).fit(X_train_scaled, y_train)
en_coefs = elastic.coef_

# (6) Boruta ranking built on a random forest. Optional: needs the `boruta` package and
#     can be slow. BorutaPy is given plain NumPy arrays, hence `.values`.
boruta_selector = BorutaPy(
    estimator=RandomForestRegressor(random_state=42, n_jobs=-1),
    n_estimators='auto',
    random_state=42,
)
boruta_selector.fit(X_train.values, y_train.values)
boruta_rank = boruta_selector.ranking_

# To skip Boruta, replace the four statements above with a placeholder column:
#boruta_rank = [np.nan] * len(feature_names)

In [16]:
# Collect every per-feature score into one table, one row per feature.
summary_table = pd.DataFrame(
    {
        "mi_score": mi_scores,
        # NOTE(review): despite the name, this column holds the raw F-statistic
        # (`f_scores`); the significance flag is the next column. The name is kept
        # so existing consumers of the table/export do not break.
        "sign_fscore": f_scores,
        "sign_fscore_0_1": sign_fscore_0_1,
        "corr": corrs,
        "EN_coef": en_coefs,
        "boruta_rank": boruta_rank,
    },
    index=feature_names,
)

# Order features from most to least relevant:
#   mi_score    — descending (higher = more informative)
#   boruta_rank — ascending  (Boruta assigns rank 1 to the confirmed/best features, so a
#                 descending sort would prefer the worst-ranked feature on ties)
#   corr        — descending
summary_table = summary_table.sort_values(
    ["mi_score", "boruta_rank", "corr"],
    ascending=[False, True, False],
)
print(summary_table)

                         mi_score    sign_fscore  sign_fscore_0_1      corr  \
market_volatility        1.656689  595561.938796                1  0.909032   
dim_m2                   0.184114   50639.881228                1  0.536705   
n_rooms                  0.131669   35917.879953                1  0.472211   
year_built               0.119453    3011.185439                1  0.153275   
loc_code_693f303c        0.093735   23217.771575                1  0.395571   
...                           ...            ...              ...       ...   
1900_1920                0.000801       2.066857                0  0.004064   
obj_type_other           0.000348      44.439758                1  0.018840   
1940_1950                0.000171      20.753242                1 -0.012876   
own_type_4e625087        0.000017       0.403948                0  0.001797   
neighborhood_crime_rate  0.000000       0.051691                0 -0.000643   

                          EN_coef  boruta_rank  
ma

In [20]:
# Persist the feature ranking (feature names are the index) as an Excel workbook in
# the working directory. Writing .xlsx requires the openpyxl package.
summary_table.to_excel(excel_writer="feature_ranking.xlsx")