In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.model_selection import KFold, GroupKFold, StratifiedKFold, TimeSeriesSplit
from sklearn.linear_model import ElasticNet, LogisticRegression
from sklearn.metrics import f1_score, mean_squared_error, r2_score, root_mean_squared_log_error

In [3]:
test_df = pd.read_json("data/train.json")

level_map = {'low': 1, 'medium': 2, 'high': 3}
test_df['interest_level'] = test_df['interest_level'].map(level_map)
X = test_df
y = test_df['price']

Взято из прошлого МЛ проекта и является препроцессингом

In [4]:
features = []
for i, r in X.iterrows():
    for j in r['features']:
        features.append(j)

feature_set = set(features)
print(f'unique values - {len(feature_set)}.\nFeatures  -  {feature_set}')

top_20 = Counter(features).most_common(20)

top_20_feature_names = []
for f, c in top_20:
    top_20_feature_names.append(f)
    X[f] = 0

for index, row in X.iterrows():
    for feat in row['features']:
        if feat in top_20_feature_names:
            X.at[index, feat] = 1

X_feat = test_df[['bathrooms', 'bedrooms', 'interest_level'] + list(top_20_feature_names)]
X_feat

unique values - 1556.
Features  -  {'Pet grooming room', "** BKLYN'S FINEST! * MASSIVE TRUE 2BR * COURTYARD * FULLY RENOVATED * 2 BLKS TO BEDFORD L STOP **", 'Harwood Floors', 'BIG OPEN SPACES', 'Bright & Sunny', 'Unassigned Paid parking available', 'ACT FAST WILL NOT LAST', 'Garage', '** NO FEE BKLN LUXURY! * EXPANSIVE 1BR w/HOME OFFICE * EAT-IN KITCHEN * FULL RENOVATED * 1 BLK TO THE PARK * STEPS TO L TRAIN **', 'Granite countertops', 'assigned-parking-space', '** RARE FIND ~ UNIQUE CARRIAGE HOUSE * MASSIVE TRUE 2BR * PRIVATE GARDEN * WASHER/DRYER * 2 BLKS TO L TRAIN **', 'Garden/Patio', 'Garage Parking', 'Storage Facilities available', 'water view', 'ACT FAST WONT LAST', 'Heat Included', 'NMW', 'RENOVATED 1 Bed', 'Hardwood Floors', 'walk in closet', 'Duplex', 'golf simulator', 'Free WiFi in Club lounge', 'Fitness Center', 'Sorry no pets', 'Huge Walk In Closet', 'Shared yard', 'Pilates and training room.', 'Walk-in-closet', 'Community recreation facilities', 'Recessed Lighting', 'HIG

Unnamed: 0,bathrooms,bedrooms,interest_level,Elevator,Cats Allowed,Hardwood Floors,Dogs Allowed,Doorman,Dishwasher,No Fee,...,Laundry in Unit,Roof Deck,Outdoor Space,Dining Room,High Speed Internet,Balcony,Swimming Pool,Laundry In Building,New Construction,Terrace
4,1.0,1,2,0,1,1,1,0,1,0,...,0,0,0,1,0,0,0,0,0,0
6,1.0,2,1,1,0,1,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
9,1.0,2,2,1,0,1,0,1,1,0,...,1,0,0,0,0,0,0,0,0,0
10,1.5,3,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15,1.0,0,1,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124000,1.0,3,1,1,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
124002,1.0,2,2,1,1,0,1,1,0,1,...,0,0,0,0,0,0,0,1,0,0
124004,1.0,1,2,1,1,1,1,0,1,1,...,1,0,0,1,0,0,0,0,0,0
124008,1.0,2,2,0,0,0,0,0,1,1,...,1,0,1,0,0,0,0,0,0,0


## 3RD Part

In [5]:
def chad_train_test_split(X, y, test_size=0.2):
    n = X.shape[0]
    n_test = int(n * test_size)
    n_train = n - n_test

    X_train = X.iloc[:n_train]
    y_train = y.iloc[:n_train]

    X_test = X.iloc[n_train:]
    y_test = y.iloc[n_train:]

    return X_train, X_test, y_train, y_test

In [6]:
def chat_train_test_valid_split(X, y, test_size=0.2, valid_size=0.2):
    n = X.shape[0]
    n_test = int(n * test_size)
    n_valid = int(n * valid_size)
    n_train = n - n_test - n_valid

    X_train = X.iloc[:n_train]
    y_train = y.iloc[:n_train]

    X_valid = X.iloc[n_train:n_train + n_valid]
    y_valid = y.iloc[n_train:n_train + n_valid]

    X_test = X.iloc[n_train + n_valid:]
    y_test = y.iloc[n_train + n_valid:]

    return X_train, X_valid, X_test, y_train, y_valid, y_test

In [7]:
def split_by_date(data, date_column, date_split):
    """
    Split data into training and testing sets based on a date.

    Parameters:
    data (pd.DataFrame): The input data.
    date_column (str): The name of the column containing date values.
    date_split (str): The date to split the data on (format: 'YYYY-MM-DD').

    Returns:
    pd.DataFrame, pd.DataFrame: The training and testing sets.
    """
    # Convert date_split to a pandas datetime object
    date_splited = pd.to_datetime(date_split)
    
    # Split the data
    train_data = data[data[date_column] < date_splited]
    test_data = data[data[date_column] >= date_splited]
    
    return train_data, test_data

In [8]:
def split_by_dates(data, date_column, validation_date, test_date):
    """
    Split data into training, validation, and testing sets based on dates.

    Parameters:
    data (pd.DataFrame): The input data.
    date_column (str): The name of the column containing date values.
    validation_date (str): The date to split the training and validation data (format: 'YYYY-MM-DD').
    test_date (str): The date to split the validation and testing data (format: 'YYYY-MM-DD').

    Returns:
    pd.DataFrame, pd.DataFrame, pd.DataFrame: The training, validation, and testing sets.
    """
    # Convert validation_date and test_date to pandas datetime objects
    validation_date = pd.to_datetime(validation_date)
    test_date = pd.to_datetime(test_date)
    
    # Split the data
    train_data = data[data[date_column] < validation_date]
    validation_data = data[(data[date_column] >= validation_date) & (data[date_column] < test_date)]
    test_data = data[data[date_column] >= test_date]
    
    return train_data, validation_data, test_data

## 4TH Part - KFold

In [9]:
def real_KFold(data, k=5):
    indices = []
    n = data.shape[0]
    fold_size = n // k
    
    for i in range(k):
        test_indices = list(range(i * fold_size, (i + 1) * fold_size))
        
        train_indices = list(set(range(n)) - set(test_indices))
        
        indices.append((np.array(train_indices), np.array(test_indices)))
    
    return np.array(indices, dtype=object)  # Convert the list to an ndarray

## Comparision KFold vs Mine

In [10]:
print("Original KFold")
kf = KFold(n_splits=10)
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    print(type(X_train), type(train_index))
    print(f"Train ind: {train_index}, Test ind: {test_index}")

print("Custom KFold")
for train_index, test_index in real_KFold(X, k=10):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    print(type(X_train), type(train_index))
    print(f"Train ind: {train_index}, Test ind: {test_index}")

Original KFold
<class 'pandas.core.frame.DataFrame'> <class 'numpy.ndarray'>
Train ind: [ 4936  4937  4938 ... 49349 49350 49351], Test ind: [   0    1    2 ... 4933 4934 4935]
<class 'pandas.core.frame.DataFrame'> <class 'numpy.ndarray'>
Train ind: [    0     1     2 ... 49349 49350 49351], Test ind: [4936 4937 4938 ... 9869 9870 9871]
<class 'pandas.core.frame.DataFrame'> <class 'numpy.ndarray'>
Train ind: [    0     1     2 ... 49349 49350 49351], Test ind: [ 9872  9873  9874 ... 14804 14805 14806]
<class 'pandas.core.frame.DataFrame'> <class 'numpy.ndarray'>
Train ind: [    0     1     2 ... 49349 49350 49351], Test ind: [14807 14808 14809 ... 19739 19740 19741]
<class 'pandas.core.frame.DataFrame'> <class 'numpy.ndarray'>
Train ind: [    0     1     2 ... 49349 49350 49351], Test ind: [19742 19743 19744 ... 24674 24675 24676]
<class 'pandas.core.frame.DataFrame'> <class 'numpy.ndarray'>
Train ind: [    0     1     2 ... 49349 49350 49351], Test ind: [24677 24678 24679 ... 29609 29

## Comparison GroupKFold

In [11]:
def real_GroupKFold(data, group_field, k=5):
    groups = data[group_field]
    group_kfold = GroupKFold(n_splits=k)
    
    indices = []
    for train_index, test_index in group_kfold.split(data, groups=groups):
        indices.append((train_index, test_index))
    
    return indices

In [12]:
folds = real_GroupKFold(X.head(10), group_field='bathrooms', k=5)

print("Custom GroupKFold")
for fold, (train_i, test_i) in enumerate(folds):
    print(train_i, test_i)

print("Original GroupKFold")
folds = GroupKFold(n_splits=5).split(X.head(10), groups=X.head(10)['bathrooms'])
for fold, (train_i, test_i) in enumerate(folds):
    print(train_i, test_i)

Custom GroupKFold
[3 6 8 9] [0 1 2 4 5 7]
[0 1 2 3 4 5 6 7 8] [9]
[0 1 2 3 4 5 7 8 9] [6]
[0 1 2 4 5 6 7 8 9] [3]
[0 1 2 3 4 5 6 7 9] [8]
Original GroupKFold
[3 6 8 9] [0 1 2 4 5 7]
[0 1 2 3 4 5 6 7 8] [9]
[0 1 2 3 4 5 7 8 9] [6]
[0 1 2 4 5 6 7 8 9] [3]
[0 1 2 3 4 5 6 7 9] [8]


## Comparison StratifiedKFold

In [13]:
def real_StratifiedKFold(stratify_field, k=5):
    X = stratify_field['bathrooms']
    y = stratify_field['price']
    skf = StratifiedKFold(n_splits=k)
    
    indices = []
    for train_index, test_index in skf.split(X, y):
        indices.append((train_index, test_index))
    
    return indices

In [14]:
print("Original StratifiedKFold")
skf = StratifiedKFold(n_splits=3)
test_strat = []
for train_i, test_i in skf.split(X['bathrooms'].head(100), y.head(100)):
    # print(train_i, test_i)
    # print(X.iloc[train_i], X.iloc[test_i])
    test_strat.append(train_i)
    test_strat.append(test_i)
    test_strat.append(X.iloc[train_i])
    test_strat.append(X.iloc[test_i])
    break

print("Custom StratifiedKFold")
skf = real_StratifiedKFold(test_df.head(100), k=3)
for train_i, test_i in skf:
    # print(train_i, test_i)
    # print(X.iloc[train_i], X.iloc[test_i])
    test_strat.append(train_i)
    test_strat.append(test_i)
    test_strat.append(X.iloc[train_i])
    test_strat.append(X.iloc[test_i])
    break

for i in range(4):
    if i < 2:  # Сравнение массивов индексов
        if not np.array_equal(test_strat[i], test_strat[i + 4]):
            print(False)
        else:
            print(True)
    else:  # Сравнение DataFrame
        if not test_strat[i].equals(test_strat[i + 4]):
            print(False)
        else:
            print(True)

Original StratifiedKFold
Custom StratifiedKFold
True
True
True
True




## Comparison TimeSeriesSplit

In [15]:
def real_TimeSeriesSplit(date_field, k):
    indices = []
    n = date_field.shape[0]
    actual_size = k + 1
    fold_size = n // actual_size
    for i in range(1, actual_size):
        test_indices = list(range(0, i * fold_size))
        train_indices = list(range(i * fold_size, (i + 1) * fold_size))
        indices.append((np.array(test_indices), np.array(train_indices)))
    return np.array(indices, dtype=object)

In [16]:
print("Original TimeSeriesSplit")
tscv = TimeSeriesSplit(n_splits=4)
for train_i, test_i in tscv.split(X.head(100)):
    print(train_i, test_i)
    print("next iter")
    # print(X.iloc[train_i], X.iloc[test_i])
    # break

print("Custom TimeSeriesSplit")
custom_tscv = real_TimeSeriesSplit(X.head(100), 4)
for train_i, test_i in custom_tscv:
    print(train_i, test_i)
    print("next iter")
    # print(X.iloc[train_i], X.iloc[test_i])
    # break

Original TimeSeriesSplit
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19] [20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39]
next iter
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39] [40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59]
next iter
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59] [60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79]
next iter
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
 72 73 74 75 76 77 78 79] [80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99]
next iter
Custom TimeSeriesSplit
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 1

In [17]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer

In [18]:
X.columns

for col in X.columns:
    if X[col].apply(lambda x: isinstance(x, list)).any():
        print(f"Column '{col}' contains lists.")
    if X[col].apply(lambda x: isinstance(x, str)).all():
        print(f"Column '{col}' contains strings.")

# drop features coz we have them already by onehotencoder
X.drop(columns=['features'], inplace=True)
X.drop(columns=['building_id', 'description', 'created', 'display_address', 'street_address'], inplace=True)
X['manager_id'] = X['manager_id'].apply(lambda x: "manager" if x == '0' or x =='_' or x == '' else "no manager")
# preprocess photos to numerical category is they are just exist
X['photos'] = X['photos'].apply(lambda x: 1 if isinstance(x, list) and len(x) > 0 else 0)

Column 'building_id' contains strings.
Column 'created' contains strings.
Column 'description' contains strings.
Column 'display_address' contains strings.
Column 'features' contains lists.
Column 'manager_id' contains strings.
Column 'photos' contains lists.
Column 'street_address' contains strings.


In [19]:
# list_columns = [col for col in X.columns if X[col].apply(lambda x: isinstance(x, list)).any()]
# for col in list_columns:
#     mlb = MultiLabelBinarizer()
#     # Limit the number of unique elements to encode
#     unique_values = X[col].explode().value_counts().index[:5]  # Keep top 10 unique values
#     filtered_col = X[col].apply(lambda x: [v for v in x if v in unique_values] if isinstance(x, list) else x)
#     encoded = mlb.fit_transform(filtered_col.dropna())
#     encoded_df = pd.DataFrame(encoded, columns=[f"{col}_{cls}" for cls in mlb.classes_])
#     X = pd.concat([X.drop(columns=[col]), encoded_df], axis=1)

categorical_columns = X.select_dtypes(include=['object']).columns
numerical_columns = X.select_dtypes(include=['number']).columns

# Preprocessing pipelines
numerical_transformer = SimpleImputer(strategy='mean')
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Combine preprocessors
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_columns),  # Specify numerical columns
        ('cat', categorical_transformer, categorical_columns)  # Specify categorical columns
    ]
)

# Fit the preprocessor and transform the data
X_preprocessed = preprocessor.fit_transform(X.head(1000))

# Get feature names for categorical columns
# Access the fitted OneHotEncoder from the preprocessor
onehot_encoder = preprocessor.named_transformers_['cat']['onehot']
cat_feature_names = onehot_encoder.get_feature_names_out(categorical_columns)

# Combine numerical and categorical feature names
preprocessed_columns = (
    numerical_columns.tolist() +
    cat_feature_names.tolist()
)

# Convert the preprocessed data to a DataFrame
X_preprocessed = pd.DataFrame(X_preprocessed, columns=preprocessed_columns)

In [20]:
elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5)
validation_schemes = {
    "KFold": real_KFold(X.head(1000), k=5),
    "GroupKFold": real_GroupKFold(X.head(1000), group_field='bathrooms', k=5),
    "StratifiedKFold": real_StratifiedKFold(test_df.head(1000), k=5),
    "TimeSeriesSplit": real_TimeSeriesSplit(X.head(1000), 5)
}
avg_mse_for_scheme = []
for name, scheme in validation_schemes.items():
    error = 0
    print(f"Validation scheme: {name}")
    fold = 1
    for train_i, test_i in scheme:
        X_train, X_test = X_preprocessed.iloc[train_i], X_preprocessed.iloc[test_i]
        y_train, y_test = y.iloc[train_i], y.iloc[test_i]

        elastic_net.fit(X_train, y_train)
        y_pred = elastic_net.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        error += mse
        print(f"Fold {fold}, MSE: {mse}")
        fold += 1
    print(f"Average MSE: {error / len(scheme)}")
    avg_mse_for_scheme.append((error / len(scheme), name))

best_option = sorted(avg_mse_for_scheme, key=lambda x: x[0])[0][1]
print("The best option is ", best_option)

Validation scheme: KFold
Fold 1, MSE: 6.003599429544046e-10
Fold 2, MSE: 1.520725425247479e-08
Fold 3, MSE: 7.572507102034844e-10
Fold 4, MSE: 9.105640584207847e-10
Fold 5, MSE: 1.9777550665528768e-07
Average MSE: 4.305018712386823e-08
Validation scheme: GroupKFold
Fold 1, MSE: 6.929425174717352e-07
Fold 2, MSE: 2.5134654045929735e-09
Fold 3, MSE: 6.83517443538594e-10
Fold 4, MSE: 1.4216654285484724e-08
Fold 5, MSE: 1.3503854242650193e-08
Average MSE: 1.4477200176960034e-07
Validation scheme: StratifiedKFold
Fold 1, MSE: 7.658440178325836e-10
Fold 2, MSE: 2.136506902885859e-08
Fold 3, MSE: 8.817675624683919e-10
Fold 4, MSE: 6.199201640286254e-07
Fold 5, MSE: 3.8830016665424227e-10
Average MSE: 1.2866422896088786e-07
Validation scheme: TimeSeriesSplit
Fold 1, MSE: 1.6366197928756758e-07
Fold 2, MSE: 6.202485136175875e-08
Fold 3, MSE: 4.799979743939862e-09
Fold 4, MSE: 4.850711561428246e-08
Fold 5, MSE: 8.162234405601605e-10
Average MSE: 5.5962029889621765e-08
The best option is  KFold




In [21]:
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler


In [22]:
X_preprocessed.drop(columns=['price'], inplace=True)
X.drop(columns=['price'], inplace=True)

In [23]:
X_train, X_valid, X_test, y_train, y_valid, y_test = chat_train_test_valid_split(X_preprocessed.head(1000), y.head(1000), test_size=0.2, valid_size=0.2)
# Normalize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.fit_transform(X_valid)
X_test_scaled = scaler.transform(X_test)

lasso = Lasso(alpha=0.1, random_state=21)
lasso.fit(X_train_scaled, y_train)

y_valid_pred = lasso.predict(X_valid_scaled)
valid_rmse = root_mean_squared_log_error(y_valid, y_valid_pred)
valid_mse = mean_squared_error(y_valid, y_valid_pred)
valid_r2 = r2_score(y_valid, y_valid_pred)

print(f"Validation RMSE: {valid_rmse:.4f}")
print(f"Validation MSE: {valid_mse:.4f}")
print(f"Validation R²: {valid_r2:.4f}")

Validation RMSE: 0.3344
Validation MSE: 1731752.4244
Validation R²: 0.5511


In [24]:
X_train.columns

Index(['bathrooms', 'bedrooms', 'latitude', 'listing_id', 'longitude',
       'photos', 'interest_level', 'Elevator', 'Cats Allowed',
       'Hardwood Floors', 'Dogs Allowed', 'Doorman', 'Dishwasher', 'No Fee',
       'Laundry in Building', 'Fitness Center', 'Pre-War', 'Laundry in Unit',
       'Roof Deck', 'Outdoor Space', 'Dining Room', 'High Speed Internet',
       'Balcony', 'Swimming Pool', 'Laundry In Building', 'New Construction',
       'Terrace', 'manager_id_no manager'],
      dtype='object')

In [25]:
feat_names = X_train.columns
coefficients = lasso.coef_

sorted_features = sorted(
    zip(feat_names, coefficients),
    key=lambda x: abs(x[1]),
    reverse=True
)

top_10_features = [feature for feature, coef in sorted_features[:10]]
print("\nTop 10 features:", top_10_features)


Top 10 features: ['bedrooms', 'bathrooms', 'interest_level', 'Doorman', 'Fitness Center', 'Laundry in Unit', 'Hardwood Floors', 'No Fee', 'Cats Allowed', 'High Speed Internet']


In [26]:
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_valid_scaled_df = pd.DataFrame(X_valid_scaled, columns=X_train.columns)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_train.columns)

# Subset the top 10 features
X_train_top10 = X_train_scaled_df[top_10_features]
X_valid_top10 = X_valid_scaled_df[top_10_features]
X_test_top10 = X_test_scaled_df[top_10_features]

In [27]:
lasso.fit(X_train_top10, y_train)
y_valid_pred = lasso.predict(X_valid_top10)
valid_rmse = root_mean_squared_log_error(y_valid, y_valid_pred)
valid_mse = mean_squared_error(y_valid, y_valid_pred)
valid_r2 = r2_score(y_valid, y_valid_pred)

print(f"Validation RMSE: {valid_rmse:.4f}")
print(f"Validation MSE: {valid_mse:.4f}")
print(f"Validation R²: {valid_r2:.4f}")

Validation RMSE: 0.3172
Validation MSE: 1657191.2126
Validation R²: 0.5704


In [28]:
def feature_selection_by_nan_and_correlation(X, y, nan_threshold=0.5, top_n=10):
    """
    Select features based on NaN ratio and correlation with the target variable.

    Parameters:
    - X: DataFrame, feature set
    - y: Series, target variable
    - nan_threshold: float, maximum allowed NaN ratio for a feature
    - top_n: int, number of top features to select based on correlation

    Returns:
    - selected_features: list of top feature names
    """
    # Filter features by NaN ratio
    nan_ratios = X.isna().mean()
    valid_features = nan_ratios[nan_ratios <= nan_threshold].index

    #  Filter features by correlation with the target
    correlations = X[valid_features].corrwith(y).abs()
    top_features = correlations.nlargest(top_n).index

    return top_features.tolist()

X_train, X_valid, X_test, y_train, y_valid, y_test = chat_train_test_valid_split(
    X_preprocessed.head(1000), y.head(1000), test_size=0.2, valid_size=0.2
)

scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_valid_scaled = pd.DataFrame(scaler.transform(X_valid), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_train.columns)

top_10_features = feature_selection_by_nan_and_correlation(X_train, y_train, nan_threshold=0.5, top_n=10)
print("Top 10 selected features:", top_10_features)

X_train_top10 = X_train_scaled[top_10_features]
X_valid_top10 = X_valid_scaled[top_10_features]
X_test_top10 = X_test_scaled[top_10_features]

lasso = Lasso(alpha=0.1, random_state=21)
lasso.fit(X_train_top10, y_train)

y_valid_pred = lasso.predict(X_valid_top10)
valid_rmse = np.sqrt(mean_squared_error(y_valid, y_valid_pred))
valid_mse = mean_squared_error(y_valid, y_valid_pred)
valid_r2 = r2_score(y_valid, y_valid_pred)

print("\nValidation Results with Top 10 Features:")
print(f"Validation RMSE: {valid_rmse:.4f}")
print(f"Validation MSE: {valid_mse:.4f}")
print(f"Validation R²: {valid_r2:.4f}")

Top 10 selected features: ['Pre-War', 'Roof Deck', 'Hardwood Floors', 'Dishwasher', 'Fitness Center', 'No Fee', 'Laundry in Unit', 'High Speed Internet', 'longitude', 'Dogs Allowed']

Validation Results with Top 10 Features:
Validation RMSE: 1844.8409
Validation MSE: 3403437.9786
Validation R²: 0.1177


  c /= stddev[:, None]
  c /= stddev[None, :]


In [29]:
from sklearn.inspection import permutation_importance

In [30]:
X_train, X_valid, X_test, y_train, y_valid, y_test = chat_train_test_valid_split(
    X_preprocessed.head(1000), y.head(1000), test_size=0.2, valid_size=0.2
)

scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_valid_scaled = pd.DataFrame(scaler.transform(X_valid), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_train.columns)

lasso = Lasso(alpha=0.1, random_state=21)
lasso.fit(X_train_scaled, y_train)

perm_importance = permutation_importance(
    lasso, X_valid_scaled, y_valid, scoring="neg_mean_squared_error", random_state=42
)

feature_importances = perm_importance.importances_mean
feature_names = X_train.columns

importance_df = pd.DataFrame({
    "feature": feature_names,
    "importance": feature_importances
})

importance_df = importance_df.sort_values(by="importance", ascending=False)
print("Feature Importance:\n", importance_df)

top_10_features = importance_df.head(10)["feature"].tolist()
print("\nTop 10 features:", top_10_features)

X_train_top10 = X_train_scaled[top_10_features]
X_valid_top10 = X_valid_scaled[top_10_features]
X_test_top10 = X_test_scaled[top_10_features]

lasso_top10 = Lasso(alpha=0.1, random_state=21)
lasso_top10.fit(X_train_top10, y_train)

y_valid_pred = lasso_top10.predict(X_valid_top10)
valid_rmse = np.sqrt(mean_squared_error(y_valid, y_valid_pred))
valid_mse = mean_squared_error(y_valid, y_valid_pred)
valid_r2 = r2_score(y_valid, y_valid_pred)

print("\nValidation Results with Top 10 Features:")
print(f"Validation RMSE: {valid_rmse:.4f}")
print(f"Validation MSE: {valid_mse:.4f}")
print(f"Validation R²: {valid_r2:.4f}")

Feature Importance:
                   feature    importance
1                bedrooms  1.029866e+06
0               bathrooms  1.011084e+06
6          interest_level  3.606773e+05
11                Doorman  2.231198e+05
17        Laundry in Unit  1.119860e+05
15         Fitness Center  9.856549e+04
13                 No Fee  4.480461e+04
9         Hardwood Floors  4.440982e+04
8            Cats Allowed  2.518080e+04
21    High Speed Internet  1.596566e+04
26                Terrace  1.517501e+04
4               longitude  8.823258e+03
19          Outdoor Space  4.844142e+03
3              listing_id  2.212025e+03
27  manager_id_no manager  0.000000e+00
2                latitude -4.684025e+02
10           Dogs Allowed -5.961842e+02
18              Roof Deck -1.040550e+03
22                Balcony -1.525247e+03
16                Pre-War -2.362490e+03
20            Dining Room -2.637189e+03
24    Laundry In Building -6.129569e+03
7                Elevator -6.269222e+03
14    Laundry in Bu

In [31]:
import shap

In [32]:
# Step 4: Use SHAP to compute feature importance
explainer = shap.LinearExplainer(lasso, X_train_scaled, feature_perturbation="interventional")
shap_values = explainer.shap_values(X_train_scaled)

shap_importance = np.abs(shap_values).mean(axis=0)
feature_names = X_train.columns

shap_importance_df = pd.DataFrame({
    "feature": feature_names,
    "importance": shap_importance
})

shap_importance_df = shap_importance_df.sort_values(by="importance", ascending=False)
print("SHAP Feature Importance:\n", shap_importance_df)

top_10_features = shap_importance_df.head(10)["feature"].tolist()
print("\nTop 10 features:", top_10_features)

X_train_top10 = X_train_scaled[top_10_features]
X_valid_top10 = X_valid_scaled[top_10_features]
X_test_top10 = X_test_scaled[top_10_features]

lasso_top10 = Lasso(alpha=0.1, random_state=21)
lasso_top10.fit(X_train_top10, y_train)

y_valid_pred = lasso_top10.predict(X_valid_top10)
valid_rmse = np.sqrt(mean_squared_error(y_valid, y_valid_pred))
valid_mse = mean_squared_error(y_valid, y_valid_pred)
valid_r2 = r2_score(y_valid, y_valid_pred)

print("\nValidation Results with Top 10 Features:")
print(f"Validation RMSE: {valid_rmse:.4f}")
print(f"Validation MSE: {valid_mse:.4f}")
print(f"Validation R²: {valid_r2:.4f}")

SHAP Feature Importance:
                   feature  importance
1                bedrooms  651.921815
0               bathrooms  454.900543
11                Doorman  315.923565
6          interest_level  294.870938
15         Fitness Center  159.299937
9         Hardwood Floors  153.263539
17        Laundry in Unit  150.907712
13                 No Fee  141.065184
8            Cats Allowed   96.082095
14    Laundry in Building   89.175609
3              listing_id   61.775694
23          Swimming Pool   45.376114
10           Dogs Allowed   42.364524
21    High Speed Internet   41.911853
12             Dishwasher   33.230697
7                Elevator   27.817875
24    Laundry In Building   26.722729
25       New Construction   24.962287
5                  photos   16.717719
22                Balcony   15.127863
4               longitude   13.328528
16                Pre-War    9.133075
26                Terrace    9.008596
19          Outdoor Space    5.402767
20            Dining Roo



## 

### Compare the quality of these methods for different aspects — speed, metrics and stability.

Speed - all of them lighning fast
Metrics - shap more realistic
Stability - all quite unstable

In [33]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

elastic_net = ElasticNet(random_state=21)

params_grid = {
    'alpha': [0.01, 0.1, 1, 10, 100],
    'l1_ratio': [0.1, 0.25, 0.5, 0.7, 0.9, 1.0]
}

grid_search = GridSearchCV(
    estimator=elastic_net,
    param_grid=params_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=1,
    n_jobs=-1
)
grid_search.fit(X_train_scaled, y_train)

print("Best parameters from Grid Search:", grid_search.best_params_)
print("Best score from Grid Search (negative MSE):", grid_search.best_score_)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best parameters from Grid Search: {'alpha': 100, 'l1_ratio': 1.0}
Best score from Grid Search (negative MSE): -6162158.653919515


In [34]:
param_dist = {
    'alpha': np.logspace(-3, 3, 100),  
    'l1_ratio': np.linspace(0.1, 1.0, 100)  
}

random_search = RandomizedSearchCV(
    estimator=elastic_net,
    param_distributions=param_dist,
    n_iter=50,  # Number of random combinations to try
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=1,
    random_state=42,
    n_jobs=-1
)
random_search.fit(X_train_scaled, y_train)

print("Best parameters from Random Search:", random_search.best_params_)
print("Best score from Random Search (negative MSE):", random_search.best_score_)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best parameters from Random Search: {'l1_ratio': 0.5636363636363636, 'alpha': 1.072267222010323}
Best score from Random Search (negative MSE): -6262062.935818916


In [35]:
# Step 6: Evaluate the best model from Grid Search
best_model_grid = grid_search.best_estimator_
y_valid_pred_grid = best_model_grid.predict(X_valid_scaled)
valid_mse_grid = mean_squared_error(y_valid, y_valid_pred_grid)
valid_r2_grid = r2_score(y_valid, y_valid_pred_grid)

print("\nValidation Results from Grid Search:")
print(f"Validation MSE: {valid_mse_grid:.4f}")
print(f"Validation R²: {valid_r2_grid:.4f}")

# Step 7: Evaluate the best model from Random Search
best_model_random = random_search.best_estimator_
y_valid_pred_random = best_model_random.predict(X_valid_scaled)
valid_mse_random = mean_squared_error(y_valid, y_valid_pred_random)
valid_r2_random = r2_score(y_valid, y_valid_pred_random)

print("\nValidation Results from Random Search:")
print(f"Validation MSE: {valid_mse_random:.4f}")
print(f"Validation R²: {valid_r2_random:.4f}")


Validation Results from Grid Search:
Validation MSE: 1760650.0206
Validation R²: 0.5436

Validation Results from Random Search:
Validation MSE: 1836688.4464
Validation R²: 0.5239


In [36]:
elastic_net = ElasticNet(alpha=100, l1_ratio=1)
elastic_net.fit(X_train_top10, y_train)
y_pred = elastic_net.predict(X_test_top10)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Test MSE: {mse:.4f}")
print(f"Test R²: {r2:.4f}")

Test MSE: 1792506.6942
Test R²: 0.4564


In [37]:
import optuna

def objective(trial):
    alpha = trial.suggest_float('alpha', 1e-3, 1e2)  
    l1_ratio = trial.suggest_float('l1_ratio', 0.1, 1.0)  

    model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
    model.fit(X_train_scaled, y_train)

    y_valid_pred = model.predict(X_valid_scaled)

    valid_mse = mean_squared_error(y_valid, y_valid_pred)

    # Return the negative MSE (Optuna minimizes the objective)
    return valid_mse

#  Create and run the Optuna study
study = optuna.create_study(direction='minimize') 
study.optimize(objective, n_trials=50, timeout=600)  

print("Best hyperparameters:", study.best_params)
print("Best validation MSE:", study.best_value)

# Refit the model with the best hyperparameters
best_alpha = study.best_params['alpha']
best_l1_ratio = study.best_params['l1_ratio']

best_model = ElasticNet(alpha=best_alpha, l1_ratio=best_l1_ratio, random_state=42)
best_model.fit(X_train_scaled, y_train)

#  Evaluate the best model on the validation set
y_valid_pred = best_model.predict(X_valid_scaled)
valid_rmse = np.sqrt(mean_squared_error(y_valid, y_valid_pred))
valid_mse = mean_squared_error(y_valid, y_valid_pred)
valid_r2 = r2_score(y_valid, y_valid_pred)

print("\nValidation Results with Best Hyperparameters:")
print(f"Validation RMSE: {valid_rmse:.4f}")
print(f"Validation MSE: {valid_mse:.4f}")
print(f"Validation R²: {valid_r2:.4f}")

[I 2025-03-29 04:16:06,748] A new study created in memory with name: no-name-d35bee12-4529-4c7d-a1a6-85c55a808d63
[I 2025-03-29 04:16:06,755] Trial 0 finished with value: 3493036.024266488 and parameters: {'alpha': 24.43079198781583, 'l1_ratio': 0.229001523254424}. Best is trial 0 with value: 3493036.024266488.
[I 2025-03-29 04:16:06,761] Trial 1 finished with value: 3673364.2622909034 and parameters: {'alpha': 46.60625232512089, 'l1_ratio': 0.12298163760093063}. Best is trial 0 with value: 3493036.024266488.
[I 2025-03-29 04:16:06,767] Trial 2 finished with value: 3729830.850013363 and parameters: {'alpha': 87.25629929247069, 'l1_ratio': 0.3505279311494807}. Best is trial 0 with value: 3493036.024266488.
[I 2025-03-29 04:16:06,774] Trial 3 finished with value: 2269769.818956364 and parameters: {'alpha': 5.883506180908306, 'l1_ratio': 0.7214358482247596}. Best is trial 3 with value: 2269769.818956364.
[I 2025-03-29 04:16:06,778] Trial 4 finished with value: 2529124.722084868 and parame

Best hyperparameters: {'alpha': 11.570710489390096, 'l1_ratio': 0.9901369288490579}
Best validation MSE: 1743032.7611500106

Validation Results with Best Hyperparameters:
Validation RMSE: 1320.2397
Validation MSE: 1743032.7612
Validation R²: 0.5482


# Result
### Optuna shows quite better results