# Robust regressor

# Load packages and data

In [3]:
import pandas as pd
import numpy as np
import datetime
import random
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.ensemble   import IsolationForest
from sklearn.neighbors import KNeighborsRegressor
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
data = pd.read_csv('../assignment-1/data/train.csv')
data.drop(labels=['property_id','property_name','property_summary','property_space','property_desc','property_neighborhood','property_notes','property_transit','property_access','property_interaction','property_rules','property_zipcode','host_location','host_about','host_nr_listings','reviews_first','reviews_last','booking_availability_30','booking_availability_60','booking_availability_90'], axis=1, inplace=True)

# Create folds randomly
constraint: same host_id -> same fold

In [21]:
def create_folds(df, num_folds):

    df['fold'] = 0
    host_ids = df['host_id'].unique()
    random.shuffle(host_ids)

    # Loop through groups and assign them to folds
    for id in host_ids:

        # Get the fold with the least number of groups
        fold_counts = df[df['host_id'] == id]['fold'].value_counts()
        fold_counts = fold_counts.sort_values()
        fold = fold_counts.index[0]
        # Assign group to fold
        df.loc[df['host_id'] == id, 'fold'] = fold
    
    # Determine how many groups should be in each set
    num_groups = len(group_keys)
    groups_per_set = num_groups // num_sets
    leftover_groups = num_groups % num_sets
    
    # Assign groups to sets
    set_assignments = {}
    current_set = 1
    for i, key in enumerate(group_keys):
        set_assignments[key] = current_set
        if (i+1) % groups_per_set == 0:
            if leftover_groups > 0:
                groups_per_set += 1
                leftover_groups -= 1
            current_set += 1
    
    # Assign set numbers to rows
    set_column = []
    for index, row in df.iterrows():
        set_column.append(set_assignments[row['host_id']])
    df['set'] = set_column

    df.drop(labels=['host_id'], axis=1, inplace=True)
    
    return df

In [22]:
create_folds(data, 10)

dict_keys([2582, 25132, 26889, 54635, 56149, 64972, 66998, 68522, 71215, 101192, 102116, 107470, 148681, 152687, 153943, 198732, 199370, 201873, 210030, 210378, 212410, 219560, 224702, 232654, 234077, 244707, 244722, 253564, 269196, 277415, 280345, 282079, 311029, 320270, 334804, 353088, 366948, 377245, 377916, 412687, 418931, 422138, 427339, 432392, 433586, 448970, 462975, 466151, 479071, 482601, 483803, 493502, 497175, 498207, 498420, 508039, 510480, 513606, 517059, 522504, 529276, 566744, 567706, 569792, 573329, 586942, 603038, 619835, 624176, 641036, 660744, 662573, 674142, 695831, 703854, 717733, 729673, 730761, 741719, 746502, 752504, 761100, 763075, 765131, 766382, 770466, 1006583, 1015024, 1020593, 1022745, 1030292, 1093398, 1104077, 1109751, 1125222, 1128692, 1133133, 1137222, 1142606, 1144478, 1148574, 1150677, 1151217, 1167377, 1185087, 1210163, 1243309, 1260050, 1263933, 1268709, 1271958, 1276012, 1301380, 1318419, 1334349, 1334881, 1357619, 1365079, 1366391, 1374437, 14006

NameError: name 'random' is not defined

# Add simple features

In [4]:
# Replace NaN values with empty string
data['host_response_time'].fillna('', inplace=True)

# Count amenities
data['amenities_count'] = data['property_amenities'].apply(lambda x: 0 if pd.isna(x) else x.count(',') + 1)

# Property city 
data['property_city'] = data['property_lat'].apply(lambda x: 1 if x > 51 else 0)

# Pipeline Components

## Preprocessing

### OneHotEncoding categorical data based on groups

In [5]:
class CustomEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, groups, col_name):
        self.groups = groups
        self.col_name = col_name
    
    def fit(self, X, y=None):
        # This transformer doesn't require any fitting, so we can just return self
        return self

    def transform(self, df):
        # create a copy of the original dataframe without the column to be grouped
        df_new = df.drop(self.col_name, axis=1).copy()

        # add a new column for each group
        for group in self.groups:
            group_name = self.col_name + '_' + group[0]
            df_new[group_name] = df[self.col_name].isin(group).astype(int)

        return df_new


### OneHotEncoding multilabel features

In [6]:
class CustomMultiLabelEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, groups, col_name):
        self.groups = groups
        self.col_name = col_name
    
    def fit(self, X, y=None):
        # This transformer doesn't require any fitting, so we can just return self
        return self

    def transform(self, df):
        # create a copy of the original dataframe without the column to be encoded
        df_new = df.copy()
        df_new.fillna({self.col_name: df[self.col_name].mode()[0]}, inplace=True)

        # Loop through the groups and create a new column for each group if at least one of the amenities in the group is present
        for group in self.groups:
            new_col = df_new[self.col_name].apply(lambda x: 1 if len(set(x.split(', ')).intersection(group)) > 0 else 0)
            group_name = self.col_name + '_' + group[0]
            df_new = pd.concat([df_new, new_col.rename(group_name)], axis=1)

        df_new.drop(self.col_name, axis=1, inplace=True)

        return df_new

### Change *property_last_updated* to numeric

In [7]:
class ConvertToDays(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X_old):
        def convert_to_days(value, property_scraped_at, host_since):
            if value == 'today':
                return 0
            elif value == 'yesterday':
                return 1
            elif value == 'never':
                property_scraped_at = datetime.datetime.strptime(property_scraped_at, '%Y-%m-%d')
                host_since = datetime.datetime.strptime(host_since, '%Y-%m-%d')
                difference = property_scraped_at - host_since
                return difference.days
            else:
                # split the string to get the number and unit
                if value.endswith('day ago'):
                    return 1
                elif value.endswith('week ago'):
                    return 7
                elif value.endswith('month ago'):
                    return 30
                else:
                    number, unit, _ = value.split()
    
                    # convert the number to an integer
                    number = int(number)
    
                    # convert the unit to a number of days
                    if unit == 'days':
                        return number
                    elif unit == 'weeks':
                        return number * 7
                    elif unit == 'months':
                        return number * 30
                    elif unit == 'week':
                        return 7
                    elif unit == 'month':
                        return 30
                    else:
                        raise ValueError(f"Invalid unit: {unit}")
        X = X_old.copy()
        X['property_last_updated'] = X.apply(lambda x: convert_to_days(x['property_last_updated'], x['property_scraped_at'], x['host_since']) if pd.notnull(x['property_last_updated']) else x['property_last_updated'], axis=1)
        X.drop(['property_scraped_at', 'host_since'], axis=1, inplace=True)
        return X


### Feature engineering: XGBoost

In [8]:
class XGBFeatureTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, feature_prefix=None, params=None):
        self.feature_prefix = feature_prefix
        self.params = params
        
    def fit(self, X, y=None):
        X_train = X.filter(regex=f'^{self.feature_prefix}')
        self.model = xgb.XGBRegressor(objective="reg:squarederror", n_estimators=100, max_depth=3, learning_rate=0.1)
        self.model.fit(X_train, y)
        return self
    
    def transform(self, X):
        X_test = X.filter(regex=f'^{self.feature_prefix}')
        y_pred = self.model.predict(X_test)
        X[f'{self.feature_prefix}xgb'] = y_pred
        X.drop(X_test.columns, axis=1, inplace=True)
        return X


### Feature engineering: KNN-location

In [9]:
class KNN_location(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.model = None
            
    def fit(self, X, y=None):
        self.model = KNeighborsRegressor(n_neighbors=k)
        self.model.fit(X_train[['property_lon','property_lat']], y)
        return self
    
    def transform(self, X):
        X['location_knn'] = self.model.predict(X[['property_lon','property_lat']])
        X.drop(['property_lon','property_lat'], axis=1, inplace=True)
        return X


# Assemble pipeline

In [1134]:
# Encoding groups
property_type_groups = [['Apartment','Serviced apartment'], ['House','Townhouse','Chalet'],['Condominium'],['Guesthouse','Guest suite'],['Loft'],['Bed & Breakfast'],['Other','Boutique hotel','Hostel','Camper/RV','Castle','Boat','Timeshare'],['Villa'],['Cabine','Earth House','Yurt','Dorm','Tent']]
property_room_type_groups = [['Private room'],['Entire home/apt'],['Shared room']]
booking_cancel_policy_groups = [['flexible'],['moderate'],['strict', 'super_strict_30']]
property_bed_type_groups = [['Couch','Airbed','Futon','Pull-out Sofa'],['Real Bed']]
host_response_time_groups = [['a few days or more'],['within a day'],['within a few hours'],['within an hour'],['']]
ammenity_groups = [[amenity] for amenity in data['property_amenities'].dropna().str.split(', ').explode().unique().tolist()]  # Each ammenity is a separate group
host_verified_groups = [[verification] for verification in data['host_verified'].dropna().str.split(', ').explode().unique().tolist()]  # Each host_verified status is a separate group
extra_groups = [[extra] for extra in data['extra'].dropna().str.split(', ').explode().unique().tolist()]  # Each extra comment is a separate group

# Imputers
mean_imputer = SimpleImputer(strategy='mean')
median_imputer = SimpleImputer(strategy='median')
mode_imputer = SimpleImputer(strategy='most_frequent')
iterative_imputer = IterativeImputer(random_state=0)  # play around with max_iter and tol; takes 2+ minutes to run
knn_imputer = KNNImputer(n_neighbors=100, weights='uniform', metric='nan_euclidean')  # first normalize?

# Scalers
standard_scaler = StandardScaler()

# Define the full pipeline
pipeline = Pipeline([
    ('Encoder_property_type', CustomEncoder(groups=property_type_groups, col_name='property_type')),
    ('Encoder_property_room_type', CustomEncoder(groups=property_room_type_groups, col_name='property_room_type')),
    ('Encoder_booking_cancel_policy', CustomEncoder(groups=booking_cancel_policy_groups, col_name='booking_cancel_policy')),
    ('Encoder_property_bed_type', CustomEncoder(groups=property_bed_type_groups, col_name='property_bed_type')),
    ('Encoder_host_response_time', CustomEncoder(groups=host_response_time_groups, col_name='host_response_time')),
    ('MultiLabelEncoder_ammenities', CustomMultiLabelEncoder(groups=ammenity_groups, col_name='property_amenities')),  # Includes mode imputation
    ('MultiLabelEncoder_host_verified', CustomMultiLabelEncoder(groups=host_verified_groups, col_name='host_verified')),  # Includes mode imputation
    ('MultiLabelEncoder_extra', CustomMultiLabelEncoder(groups=extra_groups, col_name='extra')),  # Includes mode imputation
    ('ConvertToDays_property_last_updated', ConvertToDays()),
    #('XGBFeatureTransformer_ammenities', XGBFeatureTransformer(feature_prefix='property_amenities_')),  # May be worse
    #('XGBFeatureTransformer_extra', XGBFeatureTransformer(feature_prefix='extra_')),  # May be worse
    #('knn_location', KNN_location()),  # Training not correct
    ('standard_scaler', standard_scaler),
    ('mean_imputer', mean_imputer)
    ], verbose=True)

# Fit and transform the data using the pipeline
X_train_transformed = pipeline.fit_transform(X_train, y_train)
X_test_transformed = pipeline.transform(X_test)


[Pipeline]  (step 1 of 11) Processing Encoder_property_type, total=   0.0s
[Pipeline]  (step 2 of 11) Processing Encoder_property_room_type, total=   0.0s
[Pipeline]  (step 3 of 11) Processing Encoder_booking_cancel_policy, total=   0.0s
[Pipeline]  (step 4 of 11) Processing Encoder_property_bed_type, total=   0.0s
[Pipeline]  (step 5 of 11) Processing Encoder_host_response_time, total=   0.0s
[Pipeline]  (step 6 of 11) Processing MultiLabelEncoder_ammenities, total=   1.5s
[Pipeline]  (step 7 of 11) Processing MultiLabelEncoder_host_verified, total=   0.2s
[Pipeline]  (step 8 of 11) Processing MultiLabelEncoder_extra, total=   0.1s
[Pipeline]  (step 9 of 11) Processing ConvertToDays_property_last_updated, total=   0.1s
[Pipeline] . (step 10 of 11) Processing standard_scaler, total=   0.0s
[Pipeline] .... (step 11 of 11) Processing mean_imputer, total=   0.0s


In [10]:
# Encoding groups
property_type_groups = [['Apartment','Serviced apartment'], ['House','Townhouse','Chalet'],['Condominium'],['Guesthouse','Guest suite'],['Loft'],['Bed & Breakfast'],['Other','Boutique hotel','Hostel','Camper/RV','Castle','Boat','Timeshare'],['Villa'],['Cabine','Earth House','Yurt','Dorm','Tent']]
property_room_type_groups = [['Private room'],['Entire home/apt'],['Shared room']]
booking_cancel_policy_groups = [['flexible'],['moderate'],['strict', 'super_strict_30']]
property_bed_type_groups = [['Couch','Airbed','Futon','Pull-out Sofa'],['Real Bed']]
host_response_time_groups = [['a few days or more'],['within a day'],['within a few hours'],['within an hour'],['']]
ammenity_groups = [[amenity] for amenity in data['property_amenities'].dropna().str.split(', ').explode().unique().tolist()]  # Each ammenity is a separate group
host_verified_groups = [[verification] for verification in data['host_verified'].dropna().str.split(', ').explode().unique().tolist()]  # Each host_verified status is a separate group
extra_groups = [[extra] for extra in data['extra'].dropna().str.split(', ').explode().unique().tolist()]  # Each extra comment is a separate group

# Imputers
mean_imputer = SimpleImputer(strategy='mean')
median_imputer = SimpleImputer(strategy='median')
mode_imputer = SimpleImputer(strategy='most_frequent')
iterative_imputer = IterativeImputer(random_state=0)  # play around with max_iter and tol; takes 2+ minutes to run
knn_imputer = KNNImputer(n_neighbors=100, weights='uniform', metric='nan_euclidean')  # first normalize?

# Scalers
standard_scaler = StandardScaler()

In [11]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LassoCV
from sklearn.feature_selection import SelectFromModel
from xgboost import XGBRegressor
from sklearn.linear_model import ElasticNet, Lasso, Ridge
from sklearn.svm import SVR

X = data.drop('target', axis=1)
y = data['target']

class LassoCVTransformer(LassoCV, TransformerMixin):
    def transform(self, X):
        return X

lasso = LassoCVTransformer()
selector = SelectFromModel(lasso)
xgb = XGBRegressor(verbose=1)
en_model = ElasticNet(alpha=100, l1_ratio=0.5)
lasso_model = Lasso(alpha=100)
ridge_model = Ridge(alpha=100)
svr_model = SVR(kernel='linear', C=100, gamma='auto')

# Define the full pipeline
pipeline = Pipeline([
    ('Encoder_property_type', CustomEncoder(groups=property_type_groups, col_name='property_type')),
    ('EEncoder_property_room_type', CustomEncoder(groups=property_room_type_groups, col_name='property_room_type')),
    ('EEncoder_booking_cancel_policy', CustomEncoder(groups=booking_cancel_policy_groups, col_name='booking_cancel_policy')),
    ('EEncoder_property_bed_type', CustomEncoder(groups=property_bed_type_groups, col_name='property_bed_type')),
    ('Encoder_host_response_time', CustomEncoder(groups=host_response_time_groups, col_name='host_response_time')),
    ('MultiLabelEncoder_ammenities', CustomMultiLabelEncoder(groups=ammenity_groups, col_name='property_amenities')),  # Includes mode imputation
    ('MultiLabelEncoder_host_verified', CustomMultiLabelEncoder(groups=host_verified_groups, col_name='host_verified')),  # Includes mode imputation
    ('MultiLabelEncoder_extra', CustomMultiLabelEncoder(groups=extra_groups, col_name='extra')),  # Includes mode imputation
    ('ConvertToDays_property_last_updated', ConvertToDays()),
    #('XGBFeatureTransformer_ammenities', XGBFeatureTransformer(feature_prefix='property_amenities_')),  # May be worse
    #('XGBFeatureTransformer_extra', XGBFeatureTransformer(feature_prefix='extra_')),  # May be worse
    #('knn_location', KNN_location()),  # Training not correct
    ('standard_scaler', standard_scaler),
    ('mean_imputer', mean_imputer),
    #('model', xgb.XGBRegressor(objective="reg:squarederror", n_estimators=100, max_depth=3, learning_rate=0.1))
    #('model', ElasticNet(alpha=100, l1_ratio=0.7))
    #('lasso', lasso), 
    #('selector', selector),
    ('svr_model', svr_model)
], verbose=True)

scores = cross_val_score(pipeline, X, y, cv=10, scoring='neg_root_mean_squared_error', error_score='raise')

print("Cross-validation scores:", -scores)
print("RMSE:", -scores.mean())

[Pipeline]  (step 1 of 12) Processing Encoder_property_type, total=   0.0s
[Pipeline]  (step 2 of 12) Processing EEncoder_property_room_type, total=   0.0s
[Pipeline]  (step 3 of 12) Processing EEncoder_booking_cancel_policy, total=   0.0s
[Pipeline]  (step 4 of 12) Processing EEncoder_property_bed_type, total=   0.0s
[Pipeline]  (step 5 of 12) Processing Encoder_host_response_time, total=   0.0s
[Pipeline]  (step 6 of 12) Processing MultiLabelEncoder_ammenities, total=   1.6s
[Pipeline]  (step 7 of 12) Processing MultiLabelEncoder_host_verified, total=   0.2s
[Pipeline]  (step 8 of 12) Processing MultiLabelEncoder_extra, total=   0.1s
[Pipeline]  (step 9 of 12) Processing ConvertToDays_property_last_updated, total=   0.1s
[Pipeline] . (step 10 of 12) Processing standard_scaler, total=   0.0s
[Pipeline] .... (step 11 of 12) Processing mean_imputer, total=   0.0s
[Pipeline] ....... (step 12 of 12) Processing svr_model, total= 3.9min
[Pipeline]  (step 1 of 12) Processing Encoder_property

In [980]:
# Removing outliers
IF = IsolationForest()
IF.fit(X_train_transformed)
# Generate outlier predictions (-1 for outliers, 1 for inliers)
outlier_preds = IF.predict(X_train_transformed)
# Filter out the outliers
X_train_transformed = X_train_transformed[outlier_preds == 1]
# Print the number of outliers removed
num_outliers = len(outlier_preds[outlier_preds == -1])
print(f'Removed {num_outliers} outliers')

y_train_transformed = y_train[outlier_preds == 1]


Removed 94 outliers


In [981]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

# Train the model
xgb_model = xgb.XGBRegressor(objective="reg:squarederror", n_estimators=100, max_depth=3, learning_rate=0.1)
xgb_model.fit(X_train_transformed, y_train_transformed)

# Predict the target values for the test set
y_pred = xgb_model.predict(X_test_transformed)

# Calculate the RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE: %f" % (rmse))


RMSE: 52.601360


In [982]:
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error

# Train the model
en_model = ElasticNet(alpha=100, l1_ratio=0.5)
en_model.fit(X_train_transformed, y_train_transformed)

# Predict the target values for the test set
y_pred = en_model.predict(X_test_transformed)

# Calculate the RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE: %f" % (rmse))


RMSE: 45.887416
