In [1]:
import pandas as pd
import numpy as np
import datetime
import random

In [2]:
data = pd.read_csv('../assignment-1/data/train.csv')

In [20]:
# Encoding groups

#property_type_groups = [['Apartment','Serviced apartment'], ['House','Townhouse','Chalet'],['Condominium'],['Guesthouse','Guest suite'],['Loft'],['Bed & Breakfast'],['Other','Boutique hotel','Hostel','Camper/RV','Castle','Boat','Timeshare'],['Villa'],['Cabine','Earth House','Yurt','Dorm','Tent']]
property_type_groups = [[property_type] for property_type in data['property_type'].dropna().str.split(', ').explode().unique().tolist()]
#property_room_type_groups = [['Private room'],['Entire home/apt'],['Shared room']]
property_room_type_groups = [[property_room_type] for property_room_type in data['property_room_type'].dropna().str.split(', ').explode().unique().tolist()]
#booking_cancel_policy_groups = [['flexible'],['moderate'],['strict', 'super_strict_30']]
booking_cancel_policy_groups = [[booking_cancel_policy] for booking_cancel_policy in data['booking_cancel_policy'].dropna().str.split(', ').explode().unique().tolist()]
#property_bed_type_groups = [['Couch','Airbed','Futon','Pull-out Sofa'],['Real Bed']]
property_bed_type_groups = [[property_bed_type] for property_bed_type in data['property_bed_type'].dropna().str.split(', ').explode().unique().tolist()]
host_response_time_groups = [['a few days or more'],['within a day'],['within a few hours'],['within an hour'],['']]
amenity_groups = [[amenity] for amenity in data['property_amenities'].dropna().str.split(', ').explode().unique().tolist()]  # Each ammenity is a separate group
host_verified_groups = [[verification] for verification in data['host_verified'].dropna().str.split(', ').explode().unique().tolist()]  # Each host_verified status is a separate group
extra_groups = [[extra] for extra in data['extra'].dropna().str.split(', ').explode().unique().tolist()]  # Each extra comment is a separate group

In [52]:
def preprocessing(data):

    df = data.copy()

    # Replace NaN values with empty string
    df[['host_response_time','property_amenities','host_verified','extra']] = df[['host_response_time','property_amenities','host_verified','extra']].fillna('')

    # Count amenities
    df['amenities_count'] = df['property_amenities'].apply(lambda x: 0 if pd.isna(x) else x.count(',') + 1)

    # Property city 
    df['property_city'] = df['property_lat'].apply(lambda x: 1 if x > 51 else 0)

    # OneHotEncoding
    for group in property_type_groups:
        group_name = 'property_type' + '_' + group[0]
        df[group_name] = df['property_type'].isin(group).astype(int)
    for group in property_room_type_groups:
        group_name = 'property_room_type' + '_' + group[0]
        df[group_name] = df['property_room_type'].isin(group).astype(int)
    for group in booking_cancel_policy_groups:
        group_name = 'booking_cancel_policy' + '_' + group[0]
        df[group_name] = df['booking_cancel_policy'].isin(group).astype(int)
    for group in property_bed_type_groups:
        group_name = 'property_bed_type' + '_' + group[0]
        df[group_name] = df['property_bed_type'].isin(group).astype(int)
    for group in host_response_time_groups:
        group_name = 'host_response_time' + '_' + group[0]
        df[group_name] = df['host_response_time'].isin(group).astype(int)

    # MultiLabelEncoding
    for group in amenity_groups:
        new_col = df['property_amenities'].apply(lambda x: 1 if len(set(x.split(', ')).intersection(group)) > 0 else 0)
        group_name = 'property_amenities' + '_' + group[0]
        df = pd.concat([df, new_col.rename(group_name)], axis=1)
    for group in host_verified_groups:
        new_col = df['host_verified'].apply(lambda x: 1 if len(set(x.split(', ')).intersection(group)) > 0 else 0)
        group_name = 'host_verified' + '_' + group[0]
        df = pd.concat([df, new_col.rename(group_name)], axis=1)
    for group in extra_groups:
        new_col = df['extra'].apply(lambda x: 1 if len(set(x.split(', ')).intersection(group)) > 0 else 0)
        group_name = 'extra' + '_' + group[0]
        df = pd.concat([df, new_col.rename(group_name)], axis=1)


    return df


In [53]:
data_preprocessed = preprocessing(data)

In [57]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNet
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

# Keep only numerical columns
data_preprocessed = data_preprocessed.select_dtypes(include=['int64', 'float64'])

mean_imputer = SimpleImputer(strategy='mean')
standard_scaler = StandardScaler()
en_model = ElasticNet(alpha=1000, l1_ratio=0.7)

# Define the full pipeline
pipeline = Pipeline([
    ('standard_scaler', standard_scaler),
    ('mean_imputer', mean_imputer),
    ('en_model', en_model)
], verbose=True)

X = data_preprocessed.drop('target', axis=1)
y = data_preprocessed['target']

scores = cross_val_score(pipeline, X, y, cv=10, scoring='neg_root_mean_squared_error', error_score='raise')


print("Cross-validation scores:", -scores)
print("RMSE:", -scores.mean())

[Pipeline] ... (step 1 of 3) Processing standard_scaler, total=   0.1s
[Pipeline] ...... (step 2 of 3) Processing mean_imputer, total=   0.0s
[Pipeline] .......... (step 3 of 3) Processing en_model, total=   0.0s
[Pipeline] ... (step 1 of 3) Processing standard_scaler, total=   0.1s
[Pipeline] ...... (step 2 of 3) Processing mean_imputer, total=   0.0s
[Pipeline] .......... (step 3 of 3) Processing en_model, total=   0.0s
[Pipeline] ... (step 1 of 3) Processing standard_scaler, total=   0.1s
[Pipeline] ...... (step 2 of 3) Processing mean_imputer, total=   0.0s
[Pipeline] .......... (step 3 of 3) Processing en_model, total=   0.0s
[Pipeline] ... (step 1 of 3) Processing standard_scaler, total=   0.1s
[Pipeline] ...... (step 2 of 3) Processing mean_imputer, total=   0.0s
[Pipeline] .......... (step 3 of 3) Processing en_model, total=   0.0s
[Pipeline] ... (step 1 of 3) Processing standard_scaler, total=   0.0s
[Pipeline] ...... (step 2 of 3) Processing mean_imputer, total=   0.0s
[Pipel