In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import ElasticNetCV
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
import io
from google.colab import files

Function for Feature Engineering

In [2]:
def create_features(df):
    """Return a copy of ``df`` extended with engineered features.

    The input frame is not modified. The following columns are added:

    * ``TotalSF``: basement + ground-floor + upper-floor area.
    * ``TotalBaths``: full baths plus half baths weighted by 0.5, counting
      both the regular and the basement bath columns.
    * ``Age`` / ``YearsSinceRemodel``: ``YearSold`` minus the construction /
      renovation year, floored at 0.
    * ``TotalPorchSF``: sum of the terrace, veranda and porch area columns.
    * Interaction terms: ``OverallQuality_x_TotalSF``,
      ``OverallQuality_x_Age``, ``TotalSF_x_TotalBaths``,
      ``Age_x_OverallCondition``, ``YearsSinceRemodel_x_OverallQuality``.

    Missing inputs propagate as NaN into the derived columns, so that a
    downstream imputer can handle them.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every source column referenced below.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns plus the engineered ones.

    Raises
    ------
    KeyError
        If a required source column is absent from ``df``.
    """
    df_new = df.copy()

    df_new['TotalSF'] = (df_new['BasementTotalSF'] +
                         df_new['GroundFloorArea'] +
                         df_new['UpperFloorArea'])
    df_new['TotalBaths'] = (df_new['FullBaths'] +
                            0.5 * df_new['HalfBaths'] +
                            df_new['BasementFullBaths'] +
                            0.5 * df_new['BasementHalfBaths'])

    # A sale recorded before construction/renovation yields a negative
    # difference; floor those at 0. The vectorized clip keeps NaN as NaN,
    # whereas a per-element max(0, x) would turn a missing year into age 0.
    df_new['Age'] = (df_new['YearSold'] - df_new['ConstructionYear']).clip(lower=0)
    df_new['YearsSinceRemodel'] = (df_new['YearSold'] - df_new['RenovationYear']).clip(lower=0)

    df_new['TotalPorchSF'] = (df_new['TerraceArea'] + df_new['OpenVerandaArea'] +
                              df_new['EnclosedVerandaArea'] + df_new['SeasonalPorchArea'] +
                              df_new['ScreenPorchArea'])

    # Pairwise interaction terms built from the features above.
    df_new['OverallQuality_x_TotalSF'] = df_new['OverallQuality'] * df_new['TotalSF']
    df_new['OverallQuality_x_Age'] = df_new['OverallQuality'] * df_new['Age']
    df_new['TotalSF_x_TotalBaths'] = df_new['TotalSF'] * df_new['TotalBaths']
    df_new['Age_x_OverallCondition'] = df_new['Age'] * df_new['OverallCondition']
    df_new['YearsSinceRemodel_x_OverallQuality'] = df_new['YearsSinceRemodel'] * df_new['OverallQuality']

    return df_new

Load Dataset

In [3]:
# Prompt for the training file via Colab's upload helper. The result is
# indexed by file name in the next cell ('hotel_data_preprocessed.csv').
uploaded = files.upload()

Saving hotel_data_preprocessed.csv to hotel_data_preprocessed.csv


In [4]:
# Parse the uploaded training CSV from its in-memory bytes into a DataFrame.
train_df = pd.read_csv(io.BytesIO(uploaded['hotel_data_preprocessed.csv']))

In [5]:
# Second upload prompt, this time for the test file ('test.csv').
# NOTE(review): `uploadd` looks like a typo of `uploaded`, but the cell that
# reads test.csv refers to this exact name -- rename both together if at all.
uploadd = files.upload()

Saving test.csv to test.csv


In [6]:
# Parse the uploaded test CSV from its in-memory bytes into a DataFrame.
test_df = pd.read_csv(io.BytesIO(uploadd['test.csv']))

In [7]:
# Keep the test identifiers aside; 'Id' is dropped from the feature matrix
# later and is needed again when the submission file is written.
test_ids = test_df['Id']

Outlier Removal

In [8]:
# Rows whose combined floor area exceeds this value are treated as outliers.
TOTAL_SF_OUTLIER_THRESHOLD = 4400

# Compute the total surface area as a standalone Series so the raw training
# frame is not mutated (no temporary helper column to add and drop again).
total_sf_for_outlier = (train_df['BasementTotalSF'] +
                        train_df['GroundFloorArea'] +
                        train_df['UpperFloorArea'])
original_rows = len(train_df)

# Remove only rows that are strictly above the threshold. Filtering with
# `<= threshold` would also discard rows whose area is NaN (comparisons with
# NaN evaluate to False) and misreport them as outliers.
is_outlier = total_sf_for_outlier > TOTAL_SF_OUTLIER_THRESHOLD
train_df_cleaned = train_df.loc[~is_outlier].copy()
new_rows = len(train_df_cleaned)
print(f"Removed {original_rows - new_rows} outliers based on TotalSF > {TOTAL_SF_OUTLIER_THRESHOLD}")

# Log-transform the target (log1p); predictions are mapped back with expm1.
y = np.log1p(train_df_cleaned["HotelValue"])

Removed 20 outliers based on TotalSF > 4400


Apply Feature Engineering and Align Columns

In [9]:
# Engineer features for train and test. The target and the row identifier
# are removed first so they cannot end up in the feature matrix.
X = create_features(train_df_cleaned.drop(columns=["HotelValue", "Id"]))
test_X = create_features(test_df.drop(columns=["Id"]))

# Align train and test on the columns they share, keeping the training
# column order. A set gives constant-time membership checks.
test_X_cols = set(test_X.columns)
final_X_cols = [col for col in X.columns if col in test_X_cols]
X = X[final_X_cols]
test_X = test_X[final_X_cols]

# Split by dtype only once, after alignment, so the feature lists handed to
# the ColumnTransformer refer exclusively to columns that survived it.
numeric_features = X.select_dtypes(include=np.number).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Columns that are neither numeric nor object dtype (e.g. bool, category,
# datetime) belong to neither list and would be silently discarded by the
# ColumnTransformer, which drops unlisted columns by default.
assigned_cols = set(numeric_features) | set(categorical_features)
unassigned_cols = [col for col in final_X_cols if col not in assigned_cols]
if unassigned_cols:
    print(f"Warning: {len(unassigned_cols)} column(s) are neither numeric nor object "
          f"dtype and will be ignored by the preprocessor: {unassigned_cols}")

print(f"Working with {len(final_X_cols)} shared features.")
print(f"Numeric features: {len(numeric_features)}")
print(f"Categorical features: {len(categorical_features)}")

Working with 83 shared features.
Numeric features: 46
Categorical features: 37


Preprocessing Pipelines

In [10]:
# Numeric branch: fill gaps with the column median, standardize, then keep
# the principal components that explain 95% of the variance.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.95, random_state=42)),
])

# Categorical branch: replace missing values with the literal 'None', then
# one-hot encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='None')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Route each group of columns to its branch and concatenate the results.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

Elastic Net Model

In [11]:
# Elastic net whose L1/L2 mix and regularization strength are both chosen
# by 5-fold cross-validation over the grids given below.
enet = ElasticNetCV(
    l1_ratio=[0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1.0],
    alphas=np.logspace(-4, -1, 100),
    max_iter=10000,
    cv=5,
    n_jobs=-1,
    random_state=42,
)

# Full estimator: preprocessing followed by the regressor, so the transforms
# are fitted on the same data the model is trained on.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', enet),
])

Fit the model to the training data

In [12]:
# Fit the preprocessing steps and the elastic net together on the training
# features and the log1p-transformed target.
model.fit(X, y)

Get predictions for the test data

In [13]:
# The model was trained on log1p(HotelValue), so its raw output is on the
# log scale; expm1 maps it back to the original units.
preds_log = model.predict(test_X)
preds = np.expm1(preds_log)

Create Submission File

In [14]:
# Pair every test Id with its prediction and floor the predictions at zero.
submission = pd.DataFrame({"Id": test_ids, "HotelValue": preds}).assign(
    HotelValue=lambda frame: frame["HotelValue"].clip(lower=0)
)

# Write the submission without the DataFrame index column.
submission.to_csv("submission_ElasticNet.csv", index=False)
print("submission_ElasticNet.csv created successfully!")

submission_ElasticNet.csv created successfully!
