In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_files = (
    os.path.join(folder, leaf)
    for folder, _, leaves in os.walk('/kaggle/input')
    for leaf in leaves
)
for input_file in input_files:
    print(input_file)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Locations of the raw competition tables inside the Kaggle input directory.
TRAIN_CSV = "/kaggle/input/train-data/train.csv"
TEST_CSV = "/kaggle/input/test-data/test.csv"

# Load both tables as DataFrames; later cells impute and encode them.
train_data = pd.read_csv(TRAIN_CSV)
test_data = pd.read_csv(TEST_CSV)


In [None]:
import pandas as pd
import numpy as np

# Strip stray whitespace from the header names so the lookups below match exactly.
train_data.columns = train_data.columns.str.strip()

# Descriptor column -> column whose value being > 0 marks the amenity as present.
conditional_mapping = {
    'PoolQuality': 'SwimmingPoolArea',          # Only fill if pool exists
    'BasementHeight': 'BasementTotalSF',
    'BasementCondition': 'BasementTotalSF',
    'BasementExposure': 'BasementTotalSF',
    'BasementFacilityType1': 'BasementTotalSF',
    'BasementFacilityType2': 'BasementTotalSF',
    'LoungeQuality': 'Lounges',
    'ParkingType': 'ParkingArea',
    'ParkingFinish': 'ParkingArea',
    'ParkingQuality': 'ParkingArea',
    'ParkingCondition': 'ParkingArea',
    'ExtraFacility': 'ExtraFacilityValue'
}

# Impute each descriptor separately for rows where the amenity is present and absent.
for feature, exist_col in conditional_mapping.items():
    has_amenity = train_data[exist_col] > 0

    if train_data[feature].dtype in ['int64', 'float64']:
        # Numeric descriptor: median of the rows that have the amenity, 0 for the rest.
        present_fill = train_data.loc[has_amenity, feature].median()
        absent_fill = 0
    else:
        # Text descriptor: 'Unknown' when the amenity is present, 'None' when it is not.
        present_fill = 'Unknown'
        absent_fill = 'None'

    for row_mask, fill_value in ((has_amenity, present_fill), (~has_amenity, absent_fill)):
        train_data.loc[row_mask, feature] = train_data.loc[row_mask, feature].fillna(fill_value)

# Any numeric column that still has gaps gets its own median.
num_cols = train_data.select_dtypes(include=['int64', 'float64']).columns
for col in num_cols:
    numeric_column = train_data[col]
    if numeric_column.isnull().any():
        train_data[col] = numeric_column.fillna(numeric_column.median())

# Any text column that still has gaps gets its most frequent value.
cat_cols = train_data.select_dtypes(include=['object']).columns
for col in cat_cols:
    text_column = train_data[col]
    if text_column.isnull().any():
        train_data[col] = text_column.fillna(text_column.mode()[0])

# Per-column count of remaining missing values (expected to be all zeros).
print(train_data.isnull().sum())

**Best case Data Pre-processing used for all models**

In [None]:
# --- Encoding Section (after your preprocessing) ---
# Encode a copy so the imputed `train_data` frame is left untouched.
train_data_encoded = train_data.copy()

# --- Ordinal mappings ---
# Text label -> integer rank for every column treated as ordinal.
# NOTE(review): the ranks for BasementFacilityType1/2 and PropertyFunctionality
# (e.g. 'Maj1'/'Maj2' placed above 'Typ') do not look monotone in quality —
# confirm against the data dictionary. The values are kept as they were.
ordinal_maps = {
    'PoolQuality': {'None':0, 'Fa':1, 'Gd':2, 'Ex':3},
    'BasementHeight': {'None':0, 'Fa':1, 'TA':2, 'Gd':3, 'Ex':4},
    'BasementCondition': {'None':0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5},
    'BasementExposure': {'None':0, 'No':1, 'Mn':2, 'Av':3, 'Gd':4, 'Unknown':2},
    'LoungeQuality': {'None':0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5},
    'ParkingFinish': {'None':0, 'Unf':1, 'RFn':2, 'Fin':3},
    'ParkingQuality': {'None':0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5},
    'ParkingCondition': {'None':0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5},
    'BasementFacilityType1': {'None':0, 'Unf':1, 'LwQ':2, 'ALQ':3, 'Rec':4, 'GLQ':5, 'BLQ':6},
    'BasementFacilityType2': {'None':0, 'Unf':1, 'LwQ':2, 'ALQ':3, 'Rec':4, 'GLQ':5, 'BLQ':6},
    'ExteriorQuality': {'Fa':0, 'TA':1, 'Gd':2, 'Ex':3},
    'ExteriorCondition': {'Po':0, 'Fa':1, 'TA':2, 'Gd':3, 'Ex':4},
    'HeatingQuality': {'Po':0, 'Fa':1, 'TA':2, 'Gd':3, 'Ex':4},
    'KitchenQuality': {'Fa':0, 'TA':1, 'Gd':2, 'Ex':3},
    'PropertyFunctionality': {'Sev':0, 'Min2':1, 'Min1':2, 'Mod':3, 'Typ':4, 'Maj1':5, 'Maj2':6}
}

# Apply ordinal encoding.
# Series.map turns every label that is absent from the mapping into NaN without
# raising (for example the 'Unknown' placeholder written during imputation, which
# only the BasementExposure mapping covers). Collect those labels and report them
# so the loss is visible instead of silent.
unmapped_labels = {}
for col, mapping in ordinal_maps.items():
    if col not in train_data_encoded.columns:
        continue
    raw_values = train_data_encoded[col]
    encoded_values = raw_values.map(mapping)
    lost = raw_values[encoded_values.isna() & raw_values.notna()].unique().tolist()
    if lost:
        unmapped_labels[col] = lost
    train_data_encoded[col] = encoded_values

for col, labels in unmapped_labels.items():
    print(f"WARNING: '{col}' has labels {labels} missing from its ordinal mapping; they were encoded as NaN.")

# --- One-hot encode remaining nominal columns ---
# Every text column without an ordinal mapping becomes indicator columns;
# drop_first=True omits the first level of each column.
cat_cols = train_data_encoded.select_dtypes(include=['object']).columns
nominal_cols = [c for c in cat_cols if c not in ordinal_maps.keys()]
train_data_encoded = pd.get_dummies(train_data_encoded, columns=nominal_cols, drop_first=True)

print("✅ Text encoding completed successfully.")
print("Encoded data shape:", train_data_encoded.shape)
print("Remaining NaN values:", train_data_encoded.isnull().sum().sum())

**Data encoding done using ordinal mapping and one-hot encoding**

In [None]:
# NOTE(review): DataFrame.drop returns a new frame and the result is not assigned,
# so this cell only displays the data without "Id". `train_data_encoded` itself
# keeps the "Id" column, and every model fitted below therefore receives it as a feature.
train_data_encoded.drop( "Id", axis=1)

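**The `Id` column is not needed for training. Note that this cell only displays the data without `Id`: the result is not assigned, so `Id` is still present in `train_data_encoded`.**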

In [None]:
import warnings
warnings.filterwarnings("ignore", message="use_inf_as_na option is deprecated")


In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Draw the target twice: on its raw scale, then after log1p, to compare the shapes.
target_views = [
    (train_data_encoded['HotelValue'], "Distribution of HotelValue", "HotelValue ($)"),
    (np.log1p(train_data_encoded['HotelValue']), "Log-Transformed HotelValue", "Log(HotelValue + 1)"),
]

for values, plot_title, x_label in target_views:
    # 50-bin histogram with a KDE curve overlaid; one figure per view.
    sns.histplot(values, bins=50, kde=True)
    plt.title(plot_title)
    plt.xlabel(x_label)
    plt.ylabel("Count")
    plt.show()


**The raw distribution of `HotelValue` is not a centred Gaussian and has a large variance. A log transform (`log1p`) was applied to obtain a more Gaussian-like distribution with a much smaller variance.**

In [None]:
# Histogram (with KDE overlay) of the untransformed target.
target_ax = sns.histplot(train_data_encoded['HotelValue'], bins=50, kde=True)
target_ax.set_title("Distribution of HotelValue")
target_ax.set_xlabel("HotelValue ($)")
target_ax.set_ylabel("Count")
plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler

# Split the encoded frame into predictor columns and the target column.
target = train_data_encoded['HotelValue']
features = train_data_encoded.drop(columns='HotelValue')

# Fit the scaler on the predictors and keep the original column names on the result.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(features)
features_scaled = pd.DataFrame(scaled_values, columns=features.columns)

# Re-attach the target by position; its index is reset to line up with the
# fresh 0..n-1 index of the scaled frame.
target_by_position = target.reset_index(drop=True)
train_data_scaled = pd.concat([features_scaled, target_by_position], axis=1)

**Applying standard scaler on data**

In [None]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Target column, and every other column of the encoded frame as predictors.
y = train_data_encoded['HotelValue']
X = train_data_encoded.drop(columns='HotelValue')

# Booster settings collected in one place, then fitted on all training rows.
xgb_params = dict(n_estimators=500, learning_rate=0.05, max_depth=5, random_state=42)
xgb_model = XGBRegressor(**xgb_params)
xgb_model.fit(X, y)

# In-sample predictions: the scores below describe fit on the training rows only.
y_pred_train = xgb_model.predict(X)
rmse_train = np.sqrt(mean_squared_error(y, y_pred_train))
r2_train = r2_score(y, y_pred_train)

for metric_name, metric_value in (("Training RMSE", rmse_train), ("Training R2 Score", r2_train)):
    print(f"{metric_name}: {metric_value}")


**Applying the XGBoost model to the training data**

In [None]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Predictors are all encoded columns except the target.
X = train_data_encoded.drop(columns='HotelValue')
y = train_data_encoded['HotelValue']

# Build the AdaBoost regressor and fit it on every training row.
ada_model = AdaBoostRegressor(n_estimators=500, learning_rate=0.05, random_state=42)
ada_model.fit(X, y)

# Score the model on the same rows it was fitted on (training-set metrics only).
y_pred_train = ada_model.predict(X)
train_mse = mean_squared_error(y, y_pred_train)
rmse_train = np.sqrt(train_mse)
r2_train = r2_score(y, y_pred_train)

print(f"Training RMSE: {rmse_train}")
print(f"Training R2 Score: {r2_train}")


**Applying the AdaBoost model to the training data**

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Separate the target from the predictor columns.
target_column = 'HotelValue'
X = train_data_encoded.drop(columns=target_column)
y = train_data_encoded[target_column]

# Gradient boosting regressor, fitted on the full training set.
gbr_settings = {
    'n_estimators': 500,
    'learning_rate': 0.05,
    'max_depth': 5,
    'random_state': 42,
}
gbr_model = GradientBoostingRegressor(**gbr_settings)
gbr_model.fit(X, y)

# Training-set predictions and the resulting in-sample error and R2.
y_pred_train = gbr_model.predict(X)
rmse_train = np.sqrt(mean_squared_error(y, y_pred_train))
r2_train = r2_score(y, y_pred_train)

print(f"Training RMSE: {rmse_train}")
print(f"Training R2 Score: {r2_train}")


**Applying Gradient Boosting model to the train data**

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Target series, and the remaining encoded columns as the feature matrix.
y = train_data_encoded['HotelValue']
X = train_data_encoded.drop(columns='HotelValue')

# Random forest with depth-limited trees; n_jobs=-1 uses all available CPU cores.
rf_model = RandomForestRegressor(n_estimators=500, max_depth=5, random_state=42, n_jobs=-1)
rf_model.fit(X, y)

# Evaluate on the rows used for fitting, so these are training-set scores.
y_pred_train = rf_model.predict(X)
rmse_train = np.sqrt(mean_squared_error(y, y_pred_train))
r2_train = r2_score(y, y_pred_train)

training_report = [
    f"Training RMSE: {rmse_train}",
    f"Training R2 Score: {r2_train}",
]
for report_line in training_report:
    print(report_line)


**Applying the Random Forest model to the training data**

In [None]:
import pandas as pd
import numpy as np

# Strip stray whitespace from the header names so the lookups below match exactly.
test_data.columns = test_data.columns.str.strip()

# Descriptor column -> column whose value being > 0 marks the amenity as present.
conditional_mapping = {
    'PoolQuality': 'SwimmingPoolArea',          # Only fill if pool exists
    'BasementHeight': 'BasementTotalSF',
    'BasementCondition': 'BasementTotalSF',
    'BasementExposure': 'BasementTotalSF',
    'BasementFacilityType1': 'BasementTotalSF',
    'BasementFacilityType2': 'BasementTotalSF',
    'LoungeQuality': 'Lounges',
    'ParkingType': 'ParkingArea',
    'ParkingFinish': 'ParkingArea',
    'ParkingQuality': 'ParkingArea',
    'ParkingCondition': 'ParkingArea',
    'ExtraFacility': 'ExtraFacilityValue'
}

# NOTE(review): every fill statistic below (median / mode) is computed from the
# test frame itself rather than from the training frame — confirm this is intended.

# Fill missing values for conditional features
for feature, exist_col in conditional_mapping.items():
    exists_mask = test_data[exist_col] > 0  # Where the feature logically exists

    # Fill numerical features with median, categorical/text with 'None' or 'Unknown'
    if test_data[feature].dtype in ['int64', 'float64']:
        median_val = test_data.loc[exists_mask, feature].median()
        test_data.loc[exists_mask, feature] = test_data.loc[exists_mask, feature].fillna(median_val)
        # Where it does NOT exist, fill 0
        test_data.loc[~exists_mask, feature] = test_data.loc[~exists_mask, feature].fillna(0)
    else:
        # Fill with 'Unknown' where feature exists
        test_data.loc[exists_mask, feature] = test_data.loc[exists_mask, feature].fillna('Unknown')
        # Fill with 'None' where feature does NOT exist
        test_data.loc[~exists_mask, feature] = test_data.loc[~exists_mask, feature].fillna('None')

# Fill remaining missing numerical columns with median.
# The filled column is assigned back to the frame: calling fillna(..., inplace=True)
# on `test_data[col]` acts on an intermediate object, which pandas warns about and
# which leaves `test_data` unchanged when Copy-on-Write is enabled.
num_cols = test_data.select_dtypes(include=['int64', 'float64']).columns
for col in num_cols:
    if test_data[col].isnull().sum() > 0:
        test_data[col] = test_data[col].fillna(test_data[col].median())

# Fill remaining missing categorical columns with mode (same assign-back form).
cat_cols = test_data.select_dtypes(include=['object']).columns
for col in cat_cols:
    if test_data[col].isnull().sum() > 0:
        test_data[col] = test_data[col].fillna(test_data[col].mode()[0])

# Verify no more missing values
print(test_data.isnull().sum())


**Data Preprocessing for test data**

In [None]:
# --- Encoding Section for test data ---
# Encode a copy so the imputed `test_data` frame is left untouched.
test_data_encoded = test_data.copy()

# --- Ordinal mappings (same as training) ---
ordinal_maps = {
    'PoolQuality': {'None':0, 'Fa':1, 'Gd':2, 'Ex':3},
    'BasementHeight': {'None':0, 'Fa':1, 'TA':2, 'Gd':3, 'Ex':4},
    'BasementCondition': {'None':0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5},
    'BasementExposure': {'None':0, 'No':1, 'Mn':2, 'Av':3, 'Gd':4, 'Unknown':2},
    'LoungeQuality': {'None':0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5},
    'ParkingFinish': {'None':0, 'Unf':1, 'RFn':2, 'Fin':3},
    'ParkingQuality': {'None':0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5},
    'ParkingCondition': {'None':0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5},
    'BasementFacilityType1': {'None':0, 'Unf':1, 'LwQ':2, 'ALQ':3, 'Rec':4, 'GLQ':5, 'BLQ':6},
    'BasementFacilityType2': {'None':0, 'Unf':1, 'LwQ':2, 'ALQ':3, 'Rec':4, 'GLQ':5, 'BLQ':6},
    'ExteriorQuality': {'Fa':0, 'TA':1, 'Gd':2, 'Ex':3},
    'ExteriorCondition': {'Po':0, 'Fa':1, 'TA':2, 'Gd':3, 'Ex':4},
    'HeatingQuality': {'Po':0, 'Fa':1, 'TA':2, 'Gd':3, 'Ex':4},
    'KitchenQuality': {'Fa':0, 'TA':1, 'Gd':2, 'Ex':3},
    'PropertyFunctionality': {'Sev':0, 'Min2':1, 'Min1':2, 'Mod':3, 'Typ':4, 'Maj1':5, 'Maj2':6}
}

# Apply ordinal encoding.
# Series.map encodes any label that is absent from the mapping as NaN without
# raising, so report such labels explicitly; the NaN values themselves are left
# in place for the later cells to inspect and handle.
for col, mapping in ordinal_maps.items():
    if col not in test_data_encoded.columns:
        continue
    raw_values = test_data_encoded[col]
    encoded_values = raw_values.map(mapping)
    lost = raw_values[encoded_values.isna() & raw_values.notna()].unique().tolist()
    if lost:
        print(f"WARNING: '{col}' has labels {lost} missing from its ordinal mapping; they were encoded as NaN.")
    test_data_encoded[col] = encoded_values

# --- One-hot encode remaining nominal columns ---
# All levels are encoded here (drop_first=False). Which level counts as the
# dropped baseline is decided by the training columns in the alignment step below;
# dropping the test frame's own first level could remove a different level than
# the one dropped in training when the two frames do not contain the same levels.
cat_cols = test_data_encoded.select_dtypes(include=['object']).columns
nominal_cols = [c for c in cat_cols if c not in ordinal_maps.keys()]
test_data_encoded = pd.get_dummies(test_data_encoded, columns=nominal_cols, drop_first=False)

# Align to the training feature columns in one step: columns missing from the
# test frame are created filled with 0, columns unknown to training are removed,
# and the column order matches the training frame (without the target).
train_feature_columns = train_data_encoded.columns.drop('HotelValue')
test_data_encoded = test_data_encoded.reindex(columns=train_feature_columns, fill_value=0)

print("✅ Test data encoding completed successfully.")
print("Encoded test data shape:", test_data_encoded.shape)
print("Remaining NaN values:", test_data_encoded.isnull().sum().sum())


**Data encoding for test data using the same method as before**

In [None]:
# Report every remaining NaN cell in the encoded test frame by row position and column name.
nan_positions = np.argwhere(test_data_encoded.isna().to_numpy())
for r, c in nan_positions:
    print(f"NaN at row {r}, column '{test_data_encoded.columns[c]}'")


In [None]:
# Fill NaN in 'BasementFacilityType2' with the previous row's value.
# The result is assigned back to the frame: `fillna(method='ffill', inplace=True)`
# on a selected column relies on the deprecated `method=` argument and on chained
# in-place modification, which does not update the frame under Copy-on-Write.
# A NaN in the very first row has no previous value and stays NaN.
test_data_encoded['BasementFacilityType2'] = test_data_encoded['BasementFacilityType2'].ffill()

# Verify no NaNs remain
print("Remaining NaNs:", test_data_encoded.isna().sum().sum())


**Removing all NaN values**

In [None]:
# Keep a full copy of the encoded test frame; the submission cells read 'Id' from it.
test_data_copy = test_data_encoded.copy()

# Build a second frame without the 'Id' column; `test_data_copy` itself is not modified.
test_data_encoded_no_id = test_data_copy.drop(columns="Id")

# Quick look at the result.
print("Shape without Id:", test_data_encoded_no_id.shape)
print(test_data_encoded_no_id.head())

**Removing Id column as it is unnecessary**

In [None]:
# Apply the previously fitted `scaler` to the encoded test frame.
# NOTE(review): the transformed array is not assigned to anything, so it is only
# shown as the cell output. `test_data_encoded` is left unchanged, no column is
# dropped, and no prediction is made in this cell.
scaler.transform(test_data_encoded)




In [None]:
# Predict HotelValue for the test rows with the fitted XGBoost model.
# `y_pred` is a shared name that the later prediction cells overwrite.
y_pred = xgb_model.predict(test_data_encoded)

**Predicting values using XGBoost**

In [None]:
# Pair each test row's Id (taken from `test_data_copy`) with its current prediction in `y_pred`.
submission_df = test_data_copy[['Id']].assign(HotelValue=y_pred)

# Write the two-column submission file without the index.
submission_df.to_csv('hotelvalue_submission.csv', index=False)

# Show the first rows as a sanity check.
print(submission_df.head())


In [None]:
# Display the encoded test frame's column names as a plain Python list.
list(test_data_encoded.columns)


In [None]:
# Predict HotelValue for the test rows with the fitted gradient boosting model;
# this overwrites the `y_pred` produced by the earlier XGBoost prediction cell.
y_pred = gbr_model.predict(test_data_encoded)

In [None]:
# Start from the Id column kept in `test_data_copy`, then attach the predictions in `y_pred`.
submission_df = pd.DataFrame({'Id': test_data_copy['Id']})
submission_df['HotelValue'] = y_pred

# Save under the same file name used by the other submission cells (the file is overwritten).
submission_df.to_csv('hotelvalue_submission.csv', index=False)

# Show the first rows as a sanity check.
print(submission_df.head())


**Saving the predicted values in a file**

In [None]:
# Print the (rows, columns) shape of the encoded test frame as a sanity check.
print(test_data_encoded.shape)


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import pandas as pd

# -----------------------------
# Features and target
# -----------------------------
y = train_data_encoded['HotelValue']
# 'Id' is excluded from the features as well as the target. The test features
# below are built without 'Id' and then reindexed to X.columns; if 'Id' stayed
# in X, that reindex would re-create it for the test rows as a constant 0 column,
# which does not match the values the scaler, PCA and model were fitted on.
# errors='ignore' keeps this working if 'Id' has already been removed upstream.
X = train_data_encoded.drop(columns=['HotelValue', 'Id'], errors='ignore')

# -----------------------------
# Log-transform the target to ensure positivity
# -----------------------------
y_log = np.log1p(y)  # log(1 + y) avoids issues if y=0

# -----------------------------
# Standardize features
# -----------------------------
# This rebinds the notebook-level name `scaler` to a scaler fitted on X.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# -----------------------------
# Apply PCA
# -----------------------------
pca = PCA(n_components=0.95)  # keep 95% of variance
X_pca = pca.fit_transform(X_scaled)

print(f"Original features: {X.shape[1]}, PCA components: {X_pca.shape[1]}")

# -----------------------------
# Train Linear Regression on PCA components
# -----------------------------
lin_model = LinearRegression()
lin_model.fit(X_pca, y_log)

# -----------------------------
# Predict on training data
# -----------------------------
y_pred_train_log = lin_model.predict(X_pca)
y_pred_train = np.expm1(y_pred_train_log)  # invert log-transform

# expm1 can return values between -1 and 0; clip them to 0.
y_pred_train = np.maximum(0, y_pred_train)

# Evaluate on the original (non-log) scale, using the training rows themselves.
rmse_train = np.sqrt(mean_squared_error(y, y_pred_train))
r2_train = r2_score(y, y_pred_train)

print(f"Training RMSE: {rmse_train}")
print(f"Training R²: {r2_train}")

# -----------------------------
# Predict on test data
# -----------------------------
# Keep Id separately so it can be attached to the predictions afterwards.
test_ids = test_data_encoded['Id']
test_features = test_data_encoded.drop('Id', axis=1)

# Ensure test features have exactly the training feature columns, in the same order.
test_features = test_features.reindex(columns=X.columns, fill_value=0)

# Scale and apply PCA with the transformers fitted on the training features.
X_test_scaled = scaler.transform(test_features)
X_test_pca = pca.transform(X_test_scaled)

# Predict and invert log-transform
y_pred_test_log = lin_model.predict(X_test_pca)
y_pred_test = np.expm1(y_pred_test_log)
y_pred_test = np.maximum(0, y_pred_test)  # clip negatives

# Create final predictions DataFrame with Id and HotelValue
predictions_df = pd.DataFrame({
    'Id': test_ids,
    'HotelValue': y_pred_test
})

print(predictions_df.head())

# Optional: save predictions
predictions_df.to_csv('hotel_value_predictions.csv', index=False)


**Non-linear response and optimizing hyperparameters using Bayesian approach**

In [None]:
# Show a small sample of the predictions.
predictions_preview = predictions_df.head()
print(predictions_preview)

# Write the predictions to a CSV file without the index.
predictions_df.to_csv('hotel_value_predictions.csv', index=False)


In [None]:
# Report the size of the predictions table: the full shape, then each dimension on its own line.
n_rows, n_cols = predictions_df.shape
print(predictions_df.shape)
print("Number of rows:", n_rows)
print("Number of columns:", n_cols)


In [None]:
# Predict HotelValue for the test rows with the fitted AdaBoost model;
# this overwrites the `y_pred` produced by the earlier prediction cells.
y_pred = ada_model.predict(test_data_encoded)

**Predicting values for Adaboost**

In [None]:
# Combine the Id column kept in `test_data_copy` with the current predictions in `y_pred`.
id_column = test_data_copy['Id']
submission_df = pd.DataFrame({'Id': id_column, 'HotelValue': y_pred})

# Save under the same file name used by the other submission cells (the file is overwritten).
submission_df.to_csv('hotelvalue_submission.csv', index=False)

# Show the first rows as a sanity check.
print(submission_df.head())