In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from google.colab import files

In [2]:
# Upload Files
# Two Colab upload prompts, training file first. Each call returns a mapping
# whose keys are the uploaded file names.
print("Please upload your train.csv file:")
uploaded_train = files.upload()

print("Please upload your test.csv file:")
uploaded_test = files.upload()

# Only the first uploaded name from each prompt is used downstream.
train_filename = list(uploaded_train)[0]
test_filename = list(uploaded_test)[0]

Please upload your train.csv file:


Saving train_preprocessed.csv to train_preprocessed.csv
Please upload your test.csv file:


Saving test.csv to test.csv


In [3]:
# Load CSVs
# Parse both uploaded files (training first, then test) into DataFrames.
train_df, test_df = map(pd.read_csv, (train_filename, test_filename))

In [4]:
# Confirm both files were parsed and show their (rows, columns) shapes.
print("\n Files uploaded successfully!")
for label, frame in (("Training data shape:", train_df),
                     ("Testing data shape:", test_df)):
    print(label, frame.shape)


 Files uploaded successfully!
Training data shape: (1104, 75)
Testing data shape: (260, 80)


In [5]:
# Drop ID column and split X, y
target_column = "HotelValue"

# "Id" is removed from both frames in place; errors="ignore" turns the drop
# into a no-op for a frame that has no such column.
for frame in (train_df, test_df):
    frame.drop(columns=["Id"], inplace=True, errors="ignore")

# Target series and feature matrix from the training frame; the test frame is
# copied so later steps work on their own object.
y_train = train_df[target_column]
X_train = train_df.drop(columns=[target_column])
X_test = test_df.copy()

In [6]:
# Identify Column Type
# Select by dtype family instead of exact widths: listing only int64/float64
# would leave out other numeric widths (e.g. int32/float32), and listing only
# 'object' would leave out pandas categoricals. A column that is in neither
# list is not passed to any transformer, so it would silently vanish from the
# model's inputs.
numeric_cols = X_train.select_dtypes(include='number').columns
categorical_cols = X_train.select_dtypes(include=['object', 'category']).columns

print(f"\nNumeric columns: {len(numeric_cols)}")
print(f"Categorical columns: {len(categorical_cols)}")

# Make any column that matched neither group visible instead of dropping it
# without notice (e.g. boolean or datetime columns).
unassigned_cols = X_train.columns.difference(numeric_cols.union(categorical_cols))
if len(unassigned_cols) > 0:
    print(f"Columns in neither group (not used by the model): {list(unassigned_cols)}")


Numeric columns: 36
Categorical columns: 37


In [7]:
# Numeric branch: fill missing values with the column median, then apply
# StandardScaler.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' keeps categories that were not
# seen during fitting from raising at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [8]:
# Route each column group through its own transformer: the numeric pipeline
# handles numeric_cols, the categorical pipeline handles categorical_cols.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols),
])

In [9]:
# Build KNN Model Pipeline
# Preprocessing and the regressor live in one pipeline, so every CV fold
# fits the preprocessing on that fold's training part only.
knn = KNeighborsRegressor()
model = Pipeline([
    ('preprocessor', preprocessor),
    ('knn', knn),
])

# Hyperparameter Tuning
# Search space: k from 3 to 20 inclusive, both weighting schemes, and
# p in {1, 2}. The 'knn__' prefix targets the 'knn' pipeline step.
param_grid = dict(
    knn__n_neighbors=range(3, 21),
    knn__weights=['uniform', 'distance'],
    knn__p=[1, 2],
)

# 5-fold cross-validation scored by R², using all available cores.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

In [10]:
print("\n Best Parameters Found:")
print(grid_search.best_params_)


# Final Model
# The grid search is created without overriding `refit`, and GridSearchCV
# refits the best parameter combination on the full training data by default.
# best_estimator_ is therefore already trained on all of X_train / y_train;
# fitting it again would only repeat the same work.
best_model = grid_search.best_estimator_


 Best Parameters Found:
{'knn__n_neighbors': 8, 'knn__p': 1, 'knn__weights': 'distance'}


In [11]:
# Evaluate on Training Data
# NOTE: these are in-sample metrics. When the selected KNN uses
# weights='distance', each training row is typically its own zero-distance
# neighbour, so the model reproduces the training targets and R² / RMSE come
# out (near-)perfect regardless of how well it generalises. Treat them as a
# sanity check only.
train_preds = best_model.predict(X_train)
r2 = r2_score(y_train, train_preds)
rmse = np.sqrt(mean_squared_error(y_train, train_preds))

print("\n Training Performance:")
print(f"R² Score: {r2:.4f}")
print(f"RMSE: {rmse:.4f}")

# The cross-validated score of the winning parameter combination is measured
# on held-out folds, so it is the figure to use when judging the model.
print("\n Cross-Validated Performance (best parameters):")
print(f"Mean R² Score: {grid_search.best_score_:.4f}")


 Training Performance:
R² Score: 1.0000
RMSE: 0.0000


In [12]:
# Predict on Test Data
# Read the test file again from disk so its original columns (including
# 'Id', when the file has one) are available for the output table.
original_test = pd.read_csv(test_filename)

# Run the tuned pipeline on the test features.
test_predictions = best_model.predict(X_test)

In [13]:
# Build the final output DataFrame
# The 'Id' column is treated as optional where it is dropped from the
# features earlier in the notebook, so tolerate its absence here as well
# instead of failing with a bare KeyError: fall back to the test file's row
# index and say so.
if "Id" in original_test.columns:
    submission_ids = original_test["Id"]   # take Ids from original test file
else:
    print("\n No 'Id' column found in the test file; using its row index as Id.")
    submission_ids = original_test.index

output_df = pd.DataFrame({
    "Id": submission_ids,
    "HotelValue": test_predictions         # predicted values
})

output_df.to_csv("hotel_predictions_KNN.csv", index=False)

print("\n Predictions saved to 'hotel_predictions_KNN.csv' in format: Id,HotelValue")

# `files` is the google.colab helper imported in the notebook's first cell,
# so it is not imported again here.
files.download("hotel_predictions_KNN.csv")


 Predictions saved to 'hotel_predictions_KNN.csv' in format: Id,HotelValue


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>