# In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
import numpy as np

# Read the raw train/test CSVs and remember how many rows belong to the
# training file, so the two sets can be separated again after joint encoding.
train_data = pd.read_csv('Train.csv')
test_data = pd.read_csv('Test.csv')
n_train_rows = len(train_data)

# Stack train on top of test (fresh 0..N-1 index) so the encoder is fitted on
# the values of both files.
combined_data = pd.concat([train_data, test_data], ignore_index=True)

# Text columns that receive a label-encoded companion column.
categorical_columns = [
    'Package Type',
    'Destination',
    'Start City',
    'Itinerary',
    'Sightseeing Places Covered',
]
label_encoder = LabelEncoder()

# fit_transform refits the encoder on every call, so a single instance is
# reused; the result is stored in a new '<column>_Encoded' column and the
# original column is left in place for now.
for col in categorical_columns:
    encoded_values = label_encoder.fit_transform(combined_data[col])
    combined_data[f'{col}_Encoded'] = encoded_values

# The first n_train_rows rows are the training set, the remainder the test set.
# .copy() gives each set its own frame, independent of combined_data.
train_data = combined_data.iloc[:n_train_rows].copy()
test_data = combined_data.iloc[n_train_rows:].copy()

# Columns removed before modelling. Five of them are the raw categorical
# columns that already have '_Encoded' counterparts; the others are dropped
# without a replacement.
columns_to_drop = [
    'Uniq Id',
    'Package Name',
    'Package Type',
    'Airline',
    'Destination',
    'Places Covered',
    'Hotel Details',
    'Start City',
    'Itinerary',
    'Sightseeing Places Covered',
    'Cancellation Rules',
]

# Apply the same clean-up to both frames in place.
for frame in (train_data, test_data):
    frame.drop(columns=columns_to_drop, inplace=True)

    # 'Travel Date' is parsed as day-month-year and replaced by three numeric
    # columns (year, month, day); the original column is then removed.
    travel_dates = pd.to_datetime(frame['Travel Date'], format='%d-%m-%Y')
    frame['Travel Year'] = travel_dates.dt.year
    frame['Travel Month'] = travel_dates.dt.month
    frame['Travel Day'] = travel_dates.dt.day
    frame.drop(columns=['Travel Date'], inplace=True)

# Separate the target ('Per Person Price') from the feature columns.
y = train_data['Per Person Price']
X = train_data.drop(columns=['Per Person Price'])

# Restrict the test set to exactly the feature columns of X, in the same order,
# so it can be passed to the fitted model later.
test_data = test_data[X.columns]

# Hold out 20% of the training rows for validation; the seed makes the split
# reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest on the training portion.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Score the hold-out rows with RMSLE (square root of the mean squared log error).
y_pred = model.predict(X_valid)
msle = mean_squared_log_error(y_valid, y_pred)
rmsle = np.sqrt(msle)
print(f'RMSLE: {rmsle}')

# Predict prices for the test rows and store them alongside the features.
test_predictions = model.predict(test_data)
test_data['Per Person Price'] = test_predictions

# The submission file contains only the predicted price column, without the
# DataFrame index.
submission = test_data[['Per Person Price']]
submission.to_csv('submission.csv', index=False)


# Output:
# RMSLE: 0.2560462867804376
