In [2]:
# data_preparation.py
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import numpy as np

# Read each input CSV into its own dataframe; df1, df2 and df3 stay
# available as separate globals after this step.
snapshot_files = (
    'Final_booking_expedia_combined_05_07.csv',
    'Final_booking_expedia_combined_25_06.csv',
    'Final_booking_expedia_combined_20_06.csv',
)
df1, df2, df3 = map(pd.read_csv, snapshot_files)

# Stack the rows of the three frames and renumber the index from 0.
frames = [df1, df2, df3]
df = pd.concat(frames, ignore_index=True)

# Keep only shared hotels (rows that have a nightly price on both Booking
# and Expedia). The explicit .copy() gives df_shared its own data, so the
# column assignments below never write into a slice of df (this is what
# triggers pandas' SettingWithCopyWarning).
df_shared = df.dropna(
    subset=['price_per_night_expedia', 'price_per_night_booking']
).copy()

booking_price = df_shared['price_per_night_booking']
expedia_price = df_shared['price_per_night_expedia']

# Calculate the price difference (Booking minus Expedia).
df_shared['price_difference'] = booking_price - expedia_price

# Additional feature engineering: every *_diff column is Booking minus Expedia.
df_shared['price_ratio'] = booking_price / expedia_price
df_shared['score_diff'] = df_shared['score_booking'] - df_shared['score_expedia']
df_shared['reviews_diff'] = df_shared['reviews_booking'] - df_shared['reviews_expedia']
df_shared['star_rating_diff'] = df_shared['star_rating_booking'] - df_shared['star_rating_expedia']
df_shared['location_rating_diff'] = df_shared['location_rating_booking'] - df_shared['location_rating_expedia']
df_shared['km_diff'] = df_shared['km_from_center_booking'] - df_shared['km_from_center_expedia']

# The flag columns are cast to float before subtracting: if both sides are
# loaded as bool dtype, pandas raises TypeError on `-`. The cast leaves
# numeric 0/1 data unchanged.
df_shared['cancellation_diff'] = (
    df_shared['Free_cancellation_booking'].astype(float)
    - df_shared['Free_cancellation_expedia'].astype(float)
)
df_shared['breakfast_diff'] = (
    df_shared['Breakfast_booking'].astype(float)
    - df_shared['Breakfast_expedia'].astype(float)
)

# Dividing by a zero Expedia price yields +/-inf in price_ratio. Turn every
# infinity into NaN so it is handled like any other missing value later on.
df_shared = df_shared.replace([np.inf, -np.inf], np.nan)

# Select relevant features, grouped by where they come from. The final
# `features` list keeps the order: Expedia columns, Booking columns, then
# the engineered comparison columns.
expedia_features = [
    'score_expedia',
    'reviews_expedia',
    'star_rating_expedia',
    'location_rating_expedia',
    'km_from_center_expedia',
    'Free_cancellation_expedia',
    'Breakfast_expedia',
]
booking_features = [
    'score_booking',
    'reviews_booking',
    'star_rating_booking',
    'location_rating_booking',
    'km_from_center_booking',
    'Free_cancellation_booking',
    'Breakfast_booking',
]
# NOTE(review): price_ratio is computed from the same two price columns as
# the target, so it may leak the target into the inputs -- confirm intended.
engineered_features = [
    'price_ratio',
    'score_diff',
    'reviews_diff',
    'star_rating_diff',
    'location_rating_diff',
    'km_diff',
    'cancellation_diff',
    'breakfast_diff',
]
features = expedia_features + booking_features + engineered_features

# Column the downstream model is meant to predict.
target = 'price_difference'

# Split the data into train and test sets BEFORE imputing. The partition
# depends only on the number of rows and random_state, so the same rows land
# in train/test as when the split is done after imputation.
X = df_shared[features]
y = df_shared[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Handle missing values: learn the column means from the training rows only,
# then apply those same means to the test rows. Fitting on the full dataset
# would let test-set statistics leak into the training features.
# NOTE: only X_train / X_test are imputed; df_shared and X keep their NaNs.
imputer = SimpleImputer(strategy='mean')
X_train = pd.DataFrame(
    imputer.fit_transform(X_train), columns=features, index=X_train.index
)
X_test = pd.DataFrame(
    imputer.transform(X_test), columns=features, index=X_test.index
)

# Ensure y_train and y_test are 1-dimensional NumPy arrays. Series.ravel() is
# deprecated in recent pandas; to_numpy() returns the same 1-D array.
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

# Save the prepared data to CSV files (no index column in any of them).
# Feature tables are already dataframes and are written as-is.
for csv_name, table in (('X_train.csv', X_train), ('X_test.csv', X_test)):
    table.to_csv(csv_name, index=False)

# Targets are 1-D arrays: wrap each in a single-column frame before writing.
for csv_name, values in (('y_train.csv', y_train), ('y_test.csv', y_test)):
    pd.DataFrame(values, columns=['price_difference']).to_csv(csv_name, index=False)

print("done")


done
