In [25]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

# Load the data: one (booking file, expedia file) pair per snapshot.
snapshot_files = [
    (
        "25_06_hotels_data_cleaned_with_days_name_parameters_period_encoded (1).csv",
        "expedia_clean_df_normalized_with_features_engineering_25_06.csv",
    ),
    (
        "05_07_hotels_data_cleaned_with_days_name_parameters_period_encoded (1).csv",
        "Clean Final_booking_expedia_combined_05_07.csv",
    ),
    (
        "20_06_hotels_data_cleaned_with_new_parameters_encoded (2).csv",
        "Clean Final_booking_expedia_combined_20_06.csv",
    ),
]

booking_frames = []
expedia_frames = []
for booking_path, expedia_path in snapshot_files:
    booking_frame = pd.read_csv(booking_path)
    # The booking files name the column 'los'; rename it to 'LOS' so it
    # matches the expedia frames and can be used as a join key.
    booking_frame.rename(columns={'los': 'LOS'}, inplace=True)
    booking_frames.append(booking_frame)
    expedia_frames.append(pd.read_csv(expedia_path))

# Keep the per-snapshot names bound at module level.
booking_df1, booking_df2, booking_df3 = booking_frames
expedia_df1, expedia_df2, expedia_df3 = expedia_frames

# Join each booking snapshot with its expedia counterpart on the shared keys
# (pd.merge defaults to an inner join); columns present in both frames get
# the '_booking' / '_expedia' suffixes.
join_keys = ['name', 'TTT', 'LOS']
merged_df1, merged_df2, merged_df3 = [
    pd.merge(booking_frame, expedia_frame, on=join_keys, suffixes=('_booking', '_expedia'))
    for booking_frame, expedia_frame in zip(booking_frames, expedia_frames)
]

# Stack the three merged snapshots into one frame with a fresh 0..n-1 index.
merged_df = pd.concat([merged_df1, merged_df2, merged_df3], ignore_index=True)

# Price comparison between the two sources: absolute difference and ratio.
booking_price = merged_df['price_per_night_booking']
expedia_price = merged_df['price_per_night_expedia']
merged_df['price_difference'] = booking_price - expedia_price
# A zero expedia price makes the ratio +/-inf; the replace() below turns
# those into NaN.
merged_df['price_ratio'] = booking_price / expedia_price

# Booking-minus-expedia difference for each attribute present in both sources:
# (new column, booking column, expedia column).
diff_specs = [
    ('score_diff', 'score_booking', 'score_expedia'),
    ('reviews_diff', 'reviews_booking', 'reviews_expedia'),
    ('star_rating_diff', 'star_rating_booking', 'star_rating_expedia'),
    ('location_rating_diff', 'location_rating_booking', 'location_rating_expedia'),
    ('km_diff', 'km_from_center_booking', 'km_from_center_expedia'),
    ('cancellation_diff', 'Free_cancellation_booking', 'Free_cancellation_expedia'),
    ('breakfast_diff', 'Breakfast_booking', 'Breakfast_expedia'),
]
for diff_col, booking_col, expedia_col in diff_specs:
    merged_df[diff_col] = merged_df[booking_col] - merged_df[expedia_col]

# Replace infinity values with NaN across the whole frame, in place.
merged_df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Select relevant features: the model input columns, each listed exactly once.
# A repeated name would make `merged_df[features]` return the same column
# twice and write duplicate headers to the saved CSV files.
# NOTE(review): 'index' and 'Index' look like leftover row-index columns from
# the source CSVs rather than real predictors - confirm before training.
features = [
    # Expedia-side attributes
    'score_expedia', 'reviews_expedia', 'star_rating_expedia', 'location_rating_expedia',
    'km_from_center_expedia', 'Free_cancellation_expedia', 'Breakfast_expedia',
    # Booking-side attributes
    'score_booking', 'reviews_booking', 'star_rating_booking', 'location_rating_booking',
    'km_from_center_booking', 'Free_cancellation_booking', 'Breakfast_booking',
    # Booking-minus-expedia differences
    'score_diff', 'reviews_diff', 'star_rating_diff', 'location_rating_diff',
    'km_diff', 'cancellation_diff', 'breakfast_diff',
    # Remaining columns
    'index', 'Index', 'LOS', 'discount_perc', 'Option Member',
]

# Split the data into train and test sets BEFORE imputing, so the test rows
# do not contribute to the imputation means (avoids train/test leakage).
X_raw = merged_df[features]
y = merged_df['price_difference']
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_raw, y, test_size=0.3, random_state=42
)

# Handle missing values: learn the per-column means from the training rows only.
imputer = SimpleImputer(strategy='mean')
imputer.fit(X_train_raw)

# Fill every row with the train-fitted means and write the result back, so
# merged_df and X still hold NaN-free feature columns for any later use.
merged_df[features] = imputer.transform(merged_df[features])
X = merged_df[features]
X_train = X.loc[X_train_raw.index]
X_test = X.loc[X_test_raw.index]

# Ensure y_train and y_test are 1-dimensional NumPy arrays.
# Series.ravel() is deprecated in pandas 2.2; to_numpy() gives the same array.
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

# Save the prepared data to CSV files
X_train.to_csv('X_train.csv', index=False)
X_test.to_csv('X_test.csv', index=False)
pd.DataFrame(y_train, columns=['price_difference']).to_csv('y_train.csv', index=False)
pd.DataFrame(y_test, columns=['price_difference']).to_csv('y_test.csv', index=False)
print("done")


done


0        2.463257
1        0.399456
2       -0.131144
3        1.217962
4        0.632339
           ...   
22337   -0.226972
22338    0.607906
22339   -0.678774
22340   -0.281410
22341   -0.642011
Name: price_per_night_expedia, Length: 22342, dtype: float64