In [None]:
import pandas as pd
import numpy as np
import itertools
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

### Data preparation ###

In [734]:
# Load the CSV files
contacts = pd.read_csv("contacts.csv")
users = pd.read_csv("users.csv")
listings = pd.read_csv("listings.csv")

In [735]:
# Merge contacts with users for guest details
data = contacts.merge(users, left_on="id_guest_anon", right_on="id_user_anon", how="left")
data.rename(columns={"country": "country_guest", "words_in_user_profile": "words_in_guest_profile"}, inplace=True)

# Merge with users again for host details
data = data.merge(users, left_on="id_host_anon", right_on="id_user_anon", how="left", suffixes=("", "_host"))
data.rename(columns={"country": "country_host", "words_in_user_profile": "words_in_host_profile"}, inplace=True)

# Merge with listings
data = data.merge(listings, on="id_listing_anon", how="left")

In [736]:
# Select and reorder columns
columns = [
    "id_guest_anon", "country_guest", "words_in_guest_profile",
    "id_host_anon", "country_host", "words_in_host_profile",
    "id_listing_anon", "room_type", "listing_neighborhood", "total_reviews",
    "ts_interaction_first", "ts_reply_at_first", "ts_accepted_at_first", "ts_booking_at",
    "ds_checkin_first", "ds_checkout_first",
    "m_guests", "m_interactions", "m_first_message_length_in_characters",
    "contact_channel_first", "guest_user_stage_first"
]
data = data[columns]

# Save to data.csv
data.to_csv("data.csv", index=False)

### Data cleaning ###

In [737]:
# Filter out rows with total_reviews < 0
data = data[data['total_reviews'] >= 0]

# Reset the index for clean output
data.reset_index(drop=True, inplace=True)

In [738]:
# Remove rows where m_guests == 0
data = data[data['m_guests'] > 0]

In [748]:
# Convert timestamps to datetime
def clean_and_convert_to_datetime(ts_value):
    ts_str = str(ts_value)
    
    # If length is not 19, remove last 2 characters
    if len(ts_str) != 19:
        ts_str = ts_str[:-2]
    
    # Convert to datetime format
    return pd.to_datetime(ts_str, errors='coerce')

In [None]:
# Apply the function to the 'ts_interaction_first' column
data['ts_interaction_first'] = data['ts_interaction_first'].apply(clean_and_convert_to_datetime)

In [758]:
data['ts_reply_at_first'] = data['ts_reply_at_first'].apply(clean_and_convert_to_datetime)

In [770]:
data['ds_checkin_first'] = pd.to_datetime(data['ds_checkin_first'], errors='coerce')
data['ds_checkout_first'] = pd.to_datetime(data['ds_checkout_first'], errors='coerce')

### Data changing ###

In [771]:
# Ensure m_guests is an integer column
data['m_guests'] = data['m_guests'].astype(int)

In [772]:
data['booked'] = data['ts_booking_at'].notna().astype(int)

In [773]:
# Calculate response time in hours
data['response_time_minutes'] = ((data['ts_reply_at_first'] - data['ts_interaction_first']).dt.total_seconds() / 60)
data['response_time_minutes'] = data['response_time_minutes'].fillna(99999)

In [775]:
# Calculate the difference in days
data['stay_duration_days'] = (data['ds_checkout_first'] - data['ds_checkin_first']).dt.days

In [776]:
# Calculate the time difference in minutes
data['interaction_to_checkin_days'] = (data['ds_checkin_first'] - data['ts_interaction_first']).dt.days

In [755]:
# Replace the 'room_type' values with corresponding numerical values
data['room_type'] = data['room_type'].replace({
    'Shared room': 1,
    'Private room': 2,
    'Entire home/apt': 3
})

In [756]:
# Replace 'new' with 0 and 'past_booker' with 1
data['guest_user_stage_first'] = data['guest_user_stage_first'].replace({
    'new': 0,
    '-unknown-': 1,
    'past_booker': 2
})

### Host ###

In [777]:
# List of available feature columns
all_columns = ['total_reviews', 'm_guests', 'words_in_guest_profile', 'words_in_host_profile', 'm_first_message_length_in_characters',
               'stay_duration_days', 'response_time_minutes', 'interaction_to_checkin_days', 'room_type', 'guest_user_stage_first']  # Add all your feature columns here

# Prepare the target variable
y = data['booked']

best_accuracy = 0
best_features = []

# Iterate over all possible combinations of feature columns
for n in range(1, len(all_columns) + 1):  # n is the number of features in each combination
    for feature_combo in itertools.combinations(all_columns, n):
        X = data[list(feature_combo)]  # Select columns based on the combination
        
        # Split the data into training and test sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Create and fit the model
        model = LogisticRegression(max_iter=1000)
        model.fit(X_train, y_train)

        # Make predictions
        probabilities = model.predict_proba(X_test)[:, 1]  # Probabilities for class 1 (book=1)
        predictions = (probabilities >= 0.5).astype(int)

        # Calculate accuracy
        accuracy = accuracy_score(y_test, predictions)
        
        # Keep track of the best combination of features
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_features = feature_combo

# Output the best combination and the corresponding accuracy
print(f"Best feature combination: {best_features}")
print(f"Best accuracy: {best_accuracy:.4f}")

Best feature combination: ('total_reviews', 'words_in_guest_profile', 'm_first_message_length_in_characters', 'stay_duration_days', 'response_time_minutes', 'room_type')
Best accuracy: 0.7359


In [784]:
X = data[list(best_features)]  # Select columns based on the combination
        
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and fit the model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Make predictions
probabilities = model.predict_proba(X_test)[:, 1]  # Probabilities for class 1 (book=1)
predictions = (probabilities >= 0.5).astype(int)

# Calculate accuracy
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {(accuracy*100):.2f}%")

Accuracy: 73.59%
