In [13]:
import re

import numpy as np
import pandas as pd

# Read the raw hotel listings into a DataFrame.
df = pd.read_csv(filepath_or_buffer='./1606_hotels_data.csv')

# Report the schema exactly as loaded, before any cleaning is applied.
print("Initial DataFrame columns and data types:", df.dtypes, sep="\n")

# Step 1: Split 'bed_type' at its first comma. The text before the comma becomes
# the new 'neighborhood' column and the text after it stays in 'bed_type'.
# Missing values stay missing; a value without a comma goes entirely to
# 'neighborhood' and leaves an empty string in 'bed_type'.
bed_type_parts = df['bed_type'].str.partition(',')
df['neighborhood'] = bed_type_parts[0]
df['bed_type'] = bed_type_parts[2]

# Step 2: Extract the distance as "<number><optional spaces><unit>", where the
# unit is 'km' or 'm' (case-insensitive). Reading the unit from the text, rather
# than guessing it from the magnitude, keeps short distances such as "50 m" from
# being stored as 50 km. Rows without a recognisable distance become NaN.
distance_parts = df['bed_type'].str.extract(
    r'(?i)(?P<value>\d+(?:\.\d+)?)\s*(?P<unit>km|m)\b'
)
distance_value = distance_parts['value'].astype(float)

# Convert everything to kilometres: values given in metres are divided by 1000.
metres_per_unit = distance_parts['unit'].str.lower().map({'km': 1.0, 'm': 1000.0})
df['km_from_center'] = distance_value / metres_per_unit


# Clean the "reviews" column: drop every non-digit character and convert what
# is left to float (a value such as "1,297 reviews" would become 1297.0).
# Values left with no digits at all become NaN instead of raising.
review_digits = df['reviews'].str.replace(r'\D', '', regex=True)
df['reviews'] = pd.to_numeric(review_digits, errors='coerce').astype(float)

# Clean the "score" column: keep the first number found in the text.
# The pattern is a raw string so that \d and \. are not treated as (invalid)
# Python string escapes.
df['score'] = (
    df['score'].astype(str).str.extract(r'(\d+\.?\d*)', expand=False).astype(float)
)

# Clean the "price" column: keep the first number found in the text.
# Commas are removed first so a thousands separator does not cut the number
# short ("1,234" -> 1234.0 rather than 1.0).
# NOTE(review): assumes commas in prices are thousands separators, not decimal
# marks - confirm against the raw data.
price_text = df['price'].astype(str).str.replace(',', '', regex=False)
df['price'] = price_text.str.extract(r'(\d+\.?\d*)', expand=False).astype(float)

# Price divided by the 'LOS' column.
df['price_per_night'] = df['price'] / df['LOS']

# Clean the "star_rating" column: keep the leading digit as a float.
# Anything that does not start with a digit (missing values, 'N/A', any other
# text) becomes NaN instead of raising during the float conversion.
df['star_rating'] = (
    df['star_rating'].astype(str).str.extract(r'^\s*(\d)', expand=False).astype(float)
)

# "Free_cancellation": 1 where the value is exactly "Yes", 0 for everything
# else (including missing values).
df['Free_cancellation'] = df['Free_cancellation'].eq('Yes').astype(int)

# "Limited_rooms": "Yes" -> 1 and "No" -> 0; other values are kept if they can
# be parsed as numbers, and anything unparseable or missing becomes 0.
limited_rooms = df['Limited_rooms'].apply(
    lambda x: 1 if x == 'Yes' else (0 if x == 'No' else x)
)
df['Limited_rooms'] = pd.to_numeric(limited_rooms, errors='coerce').fillna(0).astype(int)


# Collect the distinct values of 'room_type' for inspection when choosing
# keywords. The array is only stored here, not printed.
unique_room_types = df.loc[:, 'room_type'].unique()

# (keyword, rate) pairs used to categorise room types. When several keywords
# match one label, the highest rate wins, so order does not matter.
keywords = [
    ('standard', 1), ('single', 1), ('twin', 1), ('double', 1),
    ('deluxe', 2), ('superior', 2), ('queen', 2), ('economy', 2),
    ('executive', 3), ('king', 3), ('studio', 3), ('family', 3),
    ('suite', 4), ('junior suite', 4), ('mini-suite', 4), ('junior', 4),
    ('master suite', 5), ('presidential', 5), ('penthouse', 5), ('villa', 5),
    ('luxury', 5), ('aparthotel', 4), ('apart-style', 4),
    ('connecting', 3), ('adjoining', 3), ('adjacent', 3), ('accessible', 3),
    ('resort', 5), ('boutique', 5), ('lodge', 2)
]


def assign_room_rate(room_type):
    """Return the highest rate among the keywords found in a room-type label.

    The label is converted to a lowercase string, so any value accepted by
    ``str()`` works (a missing value is searched as the text 'nan'). Each
    keyword must begin at a word boundary: 'king' matches "King Room" and
    "king-size" but not the tail of "non-smoking" or "parking".

    Args:
        room_type: Room-type label.

    Returns:
        int: The largest rate of any matching keyword, or 0 when none match.
    """
    room_type_lower = str(room_type).lower()
    matched_rates = [
        rate
        for keyword, rate in keywords
        # \b anchors the keyword to the start of a word; re.escape keeps
        # characters such as '-' in 'mini-suite' literal.
        if re.search(r'\b' + re.escape(keyword), room_type_lower)
    ]
    return max(matched_rates, default=0)

# Score every listing's room type with the keyword-based rate.
df['room_rate'] = df['room_type'].apply(assign_room_rate)


# Show a preview of the cleaned table, followed by its final schema.
print("Final cleaned DataFrame:", df.head(), sep="\n")
print("DataFrame columns and data types after cleaning:", df.dtypes, sep="\n")

# Write the cleaned table to a new CSV file, leaving out the index column.
df.to_csv(path_or_buf='./1606_hotels_data_cleaned.csv', index=False)

Initial DataFrame columns and data types:
name                 object
location             object
score                object
reviews              object
room_type            object
bed_type             object
price                object
availability         object
url                  object
star_rating          object
location_rating      object
Snapshot             object
Free_cancellation    object
No_prepayment        object
Limited_rooms        object
TTT                   int64
LOS                   int64
Breakfast            object
dtype: object
Final cleaned DataFrame:
                 name            location  score  reviews  \
0         Radio Hotel                 NaN    8.0   1297.0   
1     Hudson Yard 408  1.9 km from centre    4.8     41.0   
2  U.S. Pacific Hotel                 NaN    5.6   2433.0   
3      Da Vinci Hotel                 NaN    7.3   1236.0   
4  The Park Ave North                 NaN    7.6    691.0   

                     room_type  \
0             