In [None]:
# 0507_booking hotels

In [1]:
import pandas as pd
import numpy as np

# --- Configuration ----------------------------------------------------------
INPUT_CSV = './0507_booking hotels.csv'
OUTPUT_CSV = './0507_cleaned_booking_hotels.csv'

_METERS_PER_KM = 1000
# Only used when no unit can be found next to the number: bare values above
# this cutoff are treated as metres (the heuristic of the original script).
_UNITLESS_METER_CUTOFF = 100


def _first_number(column: pd.Series) -> pd.Series:
    """Return the first decimal number found in each value, as float.

    Values are stringified first, so missing entries ('nan') and texts without
    any digit simply yield NaN.
    """
    digits = column.astype(str).str.extract(r'(\d+\.?\d*)', expand=False)
    return pd.to_numeric(digits, errors='coerce').astype(float)


def _km_from_center(text: pd.Series) -> pd.Series:
    """Parse a distance in kilometres out of free text.

    The number is read together with its unit ('km' or 'm') so that short
    distances such as '50 m' become 0.05 instead of being kept as 50.
    Rows where no '<number> km|m' pattern is found fall back to the looser
    rule: first digit plus up to five characters, with values above
    ``_UNITLESS_METER_CUTOFF`` assumed to be metres.
    """
    measured = text.str.extract(r'(\d+(?:\.\d+)?)\s*(km|m)\b')
    amount = pd.to_numeric(measured[0], errors='coerce')
    by_unit = amount.where(measured[1] != 'm', amount / _METERS_PER_KM)

    loose = (
        text.str.extract(r'(\d.{0,5})', expand=False)
        .str.replace(r'[^\d.]', '', regex=True)
    )
    loose = pd.to_numeric(loose, errors='coerce')
    loose = loose.where(loose <= _UNITLESS_METER_CUTOFF, loose / _METERS_PER_KM)

    return by_unit.fillna(loose).astype(float)


def _leading_digit(value):
    """Return the first character of a string as a float if it is 0-9, else NaN.

    Non-string values, empty strings and texts such as 'N/A' give NaN instead
    of raising.
    """
    if isinstance(value, str):
        head = value.strip()[:1]
        if head and head in '0123456789':
            return float(head)
    return np.nan


def clean_hotels(raw: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of the scraped hotel table; ``raw`` is not modified.

    Steps, in order:
      * drop the 'location' column when present;
      * split 'bed_type' at its first comma into 'neighborhood' (before) and
        the remaining text (after), then derive 'km_from_center' from it;
      * convert 'reviews', 'score', 'location_rating' and 'price' to floats,
        using NaN for values that contain no usable number;
      * add 'price_per_night' = price / los (NaN when los is not positive);
      * reduce 'star_rating' to its leading digit;
      * encode 'Free_cancellation', 'Limited_rooms' and 'Breakfast' as ints.
    """
    # drop() returns a new frame, so every assignment below leaves `raw` intact.
    frame = raw.drop(columns=['location'], errors='ignore')

    # 'bed_type' is split at the first comma only; text without a comma keeps
    # everything as the neighborhood and leaves an empty remainder.
    pieces = frame['bed_type'].str.partition(',')
    frame['neighborhood'] = pieces[0]
    frame['bed_type'] = pieces[2]
    frame['km_from_center'] = _km_from_center(frame['bed_type'])

    # Keep only the digits of the review count; '' (no digits) becomes NaN.
    review_digits = frame['reviews'].str.replace(r'\D', '', regex=True)
    frame['reviews'] = pd.to_numeric(review_digits, errors='coerce').astype(float)

    frame['score'] = _first_number(frame['score'])
    frame['location_rating'] = _first_number(frame['location_rating'])

    # Strip currency symbols / thousands separators; unparsable text -> NaN.
    price_text = frame['price'].astype(str).str.replace(r'[^\d.]', '', regex=True)
    frame['price'] = pd.to_numeric(price_text, errors='coerce').astype(float)

    # A non-positive length of stay would produce inf or a negative rate.
    nights = frame['los'].where(frame['los'] > 0)
    frame['price_per_night'] = frame['price'] / nights

    frame['star_rating'] = frame['star_rating'].apply(_leading_digit)

    frame['Free_cancellation'] = (frame['Free_cancellation'] == 'Yes').astype('int64')

    # 'Yes'/'No' map to 1/0; other values are kept if numeric, otherwise 0.
    limited = frame['Limited_rooms'].apply(
        lambda x: 1 if x == 'Yes' else (0 if x == 'No' else x)
    )
    frame['Limited_rooms'] = pd.to_numeric(limited, errors='coerce').fillna(0).astype(int)

    frame['Breakfast'] = (frame['Breakfast'] == 'Breakfast included').astype('int64')

    return frame


# In a notebook kernel __name__ is '__main__', so this always runs there; the
# guard only keeps the helpers importable without touching the file system.
if __name__ == '__main__':
    # Load the CSV file
    raw_df = pd.read_csv(INPUT_CSV)

    # Display the initial columns and data types
    print("Initial DataFrame columns and data types:")
    print(raw_df.dtypes)

    df = clean_hotels(raw_df)

    # Display the cleaned DataFrame
    print("Final cleaned DataFrame:")
    print(df.head())
    print("DataFrame columns and data types after cleaning:")
    print(df.dtypes)

    # Save the cleaned DataFrame to a new CSV file
    df.to_csv(OUTPUT_CSV, index=False)


Initial DataFrame columns and data types:
name                  object
location             float64
score                 object
reviews               object
room_type             object
bed_type              object
price                 object
availability          object
los                    int64
url                   object
star_rating           object
location_rating       object
Snapshot              object
Free_cancellation     object
No_prepayment         object
Limited_rooms         object
TTT                    int64
Breakfast             object
index                  int64
dtype: object
Final cleaned DataFrame:
                   name  score  reviews                           room_type  \
0    U.S. Pacific Hotel    5.6   2473.0         Standard Double (No Window)   
1        West Side YMCA    6.6  13729.0  Bunk Bed Room with Shared Bathroom   
2       Hudson Yard 408    4.4     61.0                         Double Room   
3      The Gatsby Hotel    6.3    689.0             