In [1]:
import pandas as pd
import numpy as np

# Read the raw scraped hotel listings into a DataFrame
df = pd.read_csv('./2506_booking hotels.csv')

# Show the schema exactly as loaded, before any cleaning is applied
print("Initial DataFrame columns and data types:", df.dtypes, sep="\n")

# Discard the 'location' column; errors='ignore' makes this a no-op when the
# column is not present in the file
df = df.drop(columns=['location'], errors='ignore')

# Step 1: split the raw 'bed_type' text at its FIRST comma. The text before the
# comma becomes 'neighborhood'; the text after it is written back to 'bed_type'.
# A value without any comma ends up entirely in 'neighborhood' (leaving '' in
# 'bed_type'); missing values stay missing in both columns.
df['neighborhood'] = df['bed_type'].apply(lambda x: x.partition(',')[0] if pd.notnull(x) else np.nan)
df['bed_type'] = df['bed_type'].apply(lambda x: x.partition(',')[2] if pd.notnull(x) else np.nan)

# Step 2: parse the distance together with its unit, e.g. "5.7 km" or "350 m".
# Reading the unit explicitly (instead of guessing that any value above 100
# must be in metres) keeps short metre distances such as "50 m" or "100 m"
# from being stored as 50 km / 100 km.
_distance = df['bed_type'].str.extract(r'(?P<value>\d+(?:\.\d+)?)\s*(?P<unit>km|m)\b')
_distance_value = pd.to_numeric(_distance['value'], errors='coerce')

# Normalise everything to kilometres: metre values are divided by 1000,
# kilometre values are kept as they are. Rows in which no "<number> km" or
# "<number> m" can be found are left as NaN rather than guessed.
df['km_from_center'] = _distance_value.where(_distance['unit'] != 'm', _distance_value / 1000)

# Clean the "reviews" column to keep only the number of reviews.
# Every non-digit character is stripped; to_numeric(errors='coerce') then turns
# an entry that had no digits at all (now an empty string) into NaN instead of
# raising a ValueError.
df['reviews'] = pd.to_numeric(
    df['reviews'].str.replace(r'\D', '', regex=True), errors='coerce'
).astype(float)

# Clean the "score" column to keep only the score number.
# The pattern is a raw string so that "\d" is not treated as an (invalid)
# string escape; expand=False returns a Series rather than a one-column frame.
df['score'] = df['score'].astype(str).str.extract(r'(\d+\.?\d*)', expand=False).astype(float)

# Same extraction for the location rating
df['location_rating'] = df['location_rating'].astype(str).str.extract(r'(\d+\.?\d*)', expand=False).astype(float)

# Clean the "price" column to keep only the price number (digits and dots).
# astype(str) renders a missing price as 'nan', which the replace reduces to an
# empty string; errors='coerce' maps that to NaN instead of raising.
df['price'] = pd.to_numeric(
    df['price'].astype(str).str.replace(r'[^\d.]', '', regex=True), errors='coerce'
).astype(float)

# Calculate the price per night. A non-positive length of stay is masked to NaN
# first so the division cannot produce inf or a negative nightly price.
df['price_per_night'] = df['price'] / df['los'].where(df['los'] > 0)

# Clean the "star_rating" column: use the first character as the rating when it
# is a decimal digit. Non-string values, 'N/A', empty strings and any other
# text that does not start with a digit become NaN (the x[:1] slice avoids an
# IndexError on '' and the isdecimal() check avoids a ValueError in float()).
df['star_rating'] = df['star_rating'].apply(
    lambda x: float(x[0]) if isinstance(x, str) and x[:1].isdecimal() else np.nan
)

# "Free_cancellation": 1 for exactly "Yes", 0 for everything else (including
# "No" and missing values)
df['Free_cancellation'] = (df['Free_cancellation'] == 'Yes').astype('int64')

# "Limited_rooms": "Yes" -> 1, "No" -> 0, other values are passed through and
# then coerced to numbers; anything non-numeric or missing ends up as 0
df['Limited_rooms'] = df['Limited_rooms'].apply(lambda x: 1 if x == 'Yes' else (0 if x == 'No' else x))
df['Limited_rooms'] = pd.to_numeric(df['Limited_rooms'], errors='coerce').fillna(0).astype(int)

# "Breakfast": 1 only when the text is exactly "Breakfast included", else 0
df['Breakfast'] = (df['Breakfast'] == 'Breakfast included').astype('int64')


# Preview the first rows of the cleaned frame, then its resulting schema
print("Final cleaned DataFrame:", df.head(), sep="\n")
print("DataFrame columns and data types after cleaning:", df.dtypes, sep="\n")

# Persist the cleaned frame (without the index) for the following steps
df.to_csv('./2506_cleaned_booking_hotels.csv', index=False)


Initial DataFrame columns and data types:
name                  object
location             float64
score                 object
reviews               object
room_type             object
bed_type              object
price                 object
availability          object
los                    int64
url                   object
star_rating           object
location_rating       object
Snapshot              object
Free_cancellation     object
No_prepayment         object
Limited_rooms         object
TTT                    int64
Breakfast             object
index                  int64
dtype: object
Final cleaned DataFrame:
                        name  score  reviews  \
0         Bowery Grand Hotel    3.5   1414.0   
1         U.S. Pacific Hotel    5.6   2450.0   
2             West Side YMCA    6.6  13601.0   
3             Hotel Moca NYC    4.3     98.0   
4  The Nolita Express Hostel    7.1    139.0   

                                   room_type  \
0  Standard Double Room with Sh

In [2]:
import pandas as pd

# Re-read the cleaned dataset from disk.
# NOTE(review): this cell reads and later overwrites the same CSV, so running
# it again re-applies the outlier filter to already-filtered data.
df = pd.read_csv('./2506_cleaned_booking_hotels.csv')

# Make sure the nightly price is float before doing quantile arithmetic
df['price_per_night'] = df['price_per_night'].astype(float)

# First and third quartiles of the nightly price, and the spread between them
Q1, Q3 = (df['price_per_night'].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1

# Fences at 1.5 * IQR beyond the quartiles
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Keep only rows whose nightly price lies within the fences (both ends
# inclusive); rows with a missing nightly price do not satisfy the condition
# and are dropped as well
df_no_outliers = df[df['price_per_night'].between(lower_bound, upper_bound)]

print(f"Original DataFrame shape: {df.shape}")
print(f"DataFrame shape after removing outliers: {df_no_outliers.shape}")

df_no_outliers.to_csv('./2506_cleaned_booking_hotels.csv', index=False)

df_no_outliers.T

Original DataFrame shape: (26320, 21)
DataFrame shape after removing outliers: (25849, 21)


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,26310,26311,26312,26313,26314,26315,26316,26317,26318,26319
name,U.S. Pacific Hotel,West Side YMCA,Hotel Moca NYC,The Nolita Express Hostel,KAMA CENTRAL PARK,Hudson River Hotel,Radio Hotel,Hudson Yard 408,Nap York Central Park Sleep Station,Lamartine Chelsea,...,"Gild Hall, A Thompson Hotel, by Hyatt",45 Times Square Hotel,JG Sohotel,Heritage Hotel New York City,"Distrikt Hotel New York City, Tapestry Collect...",Fitzpatrick Grand Central,AC Hotel by Marriott New York Downtown,Moderne Hotel,Hampton Inn Times Square Central,Renaissance New York Chelsea Hotel
score,5.6,6.6,4.3,7.1,7.6,5.5,8.0,4.9,8.2,6.5,...,8.6,8.3,7.3,7.2,7.5,8.2,8.5,7.9,8.3,8.0
reviews,2450.0,13601.0,98.0,139.0,2062.0,2360.0,1307.0,44.0,4369.0,437.0,...,354.0,1762.0,201.0,3477.0,346.0,632.0,1721.0,952.0,2987.0,913.0
room_type,Standard Double (No Window),Bunk Bed Room with Shared Bathroom,Deluxe Queen Room,Small Vintage Pod - All Gender,Single Bed in Female Dormitory Room,King Room - Non-Smoking,Standard Studio,Double Room,Economy Pod in 6 Bed Mixed Dorm,Double Room with Shared Bathroom,...,King Room,Standard Queen Room,Queen Room,King Room - Non-Smoking,Queen Room,Deluxe Room,King Guest Room - One King Bed - Non-Smoking,Deluxe Room,King Room - Non-Smoking,"Guest Room, 1 King"
bed_type,New YorkShow on map5.7 km from centreMetro ac...,New YorkShow on map350 m from centreMetro access,New YorkShow on map9.9 km from centreMetro ac...,New YorkShow on map5.2 km from centreMetro ac...,New YorkShow on map3.8 km from centreMetro ac...,New YorkShow on map1.9 km from centreMetro ac...,New YorkShow on map9.8 km from centreMetro ac...,New YorkShow on map1.9 km from centreMetro ac...,New YorkShow on map300 m from centreMetro access,New YorkShow on map2.4 km from centreMetro ac...,...,New YorkShow on map7 km from centreMetro access,New YorkShow on map1.2 km from centreMetro ac...,New YorkShow on map5.7 km from centreMetro ac...,New YorkShow on map2.8 km from centreMetro ac...,New YorkShow on map1.6 km from centreMetro ac...,New YorkShow on map1.8 km from centreMetro ac...,New YorkShow on map7.2 km from centreMetro ac...,New YorkShow on map350 m from centreMetro access,New YorkShow on map1.5 km from centreMetro ac...,New YorkShow on map2.8 km from centreMetro ac...
price,129.0,130.0,157.0,188.0,197.0,199.0,204.0,210.0,211.0,243.0,...,1420.0,1420.0,1421.0,1421.0,1423.0,1423.0,1427.0,1427.0,1428.0,1432.0
availability,"1 night, 2 adults","1 night, 2 adults","1 night, 2 adults","1 night, 2 adults","1 night, 2 adults","1 night, 2 adults","1 night, 2 adults","1 night, 2 adults","1 night, 2 adults","1 night, 2 adults",...,"5 nights, 2 adults","5 nights, 2 adults","5 nights, 2 adults","5 nights, 2 adults","5 nights, 2 adults","5 nights, 2 adults","5 nights, 2 adults","5 nights, 2 adults","5 nights, 2 adults","5 nights, 2 adults"
los,1,1,1,1,1,1,1,1,1,1,...,5,5,5,5,5,5,5,5,5,5
url,https://www.booking.com/hotel/us/us-pacific.en...,https://www.booking.com/hotel/us/west-side-ymc...,https://www.booking.com/hotel/us/moca-nyc.en-g...,https://www.booking.com/hotel/us/the-nolita-ex...,https://www.booking.com/hotel/us/kama-central-...,https://www.booking.com/hotel/us/inn-midtown-w...,https://www.booking.com/hotel/us/radio.en-gb.h...,https://www.booking.com/hotel/us/hudson-new-yo...,https://www.booking.com/hotel/us/nap-york-new-...,https://www.booking.com/hotel/us/incentra-home...,...,https://www.booking.com/hotel/us/gild-hall-tho...,https://www.booking.com/hotel/us/45-times-squa...,https://www.booking.com/hotel/us/jg-sohotel.en...,https://www.booking.com/hotel/us/heritage-new-...,https://www.booking.com/hotel/us/distrikt-new-...,https://www.booking.com/hotel/us/fitzpatrick-g...,https://www.booking.com/hotel/us/ac-new-york-d...,https://www.booking.com/hotel/us/monderne.en-g...,https://www.booking.com/hotel/us/hampton-inn-t...,https://www.booking.com/hotel/us/renaissance-n...
star_rating,1.0,3.0,3.0,,1.0,3.0,4.0,,1.0,1.0,...,4.0,3.0,3.0,3.0,4.0,4.0,4.0,4.0,3.0,4.0
