In [37]:
import json
import pandas as pd
from collections import defaultdict

## Trip Advisor

This section conducts a simple exploratory data analysis of the trip advisor dataset, along with the supplementary data that was scraped.

In [11]:
with open("../data/cmu/offering.txt") as f:
    offerings = [json.loads(jline) for jline in f.read().splitlines()]


with open("../data/cmu/review.txt") as f:
    reviews = [json.loads(jline) for jline in f.read().splitlines()]


with open('../data/cmu/trip_advisor_scraped.json', "r") as outfile:
    scraped_data = json.load(outfile)

### Offerings

In [18]:
len(offerings)

4333

In [14]:
offerings[0]

{'hotel_class': 4.0,
 'region_id': 60763,
 'url': 'http://www.tripadvisor.com/Hotel_Review-g60763-d113317-Reviews-Casablanca_Hotel_Times_Square-New_York_City_New_York.html',
 'phone': '',
 'details': None,
 'address': {'region': 'NY',
  'street-address': '147 West 43rd Street',
  'postal-code': '10036',
  'locality': 'New York City'},
 'type': 'hotel',
 'id': 113317,
 'name': 'Casablanca Hotel Times Square'}

In [17]:
unique_types = {offering['type'] for offering in offerings}
unique_types

{'hotel'}

In [57]:
{offering["address"]["region"] for offering in offerings}

{'AZ',
 'CA',
 'CO',
 'DC',
 'FL',
 'IL',
 'IN',
 'MA',
 'MD',
 'MI',
 'NC',
 'NY',
 'OH',
 'PA',
 'TN',
 'TX',
 'WA'}

In [58]:
{offering["address"]["locality"] for offering in offerings}

{'Austin',
 'Baltimore',
 'Boston',
 'Charlotte',
 'Chicago',
 'Columbus',
 'Dallas',
 'Denver',
 'Detroit',
 'El Paso',
 'Fort Worth',
 'Houston',
 'Indianapolis',
 'Jacksonville',
 'Los Angeles',
 'Memphis',
 'New York City',
 'Philadelphia',
 'Phoenix',
 'San Antonio',
 'San Diego',
 'San Francisco',
 'San Jose',
 'Seattle',
 'Washington DC'}

### Reviews

In [30]:
len(reviews)

878561

In [31]:
reviews[0]

{'ratings': {'service': 5.0,
  'cleanliness': 5.0,
  'overall': 5.0,
  'value': 5.0,
  'location': 5.0,
  'sleep_quality': 5.0,
  'rooms': 5.0},
 'title': '“Truly is "Jewel of the Upper Wets Side"”',
 'text': 'Stayed in a king suite for 11 nights and yes it cots us a bit but we were happy with the standard of room, the location and the friendliness of the staff. Our room was on the 20th floor overlooking Broadway and the madhouse of the Fairway Market. Room was quite with no noise evident from the hallway or adjoining rooms. It was great to be able to open windows when we craved fresh rather than heated air. The beds, including the fold out sofa bed, were comfortable and the rooms were cleaned well. Wi-fi access worked like a dream with only one connectivity issue on our first night and this was promptly responded to with a call from the service provider to ensure that all was well. The location close to the 72nd Street subway station is great and the complimentary umbrellas on the dri

In [65]:
rating_categories = defaultdict(lambda: [0, 0, 0])
for review in reviews:
    for category in review["ratings"]:
        rating_categories[category][1] = max(rating_categories[category][1], review["ratings"][category])
        rating_categories[category][2] = min(rating_categories[category][2], review["ratings"][category])
        rating_categories[category][0] +=  1
for k, v in rating_categories.items():
    print(f"{k}: {v}")

service: [760918, 5.0, 0]
cleanliness: [759835, 5.0, 0]
overall: [878561, 5.0, 0]
value: [753695, 5.0, 0]
location: [664904, 5.0, 0]
sleep_quality: [500903, 5.0, 0]
rooms: [705404, 5.0, 0]
check_in_front_desk: [99784, 5.0, 0]
business_service_(e_g_internet_access): [65729, 5.0, 0]


### Scraped data

In [36]:
scraped_data[0]

{'url': 'https://www.tripadvisor.com/Hotel_Review-g60763-d208454-Reviews-Sofitel_New_York-New_York_City_New_York.html',
 'location': '45 West 44th Street, New York City, NY 10036',
 'city': 'New York City',
 'country': 'United States',
 'attractions': ['Gabriel Kreuther',
  "Tony's Di Napoli - Midtown",
  'Megan’s Bar & Kitchen',
  'Hatsuhana Sushi Restaurant',
  'Broadway',
  'Grand Central Terminal',
  'Bryant Park',
  'New York Public Library']}

## AirBnB

This section conducts a simple exploratory data analysis of the airbnb dataset.

### Calendar

In [43]:
calendar_df = pd.read_csv("../data/inside/calendar.csv")
calendar_df.head()

Unnamed: 0,listing_id,date,available,price,adjusted_price,minimum_nights,maximum_nights
0,2818,2023-09-04,f,$69.00,$69.00,3,1125
1,2818,2023-09-05,t,$69.00,$69.00,3,1125
2,2818,2023-09-06,f,$69.00,$69.00,3,1125
3,2818,2023-09-07,f,$69.00,$69.00,3,1125
4,2818,2023-09-08,f,$69.00,$69.00,3,1125


In [44]:
calendar_df["available"].value_counts()

available
f    2365451
t     695440
Name: count, dtype: int64

In [50]:
calendar_df["date"].max(), calendar_df["date"].min()

('2024-09-02', '2023-09-03')

### Listings

In [51]:
listings_df = pd.read_csv("../data/inside/listings.csv")
listings_df.head()

Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,761411,https://www.airbnb.com/rooms/761411,20230903194145,2023-09-03,city scrape,Condo in Amsterdam · ★4.74 · 1 bedroom · 1 bed...,"Really quiet, spacious and safe, a nice place ...","The neighborhood is very green, quiet, safe an...",https://a0.muscache.com/pictures/10591508/bf30...,4013546,...,4.92,4.42,4.65,0363 D4AD DCF3 E72A 56A2,t,2,0,2,0,2.3
1,768274,https://www.airbnb.com/rooms/768274,20230903194145,2023-09-04,city scrape,Rental unit in Amsterdam · ★4.83 · 1 bedroom ·...,Our cool and spacious loft is perfect for a st...,Our neighbourhood is ons of the most beautiful...,https://a0.muscache.com/pictures/27206847/31f5...,3678216,...,4.94,4.86,4.77,0363 7A50 18E7 51D1 B7F9,t,1,1,0,0,0.64
2,768737,https://www.airbnb.com/rooms/768737,20230903194145,2023-09-03,city scrape,Boat in Amsterdam · ★4.82 · 1 bedroom · 1 bed ...,Room to rent in my houseboat. The room has a p...,It is just outside the Jordan in between the c...,https://a0.muscache.com/pictures/1df40445-354f...,3877342,...,4.87,4.72,4.68,036396BE30827DDB9575,t,3,1,2,0,2.73
3,771217,https://www.airbnb.com/rooms/771217,20230903194145,2023-09-04,previous scrape,Houseboat in Amsterdam · ★5.0 · 3 bedrooms · 3...,"Spacious houseboat in Amsterdam, suitable for ...",The houseboat lies in an area with ± 200 house...,https://a0.muscache.com/pictures/57715927/0808...,4068486,...,5.0,4.67,4.56,0363 D807 AD6C 499A F871,f,1,1,0,0,0.11
4,771343,https://www.airbnb.com/rooms/771343,20230903194145,2023-09-03,city scrape,Rental unit in Amsterdam · ★4.89 · 1 bedroom ·...,Royal Bed & Coffee Room with a very comfortabl...,The building is located in Amsterdam centre in...,https://a0.muscache.com/pictures/ea22d262-7456...,2313061,...,4.91,4.96,4.79,0363 8C61 E9B9 5582 913E,f,2,0,2,0,6.62


In [54]:
listings_df.columns

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'source', 'name',
       'description', 'neighborhood_overview', 'picture_url', 'host_id',
       'host_url', 'host_name', 'host_since', 'host_location', 'host_about',
       'host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_thumbnail_url', 'host_picture_url',
       'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
       'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms',
       'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
       'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
       'maximum_minimum_nights', 'minimum_maximum_nights',
       'maximum_maximum_nights', 'minimum_nights_avg_ntm',
       'maximum_nights_avg_ntm', 'ca

### Reviews

In [55]:
reviews_df = pd.read_csv("../data/inside/reviews.csv")
reviews_df.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,2818,1191,2009-03-30,10952,Lam,Daniel is really cool. The place was nice and ...
1,2818,1771,2009-04-24,12798,Alice,Daniel is the most amazing host! His place is ...
2,2818,1989,2009-05-03,11869,Natalja,We had such a great time in Amsterdam. Daniel ...
3,2818,2797,2009-05-18,14064,Enrique,Very professional operation. Room is very clea...
4,2818,3151,2009-05-25,17977,Sherwin,Daniel is highly recommended. He provided all...


In [56]:
reviews_df["date"].max(), reviews_df["date"].min()

('2023-09-03', '2009-03-30')