# Introduction

This notebook contains the code used to process the raw data into JSON files that are uploaded to MongoDB. The sections appear in reverse order of execution: run the last section (Process raw trip advisor data into jsons for mongodb) first, then work upwards through the earlier sections.

In [2]:
import json
import uuid
from collections import defaultdict
from datetime import datetime
from random import randint
from time import gmtime, localtime, strftime

## Update reviews

Add reviewer_id and listing_id to the reviews collection for easy reference to the user and accommodation collections. Also convert the review dates from the form "December 17, 2012" to "2012-12-17" (YYYY-MM-DD), a format that MongoDB can compare easily.

In [3]:
# Load the three Trip Advisor collections that this section cross-references.
with open("db_data/mongo_trip_advisor/reviews.json", mode="r") as reviews_file:
    reviews = json.load(reviews_file)

with open("db_data/mongo_trip_advisor/accomodations.json", mode="r") as accoms_file:
    accoms = json.load(accoms_file)

with open("db_data/mongo_trip_advisor/users.json", mode="r") as users_file:
    users = json.load(users_file)

In [7]:
# Index the reviews by _id so the cells below can look a review up directly.
# If two reviews share an _id, the later one replaces the earlier one.
rev = {}
for review in reviews:
    rev[review["_id"]] = review

In [9]:
# Stamp every review listed under a user's "reviews" with that user's _id.
for reviewer in users:
    for review_id in reviewer["reviews"]:
        rev[review_id]["reviewer_id"] = reviewer["_id"]

In [11]:
# Stamp every review listed under an accommodation's "reviews" with that
# accommodation's _id.
for listing in accoms:
    for review_id in listing["reviews"]:
        rev[review_id]["listing_id"] = listing["_id"]

In [19]:
# Flatten the _id -> review index back into a plain list of review documents.
new_reviews = [*rev.values()]

In [31]:
# Rewrite each review date from the "December 17, 2012" form to "2012-12-17".
# reviews.json is read and then overwritten by this section, so on a re-run the
# dates are already converted; such dates are validated and kept, which makes
# the cell safe to run more than once. A date in neither format still raises
# ValueError.
for review in new_reviews:
    raw_date = review["date"]
    try:
        parsed_date = datetime.strptime(raw_date, "%B %d, %Y")
    except ValueError:
        parsed_date = datetime.strptime(raw_date, "%Y-%m-%d")
    review["date"] = parsed_date.strftime("%Y-%m-%d")

In [33]:
# Overwrite the reviews collection on disk with the enriched documents.
# (The path has no placeholders, so it is a plain string rather than an f-string.)
with open("db_data/mongo_trip_advisor/reviews.json", 'w') as f:
    json.dump(new_reviews, f)

# Add neighbourhood_city to accoms
Add to each accommodation the ID of the neighbourhood (city) in which it is located, for easy reference to the neighbourhood collection.

In [54]:
# Load the accommodations together with the neighbourhoods they will be linked to.
with open("db_data/mongo_trip_advisor/accomodations.json", mode="r") as accoms_file:
    accoms = json.load(accoms_file)

with open("db_data/mongo_trip_advisor/neighborhoods.json", mode="r") as neighborhoods_file:
    neighborhoods = json.load(neighborhoods_file)

In [49]:
# Build the city -> neighbourhood _id lookup once instead of rescanning the
# whole neighbourhood list for every accommodation. If a city name occurs in
# more than one neighbourhood entry, the last entry wins, exactly as in a full
# scan without a break.
city_to_neighbourhood_id = {n["city"]: n["_id"] for n in neighborhoods}

for accom in accoms:
    # The city is taken from the second-to-last comma-separated field of the
    # address string.
    city = accom["address"].split(",")[-2].strip()
    # Accommodations whose city has no neighbourhood entry are left unchanged.
    if city in city_to_neighbourhood_id:
        accom["neighbourhood_city"] = city_to_neighbourhood_id[city]

In [52]:
# Write the accommodations, now carrying neighbourhood_city, back to disk.
with open("db_data/mongo_trip_advisor/accomodations.json", mode="w") as accoms_file:
    json.dump(accoms, accoms_file)

## Check if trip advisor and airbnb data is the same format

Just a sanity check to ensure that the trip advisor and airbnb data have the same format.

In [12]:
# Load the accommodation collection of each source for a side-by-side look:
# accom1 comes from Airbnb, accom2 from Trip Advisor.
with open("db_data/mongo_airbnb/accommodation.json", mode="r") as airbnb_file:
    accom1 = json.load(airbnb_file)

with open("db_data/mongo_trip_advisor/accomodations.json", mode="r") as trip_advisor_file:
    accom2 = json.load(trip_advisor_file)

In [3]:
# Display the first Airbnb accommodation document to inspect its fields.
accom1[0]

{'_id': 3314819,
 'url': 'https://www.airbnb.com/rooms/3314819',
 'name': 'Home in Asheville · ★4.75 · 1 bedroom · 1 bed · 1 private bath',
 'address': 'Asheville, North Carolina, United States',
 'address_detailed': '35.49233, -82.51495',
 'host': {'name': 'Allison & Peter',
  'host_since': '2014-06-02',
  'host_description': 'Hello! My husband and I are looking forward to hosting guests in our home as well as being a guest of others during our travels. We became AirBnB hosts in 2014 and have since hosted more than 2,000 bookings. In 2018, we became AirBnB superhosts!\n\nWe live in Asheville in the beautiful mountains of western North Carolina.\n\nAllison works as a fourth grade teacher at Mills River Elementary School. Peter works as the Trails Coordinator at a regional land trust, Conserving Carolina. Both of us work to keep this region we love as wonderful as it is today, and make it even better for future generations.\n\nWe love to take road trips throughout the year. We have appr

In [57]:
# Load the neighbourhood collection of each source:
# neighborhood1 comes from Airbnb, neighborhood2 from Trip Advisor.
with open("db_data/mongo_airbnb/neighbourhood.json", mode="r") as airbnb_file:
    neighborhood1 = json.load(airbnb_file)

with open("db_data/mongo_trip_advisor/neighborhoods.json", mode="r") as trip_advisor_file:
    neighborhood2 = json.load(trip_advisor_file)

In [70]:
# Display the Airbnb neighbourhood document at index 4 to inspect its fields.
neighborhood1[4]

{'_id': {'$oid': '654a2db4936008a88bb201a9'},
 'city': 'Broward County',
 'country': 'United States',
 'attractions': ['Broward Center for the Performing Arts',
  'Westfield Broward',
  'Greater Fort Lauderdale & Broward County Convention Center',
  'Broward County Main Library',
  'Sawgrass Mills',
  'Bass Pro Shops',
  'Broward County Transit Division',
  'Broward Stage Door Theatre',
  'John Pennekamp Coral Reef State Park',
  'Sebastian Inlet State Park',
  'The Boys Farmers Market',
  'Miami Beach Convention Center',
  'Hialeah Racetrack',
  'Palm Beach County Convention Center',
  'Fern Forest Nature Center',
  'Lake Worth Playhouse',
  'Tree Tops Park',
  'Muvico Broward 18 Theater',
  'Ultimate Florida Tours',
  'Las Olas Boulevard',
  'Hollywood Beach',
  'Hollywood Beach Broadwalk',
  'Gulfstream Park Racing and Casino',
  'Las Olas Beach',
  'Fort Lauderdale Beach',
  'Seminole Hard Rock Hollywood Casino',
  'Anne Kolb Nature Center',
  'Wild Lime Adventures',
  'Bonnet Hous

In [59]:
# Display the first Trip Advisor neighbourhood document for comparison.
neighborhood2[0]

{'_id': '06da9cc4-dcae-45ae-a07a-100a4546863b',
 'city': 'Baltimore',
 'country': 'United States',
 'attractions': ['Inner Harbor',
  'Sherwood Gardens',
  'Sotto Sopra',
  'Hippodrome Theatre',
  'Baltimore Farmers Market and Bazaar',
  'The Charmery',
  'Baltimore Wicked History Tours',
  'Chaps Pit Beef',
  'Red Brick Station',
  'Little Italy',
  'Man vs Fries',
  'Red Brick Station Restaurant',
  "Gertrude's",
  'The Land of Kush',
  "Dalesio's of Little Italy Restaurant",
  'Fleet Street Spirits',
  'Tov Pizza',
  "Papa John's Pizza",
  'Samos Restaurant',
  'Panera Bread',
  'Oriole Park at Camden Yards',
  'The Prime Rib',
  'Horseshoe Casino',
  'Maryland Science Center',
  'Primo Chicken',
  "Mama's On the Half Shell",
  'Babe Ruth Birthplace and Museum',
  "Pizza Boli's",
  "Schultz's Crab House",
  'Saturday Morning Cafe',
  'Ledo Pizza',
  'Patisserie Poupon',
  'USS Torsk',
  "Michael's Steak & Lobster HSE",
  'Pitango Gelato',
  'Seasons Pizza Essex',
  'Pratt Street Ale

In [2]:
# Load the reviews collection of each source:
# reviews1 comes from Airbnb, reviews2 from Trip Advisor.
with open("db_data/mongo_airbnb/reviews.json", mode="r") as airbnb_file:
    reviews1 = json.load(airbnb_file)

with open("db_data/mongo_trip_advisor/reviews.json", mode="r") as trip_advisor_file:
    reviews2 = json.load(trip_advisor_file)

In [63]:
# Display the first Airbnb review document to inspect its fields.
reviews1[0]

{'_id': 553741,
 'listing_id': 108061,
 'date': '2011-09-21',
 'reviewer_id': 822907,
 'text': 'Lisa is superb hostess, she will treat you like family and provide you with the coziest little home in Asheville which will definitely enhance your experience of the magical town! Just like the Eco-retreat, the Private sunny apartment is a neat little flat with all you need for up to 3 people, the place was impeccable in lovely neighborhood. You can hardly beat this one!',
 'title': 'Review by Pedro & Katie',
 'overall_score': 4.2904731043709985,
 'cleanliness_score': 4.299846133212141,
 'service_score': 3.9317781425743314,
 'value_score': 4.912669318788565,
 'location_score': 4.810791486549641,
 'sleepquality_score': 3.9582933598643137,
 'checkin_score': 4.1250927018295735,
 'communication_score': 4.13585001230414}

In [65]:
# Display the Trip Advisor review document at index 1 for comparison.
reviews2[1]

{'_id': 'bf188bdc-c872-4ce0-bb1d-37a33861e006',
 'date': 'December 17, 2012',
 'title': '“My home away from home!”',
 'text': 'On every visit to NYC, the Hotel Beacon is the place we love to stay. So conveniently located to Central Park, Lincoln Center and great local restaurants. The rooms are lovely - beds so comfortable, a great little kitchen and new wizz bang coffee maker. The staff are so accommodating and just love walking across the street to the Fairway supermarket with every imaginable goodies to eat (if you choose not to go out for every meal!)',
 'overall_score': 5.0,
 'cleanliness_score': 5.0,
 'service_score': 5.0,
 'value_score': 5.0,
 'location_score': 5.0,
 'sleepquality_score': 5.0,
 'checkin_score': 0.0,
 'communication_score': 2.0}

In [3]:
# Load the transactions collection of each source:
# transactions1 comes from Airbnb, transactions2 from Trip Advisor.
with open("db_data/mongo_airbnb/transactions.json", mode="r") as airbnb_file:
    transactions1 = json.load(airbnb_file)

with open("db_data/mongo_trip_advisor/transactions.json", mode="r") as trip_advisor_file:
    transactions2 = json.load(trip_advisor_file)

In [66]:
# Display the first Airbnb transaction document to inspect its fields.
transactions1[0]

{'_id': {'$oid': '654a26b4936008a88bb1b9cc'},
 'date_start': 'May 02, 2016',
 'date_end': 'May 03, 2016',
 'price': 790,
 'review_id': 14717244,
 'accomodation_id': 3314819}

In [67]:
# Display the first Trip Advisor transaction document for comparison.
transactions2[0]

{'_id': '81e556be-619b-4426-8c8c-2deeceae5f4f',
 'date_start': 'September 12, 2019',
 'date_end': 'September 18, 2019',
 'price': 1488,
 'review_id': '24019ac8-cd37-4ee6-ada1-cfde73f89fbd',
 'accomodation_id': '71efac41-11df-48fe-9560-f021d5ef7a6d'}

In [62]:
# Load the user collection of each source:
# users1 comes from Airbnb, users2 from Trip Advisor.
with open("db_data/mongo_airbnb/user.json", mode="r") as airbnb_file:
    users1 = json.load(airbnb_file)

with open("db_data/mongo_trip_advisor/users.json", mode="r") as trip_advisor_file:
    users2 = json.load(trip_advisor_file)

In [68]:
# Display the first Airbnb user document to inspect its fields.
users1[0]

{'_id': 16296123,
 'name': 'Allison & Peter',
 'location': {'$numberDouble': 'NaN'},
 'host': {'host_since': '2014-06-02',
  'host_description': 'Hello! My husband and I are looking forward to hosting guests in our home as well as being a guest of others during our travels. We became AirBnB hosts in 2014 and have since hosted more than 2,000 bookings. In 2018, we became AirBnB superhosts!\n\nWe live in Asheville in the beautiful mountains of western North Carolina.\n\nAllison works as a fourth grade teacher at Mills River Elementary School. Peter works as the Trails Coordinator at a regional land trust, Conserving Carolina. Both of us work to keep this region we love as wonderful as it is today, and make it even better for future generations.\n\nWe love to take road trips throughout the year. We have appreciated the kindness of many over the years who have let us stay in their home or camp in their yard, or have rented a room to us for a reasonable rate. We are guests who are focused o

In [69]:
# Display the first Trip Advisor user document for comparison.
users2[0]

{'_id': 'e2263dda-df2e-4a1a-b22f-2c6844ecfcac',
 'name': 'canicetravels',
 'location': 'Chicago, Illinois',
 'reviews': ['c788ec93-4cdc-4ba8-bfbf-7582dd97480d']}

# Process raw trip advisor data into jsons for mongodb

Original processing of trip advisor data into json collections for mongoDB.

In [2]:
# offering.txt and review.txt are parsed as one JSON document per line.
# The files are iterated line by line rather than split with str.splitlines():
# splitlines() also breaks on characters such as U+2028, U+2029 and U+0085,
# which may appear unescaped inside a JSON string and would cut a record in
# two. Blank lines are skipped instead of crashing json.loads.
with open("data/cmu/offering.txt") as f:
    offering = [json.loads(jline) for jline in f if jline.strip()]


with open("data/cmu/review.txt") as f:
    old_reviews = [json.loads(jline) for jline in f if jline.strip()]


# The scraped file is a single JSON document; later cells read "city",
# "country" and "attractions" from each of its entries.
with open('data/cmu/trip_advisor_scraped.json', "r") as f:
    data = json.load(f)

In [4]:
####### REVIEWS
# (output field, key in the raw "ratings" dict) for every optional sub-score,
# in the order the fields are written to each review document.
# NOTE(review): the original code flagged checkin and communication with
# "########" -- presumably because the raw data does not provide them and they
# end up random; confirm.
_OPTIONAL_SCORES = (
    ("cleanliness_score", "cleanliness"),
    ("service_score", "service"),
    ("value_score", "value"),
    ("location_score", "location"),
    ("sleepquality_score", "sleep_quality"),
    ("checkin_score", "checkin"),
    ("communication_score", "communication"),
)

reviews = []
user_review = defaultdict(list)   # (username, location) -> _ids of that author's reviews
accom_review = defaultdict(list)  # offering_id -> _ids of that offering's reviews
for review in old_reviews:
    ratings = review["ratings"]
    review_doc = {
        "_id": str(uuid.uuid4()),
        "date": review["date"],
        "title": review["title"],
        "text": review["text"],
        "overall_score": ratings["overall"],
    }
    # A missing sub-score is replaced by a random whole number from 0.0 to 5.0.
    for field, rating_key in _OPTIONAL_SCORES:
        review_doc[field] = ratings.get(rating_key, float(randint(0, 5)))

    author = review["author"]
    author_key = (author["username"], author["location"])
    user_review[author_key].append(review_doc["_id"])
    accom_review[review["offering_id"]].append(review_doc["_id"])
    reviews.append(review_doc)

####### USERS
# user_review already holds exactly one key per distinct (username, location)
# pair seen in old_reviews, so the user documents are built straight from it.
# This avoids a second pass over old_reviews and emits the users in first-seen
# order rather than in set (hash) order, which changes between runs.
users = [
    {"_id": str(uuid.uuid4()), "name": username, "location": location, "reviews": review_ids}
    for (username, location), review_ids in user_review.items()
]

####### NEIGHBORHOOD
# Group the scraped rows by (city, country) in a single pass instead of
# rescanning `data` once per place. Plain dicts are used as insertion-ordered
# sets, so neighbourhoods and their de-duplicated attractions come out in
# first-seen order on every run rather than in hash order.
attractions_by_place = {}
for row in data:
    place_attractions = attractions_by_place.setdefault((row["city"], row["country"]), {})
    for att in row["attractions"]:
        place_attractions[att] = None

# One neighbourhood document per distinct (city, country) pair.
neighborhood = [
    {
        "_id": str(uuid.uuid4()),
        "city": city,
        "country": country,
        "attractions": list(unique_attractions),
    }
    for (city, country), unique_attractions in attractions_by_place.items()
]

####### ACCOMS
def str_address(a):
    """Format an address dict as "street, locality, region postal-code".

    Reads the keys "street-address", "locality", "region" and "postal-code"
    from ``a``; a missing (or None) component is treated as empty.

    The two commas are always emitted, even when street or locality is empty,
    so the locality stays the second-to-last comma-separated field. Region and
    postal code are joined by a single space, and a missing one leaves no
    stray double or trailing space behind.

    Returns:
        The formatted address string.
    """
    street = a.get("street-address") or ""
    locality = a.get("locality") or ""
    region_and_postcode = " ".join(
        str(part) for part in (a.get("region"), a.get("postal-code")) if part
    )
    # rstrip drops the space after the last comma when region and postal code
    # are both missing.
    return f"{street}, {locality}, {region_and_postcode}".rstrip()


def get_city_uuid(city):
    """Return the _id of the first entry in ``neighborhood`` whose city is ``city``.

    Only the city name is compared; the entry's country is not considered.

    Raises:
        ValueError: if no entry in ``neighborhood`` has that city.
    """
    match = next((entry for entry in neighborhood if entry["city"] == city), None)
    if match is None:
        raise ValueError("No city found")
    return match["_id"]


accoms = []
for listing in offering:
    # Some offerings have no hotel class or postal code: hotel_class falls back
    # to "", and the address is formatted by str_address().
    accoms.append({
        "_id": str(uuid.uuid4()),
        "hotel_class": listing.get("hotel_class", ""),
        "name": listing["name"],
        "url": listing["url"],
        "phone": listing["phone"],
        "address": str_address(listing["address"]),
        # Raises ValueError when the locality has no neighbourhood entry.
        "neighbourhood_city": get_city_uuid(listing["address"]["locality"]),
        "type": "hotel",
        # _ids of the reviews collected for this offering (empty list if none).
        "reviews": accom_review[listing["id"]],
    })

####### TRANSACTIONS
# One synthetic transaction is generated for every review of every accommodation.
SECONDS_PER_DAY = 86400
TRANSACTION_DATE_FORMAT = '%B %d, %Y'

transactions = []
for accom in accoms:
    aid = accom["_id"]
    for rid in accom["reviews"]:
        # Random start instant (epoch seconds, roughly Nov 2004 to Nov 2023)
        # and a stay of 1 to 14 days.
        start_epoch = randint(1100000000, 1700000000)
        duration = randint(1, 14)
        end_epoch = start_epoch + SECONDS_PER_DAY * duration
        transactions.append({
            "_id": str(uuid.uuid4()),
            # Dates are formatted in UTC. With local time, a daylight-saving
            # change inside the stay can move date_end by a calendar day (a
            # 1-day stay could start and end on the same date), and the result
            # would depend on the machine's time zone.
            "date_start": strftime(TRANSACTION_DATE_FORMAT, gmtime(start_epoch)),
            "date_end": strftime(TRANSACTION_DATE_FORMAT, gmtime(end_epoch)),
            # Random nightly price of 100 to 1000, multiplied by the length of the stay.
            "price": randint(100, 1000) * duration,
            "review_id": rid,
            "accomodation_id": aid,
        })
"""
reviews
users
neighborhood
accoms
transactions
"""

'\nreviews\nusers\nneighborhood\naccoms\ntransactions\n'

In [6]:
# Dump every collection built above to its own JSON file under db_data/.
# NOTE(review): the other sections of this notebook read these collections from
# db_data/mongo_trip_advisor/ -- presumably the files are moved there by hand
# after this cell runs; confirm.
output_collections = (
    ("reviews", reviews),
    ("users", users),
    ("neighborhoods", neighborhood),
    ("accomodations", accoms),
    ("transactions", transactions),
)

for collection_name, documents in output_collections:
    with open(f"db_data/{collection_name}.json", mode="w") as out_file:
        json.dump(documents, out_file)