In [4]:
import sys
import json
import csv
import yaml

import importlib

import pandas as pd
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt
import os
from dotenv import load_dotenv

from datetime import time
from datetime import date
from datetime import datetime
# with the above choices, the imported datetime.time(2023,07,01) is recognized
# from datetime import date
# from datetime import datetime

import pprint

import psycopg2
from sqlalchemy import create_engine, text as sql_text

# Create an utilities file util.py in a folder benchmarking and import it
sys.path.append('helper_functions/')
# import util as util
import util

In [6]:
from pymongo import MongoClient

client = MongoClient()
db = client.airbnb

collection = db.listings_with_reviews_and_cal

In [25]:

db.listings_with_reviews_m.create_index('id')
db.listings_with_calendar.create_index('id')

'id_1'

In [2]:
superlative_words = [
    'astounding',
    'amazing',
    'awesome',
    'excellent',
    'exceptional',
    'extraordinary',
    'fantastic',
    'great',
    'magnificent',
    'splendid',
    'wonderful'
]


super_negative_words = [
    'aweful',
    'horrible',
    'terrible'
]

### Query 5 Pos

In [11]:
regex = '|'.join(superlative_words)

query_5_pos_condition = {
    'reviews.comments': {
        '$regex': regex,
        '$options': 'i'  
    }
}


time1 = datetime.now()

query_5_pos_result = collection.find(query_5_pos_condition)
time2 = datetime.now()
print(f'The time taken for the selection was {(time2 - time1).total_seconds()} seconds.')

time3 = datetime.now()
query_5_pos_list = list(query_5_pos_result)
time4 = datetime.now()
print(f'The time taken to create the list was {(time4 - time3).total_seconds()} seconds.')
print(f'Number of listings found: {len(query_5_pos_list)}')

The time taken for the selection was 0.0 seconds.
The time taken to create the list was 17.360703 seconds.
Number of listings found: 25196


### Query 5 neg


In [10]:
regex = '|'.join(super_negative_words)

# Define the query condition
query_5_neg_condition = {
    'reviews.comments': {
        '$regex': regex,
        '$options': 'i' 
    }
}

time1 = datetime.now()
query_5_neg_result = collection.find(query_5_neg_condition)
time2 = datetime.now()
print(f'The time taken for the selection was {(time2 - time1).total_seconds()} seconds.')


time3 = datetime.now()
query_5_neg_list = list(query_5_neg_result)
time4 = datetime.now()
print(f'The time taken to create the list was {(time4 - time3).total_seconds()} seconds.')

print(f'Number of listings found: {len(query_5_neg_list)}')

The time taken for the selection was 0.0 seconds.
The time taken to create the list was 7.203021 seconds.
Number of listings found: 1672


## Query 6 pos

In [14]:
query_6_pos_condition = {
    '$and': [
        query_5_pos_condition,
        {'dates_list.date': {'$gte': datetime(2025, 2, 1)}},  # Listings available on or after February 1, 2025
        {'average_price': {'$lte': 200}}  # Average price of the listing is $200 or less
    ]
}


time1 = datetime.now()
query_6_pos_result = collection.find(query_6_pos_condition)
time2 = datetime.now()
print(f'The time taken for the selection was {(time2 - time1).total_seconds()} seconds.')

time3 = datetime.now()
query_6_pos_list = list(query_6_pos_result)
time4 = datetime.now()
print(f'The time taken to create the list was {(time4 - time3).total_seconds()} seconds.')

print(f'Number of listings found: {len(query_6_pos_list)}')

The time taken for the selection was 0.0 seconds.
The time taken to create the list was 16.616319 seconds.
Number of listings found: 19228


## Query 6 neg

In [15]:
query_6_neg_condition = {
    '$and': [
        query_5_neg_condition,
        {'dates_list.date': {'$gte': datetime(2025, 2, 1)}}, 
        {'average_price': {'$lte': 200}} 
    ]
}

time1 = datetime.now()
query_6_neg_result = collection.find(query_6_neg_condition)
time2 = datetime.now()
print(f'The time taken for the selection was {(time2 - time1).total_seconds()} seconds.')

time3 = datetime.now()
query_6_neg_list = list(query_6_neg_result)
time4 = datetime.now()
print(f'The time taken to create the list was {(time4 - time3).total_seconds()} seconds.')

print(f'Number of listings found: {len(query_6_neg_list)}')

The time taken for the selection was 0.0 seconds.
The time taken to create the list was 5.536341 seconds.
Number of listings found: 1281


### Index Creation of comments in all of the reviews  arryas

In [17]:

time1 = datetime.now()
index_name = db.listings_with_reviews_and_cal.create_index({'reviews.comments': 'text'})
time2 = datetime.now()
print(f'The time taken for the review.comments index creation was {(time2 - time1).total_seconds()} seconds.')

The time taken for the review.comments index creation was 37.550677 seconds.


In [18]:
cursor = db.listings_with_reviews_and_cal.index_information()
cursor1 = db.listings_with_reviews_and_cal.list_indexes()
for i in cursor:
    print(i)

print()
for i in cursor1:
    print(i)

_id_
reviews.comments_text

SON([('v', 2), ('key', SON([('_id', 1)])), ('name', '_id_')])
SON([('v', 2), ('key', SON([('_fts', 'text'), ('_ftsx', 1)])), ('name', 'reviews.comments_text'), ('weights', SON([('reviews.comments', 1)])), ('default_language', 'english'), ('language_override', 'language'), ('textIndexVersion', 3)])


### Query 7 Pos

In [19]:
text_search = ' '.join(superlative_words)
condition_ind = {'$text': {'$search': text_search}}


time1 = datetime.now()
query_7_pos_result = collection.find(condition_ind)
time2 = datetime.now()
print(f'The time taken for the selection was {(time2 - time1).total_seconds()} seconds.')

time3 = datetime.now()
query_7_pos_list = list(query_7_pos_result)
time4 = datetime.now()
print(f'The time taken to create the list was {(time4 - time3).total_seconds()} seconds.')
print(f'Number of listings found: {len(query_7_pos_list)}')

The time taken for the selection was 0.0 seconds.
The time taken to create the list was 18.096243 seconds.
Number of listings found: 25197


### Query 7 Neg

In [20]:
text_search = ' '.join(super_negative_words)
condition_ind = {'$text': {'$search': text_search}}


time1 = datetime.now()
query_7_neg_result = collection.find(condition_ind)
time2 = datetime.now()
print(f'The time taken for the selection was {(time2 - time1).total_seconds()} seconds.')

time3 = datetime.now()
query_7_neg_list = list(query_7_neg_result)
time4 = datetime.now()
print(f'The time taken to create the list was {(time4 - time3).total_seconds()} seconds.')
print(f'Number of listings found: {len(query_7_neg_list)}')

The time taken for the selection was 0.0 seconds.
The time taken to create the list was 1.578671 seconds.
Number of listings found: 1930
