In [1]:
# Colab-only step: files.upload() opens the browser file picker and blocks until
# the user has chosen the files. The recorded output below shows listings.csv and
# reviews.csv being saved under those names, which is where the next cell reads them.
# NOTE(review): this cell requires the google.colab runtime and manual interaction,
# so the notebook cannot be re-run unattended or outside Colab.
from google.colab import files
uploaded = files.upload()


Saving listings.csv to listings.csv
Saving reviews.csv to reviews.csv


In [2]:
import pandas as pd

# Load both CSVs into DataFrames with identical reader settings.
# low_memory=False is passed so pandas does not process the file in internal
# chunks, which avoids mixed-dtype inference on wide columns.
CSV_READ_OPTIONS = {"low_memory": False}

listings_df = pd.read_csv("listings.csv", **CSV_READ_OPTIONS)
reviews_df = pd.read_csv("reviews.csv", **CSV_READ_OPTIONS)


In [3]:
# Structural overview of both frames: (rows, columns) first, then the column names.
for label, value in (
    ("Listings Shape:", listings_df.shape),
    ("Reviews Shape:", reviews_df.shape),
    ("Listings Columns:", listings_df.columns.tolist()),
    ("Reviews Columns:", reviews_df.columns.tolist()),
):
    print(label, value)


Listings Shape: (18017, 79)
Reviews Shape: (642151, 6)
Listings Columns: ['id', 'listing_url', 'scrape_id', 'last_scraped', 'source', 'name', 'description', 'neighborhood_overview', 'picture_url', 'host_id', 'host_url', 'host_name', 'host_since', 'host_location', 'host_about', 'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood', 'host_listings_count', 'host_total_listings_count', 'host_verifications', 'host_has_profile_pic', 'host_identity_verified', 'neighbourhood', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude', 'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price', 'minimum_nights', 'maximum_nights', 'minimum_minimum_nights', 'maximum_minimum_nights', 'minimum_maximum_nights', 'maximum_maximum_nights', 'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'calendar_updated', 'has_availabil

In [4]:
# Distinct listing ids seen on each side of the listings <-> reviews relationship.
listings_ids = set(listings_df['id'])
reviews_listing_ids = set(reviews_df['listing_id'])

# Ids present in both frames, i.e. listings that have at least one review row.
common_ids = listings_ids & reviews_listing_ids

total_listings = len(listings_ids)
reviewed_listings = len(common_ids)

print("Total Listings:", total_listings)
print("Listings with Reviews:", reviewed_listings)
print("Percentage with Reviews:", reviewed_listings / total_listings * 100)

Total Listings: 18017
Listings with Reviews: 14983
Percentage with Reviews: 83.16034855969362


In [5]:
# Boolean mask over listings_df rows: True where the listing id also appears in reviews.
has_reviews = listings_df['id'].isin(common_ids)

# Partition the listings into the two complementary subsets.
listings_with_reviews = listings_df.loc[has_reviews]
listings_without_reviews = listings_df.loc[~has_reviews]

print("Listings without reviews:", len(listings_without_reviews))


Listings without reviews: 3034


In [6]:
# Group reviews by listing_id to count how many reviews each listing has
reviews_count = reviews_df.groupby('listing_id').size().reset_index(name='review_count')

# Calculate average number of reviews per listing
average_reviews = reviews_count['review_count'].mean()
print(f"Average number of reviews per listing: {average_reviews:.2f}")

# Find listing with maximum reviews
max_reviews = reviews_count.loc[reviews_count['review_count'].idxmax()]
print("\nListing with maximum reviews:")
print(max_reviews)

# Find listing with minimum reviews (excluding 0)
min_reviews = reviews_count.loc[reviews_count['review_count'].idxmin()]
print("\nListing with minimum reviews:")
print(min_reviews)

Average number of reviews per listing: 42.86

Listing with maximum reviews:
listing_id      7944819
review_count       1189
Name: 703, dtype: int64

Listing with minimum reviews:
listing_id      71867
review_count        1
Name: 6, dtype: int64
