In [10]:
import pandas as pd

# Read the listings export into a DataFrame. The file is decoded as latin1,
# which is the encoding this notebook was written against.
file_path = 'listings.csv'
data = pd.read_csv(filepath_or_buffer=file_path, encoding='latin1')

# Cell output: a preview of the first rows together with the column index.
(data.head(), data.columns)

(      id                                              name  host_id host_name  \
 0   3781                         HARBORSIDE-Walk to subway     4804     Frank   
 1   5506    ** Fort Hill Inn Private! Minutes to center!**     8229     Terry   
 2   6695     Fort Hill Inn *Sunny* 1 bedroom, condo duplex     8229     Terry   
 3   8789               Curved Glass Studio/1bd facing Park    26988      Anne   
 4  10811  Back Bay Apt Studio-3 blocks to Pru center & "T"    38997  Michelle   
 
   neighbourhood  latitude  longitude        room_type  price  minimum_nights  \
 0   East Boston  42.36413  -71.02991  Entire home/apt  125.0              29   
 1       Roxbury  42.32844  -71.09581  Entire home/apt  139.0               3   
 2       Roxbury  42.32802  -71.09387  Entire home/apt  179.0               3   
 3   Beacon Hill  42.35867  -71.06307  Entire home/apt   92.0              91   
 4      Back Bay  42.35173  -71.08685  Entire home/apt  130.0              91   
 
    number_of_revi

In [11]:
def stratified_group_sampling(data, group_col, stratify_col, bins, samples_per_bin, random_state=42):
    """Down-sample ``data`` to at most ``samples_per_bin`` rows per (group, bin).

    Rows are grouped by ``group_col``. Inside each group, ``stratify_col`` is
    discretised with ``pd.cut`` and every resulting bin is capped at
    ``samples_per_bin`` rows; bins holding fewer rows are kept whole.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. It is not modified.
    group_col : str
        Column whose distinct values define the groups.
    stratify_col : str
        Numeric column that is binned inside each group.
    bins : int or sequence of scalars
        Passed straight to ``pd.cut``. The cut is performed per group, so an
        integer value produces group-specific bin edges.
    samples_per_bin : int
        Maximum number of rows retained for each (group, bin) pair.
    random_state : int, optional
        Seed handed to ``DataFrame.sample``. Defaults to 42, the value that
        was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        The retained rows under their original index labels, ordered by group
        and then by bin, with an extra ``'bin'`` column holding the bin code
        from ``pd.cut(..., labels=False)``. An empty frame with the same
        columns is returned when no row lands in any bin.

    Notes
    -----
    Rows whose ``stratify_col`` value is missing or falls outside ``bins``
    receive a NaN bin code and are left out, because ``groupby`` skips NaN
    keys by default. With the ``pd.cut`` defaults the intervals are
    right-closed, so a value equal to the lowest edge is also left out.
    """
    sampled_parts = []
    for _, group_data in data.groupby(group_col):
        # assign() builds a new frame, so the groupby slice is never written
        # to (the original in-place column assignment was a chained write).
        binned = group_data.assign(
            bin=pd.cut(group_data[stratify_col], bins=bins, labels=False)
        )

        # groupby only yields non-empty groups, so no emptiness check is needed.
        for _, bin_data in binned.groupby('bin'):
            if len(bin_data) > samples_per_bin:
                bin_data = bin_data.sample(n=samples_per_bin, random_state=random_state)
            sampled_parts.append(bin_data)

    if not sampled_parts:
        # pd.concat raises ValueError on an empty list; hand back an empty
        # frame that still carries the 'bin' column of a normal result.
        return data.iloc[0:0].assign(bin=float('nan'))

    # Combine all sampled pieces into one frame.
    return pd.concat(sampled_parts)

# Define bins for price and the number of samples to retain per bin.
# The top edge is open-ended instead of data['price'].max(): pd.cut requires
# strictly increasing edges, so using the observed maximum fails whenever that
# maximum is <= 2000. For larger maxima the resulting bins are identical.
price_bins = [0, 50, 100, 200, 500, 1000, 2000, float('inf')]
samples_per_price_bin = 10

# Apply the stratified group sampling: group by neighbourhood, stratify by price.
stratified_sampled_data = stratified_group_sampling(
    data,
    group_col='neighbourhood',
    stratify_col='price',
    bins=price_bins,
    samples_per_bin=samples_per_price_bin,
)

# Display the size and structure of the reduced dataset
stratified_sampled_data.shape, stratified_sampled_data.head()

((910, 18),
                       id                                               name  \
 2054            53807942                    Twin Bedroom B in #835: Allston   
 3306  967311247275960043                   Full Bedroom A in #1419: Allston   
 3353  972322923270523366               AL/R3 Boston's New Room Central Spot   
 180              4909590  ALLSTON LOADED STUDIO: Steps to BU, BC & hospi...   
 740             21038943                                        Single room   
 
         host_id host_name neighbourhood   latitude  longitude  \
 2054  297860058    Sophia       Allston  42.354900 -71.130240   
 3306  297860058    Sophia       Allston  42.356780 -71.132080   
 3353  412452734       Dan       Allston  42.350072 -71.134742   
 180    25279966        Vj       Allston  42.348380 -71.131630   
 740    46580723      Zong       Allston  42.359500 -71.129470   
 
             room_type  price  minimum_nights  number_of_reviews last_review  \
 2054     Private room   44.

In [12]:
# Persist the sampled rows as CSV; index=False leaves the row index out of the file.
stratified_sampled_data.to_csv(
    path_or_buf='sampled_listings.csv',
    index=False,
)