In [2]:
# Chicago Airbnb Reviews Analysis Notebook
# This notebook loads Airbnb data for Chicago, cleans and processes it,
# filters for English reviews, calculates seasonal distributions, business status,
# and prepares data for 2022 and 2024 for further analysis.

# Import necessary libraries

In [3]:
import numpy as np
import pandas as pd
import fasttext
from tqdm import tqdm
import os

# Load raw data

In [4]:
listings_path = "../data/raw/chicago_listings.csv.gz"
reviews_path  = "../data/raw/chicago_reviews.csv.gz"

listings_df = pd.read_csv(listings_path, compression="gzip")
reviews_df  = pd.read_csv(reviews_path, compression="gzip")

# Merge listings and reviews

In [5]:
chicago_reviews = reviews_df.merge(
    listings_df,
    left_on='listing_id',
    right_on='id',
    how='left',
    suffixes=('_review', '_listing')
)

# Count total reviews per listing

In [6]:
chicago_reviews['listing_total_reviews'] = chicago_reviews.groupby('listing_id')['listing_id'].transform('count')

# Select relevant columns and add city/establishment type

In [7]:
chicago_reviews_clean = chicago_reviews[[
    'listing_id', 'id_review', 'date', 'reviewer_id', 'comments',
    'name', 'property_type', 'room_type', 'neighbourhood_cleansed',
    'latitude', 'longitude', 'host_id', 'host_total_listings_count',
    'host_name', 'number_of_reviews', 'review_scores_rating',
    'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin',
    'review_scores_communication', 'review_scores_location', 'review_scores_value',
    'accommodates', 'bedrooms', 'beds', 'price'
]].copy()

chicago_reviews_clean['city'] = 'Chicago'
chicago_reviews_clean['establishment_type'] = 'Airbnb Accommodation'

# Convert date column and extract year

In [8]:
chicago_reviews_clean['date'] = pd.to_datetime(chicago_reviews_clean['date'], errors='coerce')
chicago_reviews_clean['visit_date'] = chicago_reviews_clean['date']
chicago_reviews_clean['year'] = chicago_reviews_clean['date'].dt.year

# Determine season from month

In [9]:
def get_season(date):
    if pd.isna(date):
        return None
    month = date.month
    if month in [12, 1, 2]:
        return 'Winter'
    elif month in [3, 4, 5]:
        return 'Spring'
    elif month in [6, 7, 8]:
        return 'Summer'
    else:
        return 'Fall'

chicago_reviews_clean['season'] = chicago_reviews_clean['date'].apply(get_season)

# FastText for English reviews

In [10]:
model_path = "../data/processed/models/lid.176.ftz"
model = fasttext.load_model(model_path)

def detect_language(text):
    if not isinstance(text, str) or not text.strip():
        return None
    label = model.predict(text.replace("\n"," "))[0][0]
    return label.replace("__label__", "")

def filter_year_language(df, year, model):
    df_year = df[df['date'].dt.year==year].copy()
    batch_size = 50000
    languages = []
    for i in tqdm(range(0, len(df_year), batch_size), desc=f"Detecting language {year}"):
        batch = df_year.iloc[i:i+batch_size]
        languages.extend(batch['comments'].apply(detect_language).tolist())
    df_year['language'] = languages
    return df_year[df_year['language']=='en'].copy()

chicago_reviews_2022 = filter_year_language(chicago_reviews_clean, 2022, model)
chicago_reviews_2024 = filter_year_language(chicago_reviews_clean, 2024, model)

Detecting language 2022: 100%|█████████████████████████████████████████████████████████████████████████| 2/2 [00:03<00:00,  1.94s/it]
Detecting language 2024: 100%|█████████████████████████████████████████████████████████████████████████| 3/3 [00:07<00:00,  2.35s/it]


# Add month and season for 2022/2024

In [11]:
chicago_reviews_2022['month'] = chicago_reviews_2022['date'].dt.month
chicago_reviews_2022['season'] = chicago_reviews_2022['month'].apply(lambda x: get_season(pd.Timestamp(month=x, day=1, year=2022)))


chicago_reviews_2024['month'] = chicago_reviews_2024['date'].dt.month
chicago_reviews_2024['season'] = chicago_reviews_2024['month'].apply(lambda x: get_season(pd.Timestamp(month=x, day=1, year=2024)))

# Reviewer total reviews

In [16]:
# Ensure review ID column name is consistent
for df in [chicago_reviews_2022, chicago_reviews_2024]:
    if 'review_id' in df.columns:
        df.rename(columns={'review_id': 'id_review'}, inplace=True)
    if 'id' in df.columns and 'id_review' not in df.columns:
        df.rename(columns={'id': 'id_review'}, inplace=True)

In [19]:
for df, year in [(chicago_reviews_2022, 2022), (chicago_reviews_2024, 2024)]:

    # Remove old column if present
    if 'reviewer_total_reviews' in df.columns:
        df.drop(columns=['reviewer_total_reviews'], inplace=True)

    # Calculate new counts
    reviewer_counts = df.groupby('reviewer_id')['id_review'].count().reset_index()
    reviewer_counts.rename(columns={'id_review': 'reviewer_total_reviews'}, inplace=True)

    # Merge
    merged_df = df.merge(reviewer_counts, on='reviewer_id', how='left')

    # Assign back
    if year == 2022:
        chicago_reviews_2022 = merged_df
    else:
        chicago_reviews_2024 = merged_df

In [20]:
print("Example 2022:")
display(chicago_reviews_2022[['reviewer_id', 'reviewer_total_reviews']].head())

Example 2022:


Unnamed: 0,reviewer_id,reviewer_total_reviews
0,153590859,1
1,103271804,2
2,138753984,1
3,54621446,1
4,103271804,2


# Business status

In [21]:
combined = pd.concat([chicago_reviews_2022[['listing_id','date']], chicago_reviews_2024[['listing_id','date']]], ignore_index=True)
last_reviews = combined.groupby('listing_id')['date'].max().reset_index().rename(columns={'date':'last_review'})

for df in [chicago_reviews_2022, chicago_reviews_2024]:
    df['last_review'] = df['listing_id'].map(last_reviews.set_index('listing_id')['last_review'])

chicago_reviews_2022['business_status'] = np.where(
    chicago_reviews_2022['last_review'].isna(), 'No Reviews',
    np.where(chicago_reviews_2022['last_review'] < pd.Timestamp('2022-01-01'), 'Shuttered', 'Active')
)
chicago_reviews_2024['business_status'] = np.where(
    chicago_reviews_2024['last_review'].isna(), 'No Reviews',
    np.where(chicago_reviews_2024['last_review'] < pd.Timestamp('2022-01-01'), 'Shuttered', 'Active')
)


# Determine business status

In [25]:
combined = pd.concat([chicago_reviews_2022[['listing_id','date']], chicago_reviews_2022[['listing_id','date']]], ignore_index=True)
last_reviews = combined.groupby('listing_id')['date'].max().reset_index().rename(columns={'date':'last_review'})


for df in [chicago_reviews_2022, chicago_reviews_2024]:
    df['last_review'] = df['listing_id'].map(last_reviews.set_index('listing_id')['last_review'])


chicago_reviews_2022['business_status'] = np.where(
    chicago_reviews_2022['last_review'].isna(), 'No Reviews',
    np.where(chicago_reviews_2022['last_review'] < pd.Timestamp('2022-01-01'), 'Shuttered', 'Active')
)


chicago_reviews_2024['business_status'] = np.where(
    chicago_reviews_2024['last_review'].isna(), 'No Reviews',
    np.where(chicago_reviews_2024['last_review'] < pd.Timestamp('2022-01-01'), 'Shuttered', 'Active')
)


# Prepare final features and save

In [26]:
for df in [chicago_reviews_2022, chicago_reviews_2024]:
    df['price_num'] = df['price'].replace('[\$,]', '', regex=True).astype(float)
    # if there are no amenities columns, create an empty list
    if 'amenities' in df.columns:
        df['amenities_count'] = df['amenities'].apply(lambda x: len(eval(x)) if pd.notna(x) else 0)
    else:
        df['amenities_count'] = 0

# Final_columns

In [27]:
final_columns = [
    'listing_id', 'id_review', 'date', 'reviewer_id', 'reviewer_name', 'comments',
    'language','year','season','reviewer_total_reviews','name','property_type',
    'room_type','neighbourhood_cleansed','latitude','longitude','host_id',
    'host_total_listings_count','host_name','number_of_reviews','review_scores_rating',
    'review_scores_accuracy','review_scores_cleanliness','review_scores_checkin',
    'review_scores_communication','review_scores_location','review_scores_value',
    'accommodates','bedrooms','beds','price_num','amenities_count','business_status',
    'establishment_type','city'
]


## Ensure all columns exist

In [28]:
for df in [chicago_reviews_2022, chicago_reviews_2024]:
    for col in final_columns:
        if col not in df.columns:
            df[col] = np.nan

## Select only final columns

In [29]:
chicago_reviews_2022_final = chicago_reviews_2022_final[final_columns].copy()
chicago_reviews_2024_final = chicago_reviews_2024_final[final_columns].copy()

## Save as Parquet and CSV

In [30]:
chicago_reviews_2022_final.to_parquet("../data/processed/chicago_reviews_2022_final.parquet", index=False)
chicago_reviews_2024_final.to_parquet("../data/processed/chicago_reviews_2024_final.parquet", index=False)
chicago_reviews_2022_final.to_csv("../data/processed/chicago_reviews_2022_final.csv", index=False)
chicago_reviews_2024_final.to_csv("../data/processed/chicago_reviews_2024_final.csv", index=False)

print("✅ Files saved successfully")

✅ Files saved successfully


In [None]:
import pandas as pd

# load CSV 
chicago_reviews_2022_final = pd.read_csv("../data/processed/chicago_reviews_2022_final.csv")
chicago_reviews_2024_final = pd.read_csv("../data/processed/chicago_reviews_2024_final.csv")

In [31]:
chicago_reviews_2024_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101672 entries, 0 to 101671
Data columns (total 35 columns):
 #   Column                       Non-Null Count   Dtype         
---  ------                       --------------   -----         
 0   listing_id                   101672 non-null  int64         
 1   id_review                    101672 non-null  int64         
 2   date                         101672 non-null  datetime64[ns]
 3   reviewer_id                  101672 non-null  int64         
 4   reviewer_name                0 non-null       float64       
 5   comments                     101672 non-null  object        
 6   language                     101672 non-null  object        
 7   year                         101672 non-null  int32         
 8   season                       101672 non-null  object        
 9   reviewer_total_reviews       101672 non-null  int64         
 10  name                         101672 non-null  object        
 11  property_type             

In [32]:
chicago_reviews_2022_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62396 entries, 0 to 62395
Data columns (total 35 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   listing_id                   62396 non-null  int64         
 1   id_review                    62396 non-null  int64         
 2   date                         62396 non-null  datetime64[ns]
 3   reviewer_id                  62396 non-null  int64         
 4   reviewer_name                0 non-null      float64       
 5   comments                     62396 non-null  object        
 6   language                     62396 non-null  object        
 7   year                         62396 non-null  int32         
 8   season                       62396 non-null  object        
 9   reviewer_total_reviews       62396 non-null  int64         
 10  name                         62396 non-null  object        
 11  property_type                62396 non-nu