In [1]:
# LA Airbnb Reviews Analysis Notebook
# This notebook loads Airbnb data for Los Angeles, cleans and processes it,
# filters for English reviews, calculates seasonal distributions, business status,
# and prepares data for 2022 and 2024 for further analysis.

In [2]:
# Import necessary libraries
import ast
import os

import fasttext
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import seaborn as sns
from tqdm import tqdm

# Load raw data

In [4]:
# Read the gzip-compressed listings and reviews exports (in that order) into DataFrames.
listings_df, reviews_df = (
    pd.read_csv(csv_path, compression="gzip")
    for csv_path in ("../data/raw/listings.csv.gz", "../data/raw/reviews.csv.gz")
)

# Merge listings and reviews

In [5]:
# Attach listing attributes to every review row (left join keeps all reviews).
# Column names present in both inputs get the `_review` / `_listing` suffixes.
la_reviews = pd.merge(
    reviews_df,
    listings_df,
    how='left',
    left_on='listing_id',
    right_on='id',
    suffixes=('_review', '_listing'),
)

## Count total reviews per listing

In [6]:
# Number of review rows each listing has in the merged frame, broadcast back to every row.
listing_id_groups = la_reviews.groupby('listing_id')['listing_id']
la_reviews['listing_total_reviews'] = listing_id_groups.transform('count')

# Select relevant columns and add city/establishment type

In [7]:
# Columns the rest of the notebook relies on; selecting them raises KeyError if one is absent.
required_columns = [
    'listing_id', 'id_review', 'date', 'reviewer_id', 'comments',
    'name', 'property_type', 'room_type', 'neighbourhood_cleansed',
    'latitude', 'longitude', 'host_id', 'host_total_listings_count',
    'host_name', 'number_of_reviews', 'review_scores_rating',
    'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin',
    'review_scores_communication', 'review_scores_location', 'review_scores_value',
    'accommodates', 'bedrooms', 'beds', 'price'
]

# `reviewer_name` is part of the final export and `amenities` feeds `amenities_count`.
# Dropping them here leaves the export with an all-NaN name column and a constant 0 count,
# so keep them whenever the raw data provides them.
optional_columns = [
    col for col in ('reviewer_name', 'amenities') if col in la_reviews.columns
]

la_reviews_clean = la_reviews[required_columns + optional_columns].copy()

# Constant descriptors for this dataset.
la_reviews_clean['city'] = 'Los Angeles'
la_reviews_clean['establishment_type'] = 'Airbnb Accommodation'

# Convert date column and extract year

In [10]:
# Parse the review date once (unparseable values become NaT) and derive the
# dependent columns from that single parsed Series.
parsed_dates = pd.to_datetime(la_reviews_clean['date'], errors='coerce')
la_reviews_clean['date'] = parsed_dates
la_reviews_clean['visit_date'] = parsed_dates
la_reviews_clean['year'] = parsed_dates.dt.year

# Determine season from month

In [13]:
def get_season(date):
    """Map a date to its season name.

    Args:
        date: Any object exposing a ``month`` attribute (e.g. ``pd.Timestamp``),
            or a missing value.

    Returns:
        'Winter' for Dec-Feb, 'Spring' for Mar-May, 'Summer' for Jun-Aug,
        'Fall' for every other month, or ``None`` when ``date`` is missing.
    """
    if pd.isna(date):
        return None

    season_by_month = {
        12: 'Winter', 1: 'Winter', 2: 'Winter',
        3: 'Spring', 4: 'Spring', 5: 'Spring',
        6: 'Summer', 7: 'Summer', 8: 'Summer',
    }
    # Any month not listed above falls through to 'Fall'.
    return season_by_month.get(date.month, 'Fall')

# Label every review with the season of its date (None where the date is missing).
review_dates = la_reviews_clean['date']
la_reviews_clean['season'] = review_dates.apply(get_season)


# Load FastText model for language detection

In [16]:
# Load the fastText model used for language detection. "lid.176.ftz" is a relative
# path, so the file is looked up in the current working directory.
model = fasttext.load_model("lid.176.ftz")

def detect_language(text):
    """Predict the language of ``text`` with the module-level ``model``.

    Args:
        text: The review text. Anything that is not a non-blank string is
            treated as "no text".

    Returns:
        The top predicted label with its ``__label__`` prefix removed, or
        ``None`` when there is no usable text.
    """
    has_text = isinstance(text, str) and bool(text.strip())
    if not has_text:
        return None

    # Newlines are flattened to spaces before the text is handed to the model.
    single_line = text.replace("\n", " ")
    predictions = model.predict(single_line)
    top_label = predictions[0][0]
    return top_label.replace("__label__", "")



# Filter English reviews by year

In [17]:
def filter_year_language(df, year, model, batch_size=50000):
    """Return the English-language reviews of ``df`` written in ``year``.

    Language detection uses the ``model`` passed in, so the function does not
    depend on a global model object.

    Args:
        df: DataFrame with a datetime ``date`` column and a ``comments`` column.
        year: Calendar year to keep.
        model: Object whose ``predict(text)`` returns a ``(labels, ...)`` pair
            where ``labels[0]`` looks like ``"__label__en"``.
        batch_size: Number of rows processed per progress-bar step.

    Returns:
        A copy of the matching rows with an added ``language`` column; only
        rows whose detected language is ``'en'`` are kept. ``df`` itself is
        not modified.
    """
    def _predict_language(text):
        # Missing or blank comments have no detectable language.
        if not isinstance(text, str) or not text.strip():
            return None
        # Flatten newlines before prediction, as a single line of text is passed to the model.
        top_label = model.predict(text.replace("\n", " "))[0][0]
        return top_label.replace("__label__", "")

    df_year = df[df['date'].dt.year == year].copy()

    languages = []
    # Batching only controls progress-bar granularity; rows are still predicted one by one.
    for start in tqdm(range(0, len(df_year), batch_size), desc=f"Detecting language for {year}"):
        batch = df_year.iloc[start:start + batch_size]
        languages.extend(batch['comments'].apply(_predict_language).tolist())

    df_year['language'] = languages
    return df_year[df_year['language'] == 'en'].copy()


# Build the English-only review frames for the two study years (2022 first, then 2024).
la_reviews_2022, la_reviews_2024 = (
    filter_year_language(la_reviews_clean, study_year, model)
    for study_year in (2022, 2024)
)

Detecting language for 2022: 100%|█████████████████████████████████████████████████████████████████████| 5/5 [00:16<00:00,  3.23s/it]
Detecting language for 2024: 100%|█████████████████████████████████████████████████████████████████████| 7/7 [00:24<00:00,  3.43s/it]


# Add month and season columns

In [18]:
# For each yearly frame, store the review month and derive the season from the
# first day of that month in the frame's own year.
for frame_year, frame in ((2022, la_reviews_2022), (2024, la_reviews_2024)):
    frame['month'] = frame['date'].dt.month
    frame['season'] = frame['month'].apply(
        lambda month_number: get_season(
            pd.Timestamp(year=frame_year, month=month_number, day=1)
        )
    )

# Calculate total reviews per reviewer

In [20]:
def _with_reviewer_totals(reviews):
    """Return ``reviews`` with a ``reviewer_total_reviews`` column added.

    The count is the number of ``id_review`` values each ``reviewer_id`` has
    within ``reviews`` itself (i.e. within that yearly frame only). The result
    is a new DataFrame produced by a left merge.
    """
    # Drop a stale column first so re-running the cell does not produce
    # `reviewer_total_reviews_x` / `_y` duplicates from the merge.
    base = reviews.drop(columns='reviewer_total_reviews', errors='ignore')
    reviewer_counts = (
        base.groupby('reviewer_id')['id_review']
        .count()
        .rename('reviewer_total_reviews')
        .reset_index()
    )
    return base.merge(reviewer_counts, on='reviewer_id', how='left')


la_reviews_2022 = _with_reviewer_totals(la_reviews_2022)
la_reviews_2024 = _with_reviewer_totals(la_reviews_2024)

## Quick check

In [21]:
# Spot-check the first rows of the reviewer columns in the 2022 frame.
print("Example 2022:")
display(la_reviews_2022.head()[['reviewer_id', 'reviewer_total_reviews']])

Example 2022:


Unnamed: 0,reviewer_id,reviewer_total_reviews
0,389135981,1
1,27944842,1
2,33880315,1
3,51893738,1
4,138013883,2


In [22]:
# Spot-check the first rows of the reviewer columns in the 2024 frame.
print("Example 2024:")
display(la_reviews_2024.head()[['reviewer_id', 'reviewer_total_reviews']])

Example 2024:


Unnamed: 0,reviewer_id,reviewer_total_reviews
0,201664381,1
1,226451147,1
2,68037230,3
3,5489381,1
4,93236650,1


# Determine business status

In [23]:
# Latest review date per listing, taken over the 2022 and 2024 frames together.
combined = pd.concat(
    [frame[['listing_id', 'date']] for frame in (la_reviews_2022, la_reviews_2024)],
    ignore_index=True,
)
last_reviews = (
    combined.groupby('listing_id')['date']
    .max()
    .reset_index()
    .rename(columns={'date': 'last_review'})
)

# Build the listing -> last review lookup once and reuse it for both frames.
last_review_by_listing = last_reviews.set_index('listing_id')['last_review']
shutter_cutoff = pd.Timestamp('2022-01-01')

# NOTE(review): `last_review` is derived only from the rows of these two frames.
# If they contain just 2022/2024 reviews, no value can be missing or earlier than
# the cutoff, so every row would be labelled 'Active' - confirm the intended rule.
for df in [la_reviews_2022, la_reviews_2024]:
    df['last_review'] = df['listing_id'].map(last_review_by_listing)
    df['business_status'] = np.where(
        df['last_review'].isna(),
        'No Reviews',
        np.where(df['last_review'] < shutter_cutoff, 'Shuttered', 'Active'),
    )

# Prepare final features and save

In [26]:
def _count_amenities(raw):
    """Return how many entries the serialized amenities list ``raw`` holds (0 when missing)."""
    if pd.isna(raw):
        return 0
    # literal_eval parses Python literals only, so text from the CSV is never
    # executed as code (unlike eval).
    return len(ast.literal_eval(raw))


for df in [la_reviews_2022, la_reviews_2024]:
    # Strip "$" and "," so the price text converts to float; the raw-string
    # pattern avoids the invalid "\$" escape of a normal string literal.
    df['price_num'] = df['price'].replace(r'[\$,]', '', regex=True).astype(float)
    # If there is no amenities column there is nothing to count, so default to 0.
    if 'amenities' in df.columns:
        df['amenities_count'] = df['amenities'].apply(_count_amenities)
    else:
        df['amenities_count'] = 0

## Define the final columns

In [27]:
# Column order of the exported files. Grouped by theme for readability; the
# order itself is unchanged.
final_columns = [
    # review
    'listing_id', 'id_review', 'date', 'reviewer_id', 'reviewer_name', 'comments',
    'language', 'year', 'season', 'reviewer_total_reviews',
    # listing
    'name', 'property_type', 'room_type', 'neighbourhood_cleansed',
    'latitude', 'longitude',
    # host
    'host_id', 'host_total_listings_count', 'host_name',
    # review scores
    'number_of_reviews', 'review_scores_rating', 'review_scores_accuracy',
    'review_scores_cleanliness', 'review_scores_checkin',
    'review_scores_communication', 'review_scores_location', 'review_scores_value',
    # capacity, price and derived features
    'accommodates', 'bedrooms', 'beds', 'price_num', 'amenities_count',
    'business_status', 'establishment_type', 'city',
]


## Ensure all columns exist

In [28]:
# Any expected column a frame lacks is created and filled with NaN, so the
# final selection cannot fail on a missing name.
for df in [la_reviews_2022, la_reviews_2024]:
    absent_columns = [name for name in final_columns if name not in df.columns]
    for name in absent_columns:
        df[name] = np.nan

## Select only final columns

In [29]:
# Independent copies restricted to the export columns, in export order.
la_reviews_2022_final = la_reviews_2022.loc[:, final_columns].copy()
la_reviews_2024_final = la_reviews_2024.loc[:, final_columns].copy()

## Save as Parquet and CSV

In [30]:
# Create the output folder if needed; otherwise the writers below fail on a
# fresh checkout where ../data/processed does not exist yet.
os.makedirs("../data/processed", exist_ok=True)

# Write both yearly frames as Parquet first, then as CSV, without the index.
for frame, parquet_path in (
    (la_reviews_2022_final, "../data/processed/la_airbnb_reviews_2022_clean.parquet"),
    (la_reviews_2024_final, "../data/processed/la_airbnb_reviews_2024_clean.parquet"),
):
    frame.to_parquet(parquet_path, index=False)

for frame, csv_path in (
    (la_reviews_2022_final, "../data/processed/la_airbnb_reviews_2022_clean.csv"),
    (la_reviews_2024_final, "../data/processed/la_airbnb_reviews_2024_clean.csv"),
):
    frame.to_csv(csv_path, index=False)

print("✅ Files saved successfully")

✅ Files saved successfully
