In [1]:
# Import dependencies
import pandas as pd
import numpy as np

### Import and Display CSV

In [2]:
# Filepath (relative to the directory the notebook is run from)
csv_filepath =  "../../data/raw/calendar.csv"

# Read CSV
# The price columns hold currency strings such as "$469.00", and adjusted_price
# is empty for almost every row. Pinning both to str stops pandas from inferring
# a different type per chunk of this large file (the usual cause of a mixed-type
# DtypeWarning on read_csv). Missing values still come through as NaN.
calendar_csv = pd.read_csv(
    csv_filepath,
    dtype={"price": str, "adjusted_price": str},
)

# Display CSV (pandas truncates the rendering to the first and last rows)
calendar_csv

  calendar_csv = pd.read_csv(csv_filepath)


Unnamed: 0,listing_id,date,available,price,adjusted_price,minimum_nights,maximum_nights
0,1419,2025-03-03,f,$469.00,,28,730
1,1419,2025-03-04,f,$469.00,,28,730
2,1419,2025-03-05,f,$469.00,,28,730
3,1419,2025-03-06,f,$469.00,,28,730
4,1419,2025-03-07,f,$469.00,,28,730
...,...,...,...,...,...,...,...
7905912,1367402901303182875,2026-02-26,t,$110.00,,28,365
7905913,1367402901303182875,2026-02-27,t,$110.00,,28,365
7905914,1367402901303182875,2026-02-28,t,$110.00,,28,365
7905915,1367402901303182875,2026-03-01,t,$110.00,,28,365


In [3]:
# Summary statistics for every column, numeric and non-numeric alike
dataset_overview = calendar_csv.describe(include="all")
dataset_overview

Unnamed: 0,listing_id,date,available,price,adjusted_price,minimum_nights,maximum_nights
count,7905917.0,7905917,7905917,7905917,3283,7905917.0,7905917.0
unique,,367,2,646,11,,
top,,2025-09-02,f,$150.00,"$1,300.00",,
freq,,21660,4440532,273386,671,,
mean,6.162725e+17,,,,,25.37663,99730.03
std,5.341003e+17,,,,,40.89837,14591170.0
min,1419.0,,,,,1.0,1.0
25%,33851390.0,,,,,3.0,200.0
50%,7.727699e+17,,,,,28.0,365.0
75%,1.123414e+18,,,,,28.0,1125.0


In [4]:
# Inspect the column dtypes as loaded, before any conversion
calendar_csv.dtypes

listing_id         int64
date              object
available         object
price             object
adjusted_price    object
minimum_nights     int64
maximum_nights     int64
dtype: object

In [5]:
# Fix datatypes
# Every conversion is guarded by a dtype check so this cell can be re-run safely:
# once a column is numeric it is left alone. Without the guard, a second run
# would map the 0/1 flags to NaN and call .str on float columns, both of which
# raise.
if not pd.api.types.is_numeric_dtype(calendar_csv['available']):
    # 't' / 'f' flags -> 1 / 0
    calendar_csv['available'] = calendar_csv['available'].map({'t': 1, 'f': 0}).astype('int64')

for price_col in ['price', 'adjusted_price']:
    if not pd.api.types.is_numeric_dtype(calendar_csv[price_col]):
        # regex=False: '$' is a regex anchor, so strip it (and ',') as a literal
        # character instead of relying on the pandas-version-dependent default.
        calendar_csv[price_col] = (
            calendar_csv[price_col]
            .str.replace('$', '', regex=False)
            .str.replace(',', '', regex=False)
            .astype('float')
        )

In [6]:
# Re-check the dtypes after the conversion cell above
calendar_csv.dtypes

listing_id          int64
date               object
available           int64
price             float64
adjusted_price    float64
minimum_nights      int64
maximum_nights      int64
dtype: object

### Basic Analysis
- The Calendar dataset is remarkably clean: every column has the same number of rows except `adjusted_price`, which is expected, as most listings would only adjust their price on certain holidays, if at all.
    - The very low number of adjusted-price rows (3,283 out of 7,905,917) does, however, suggest that we may want to drop the column altogether for the sake of the ML algorithm, as the disparity is almost too large.

### **Aggregating listing-based features**

In [7]:
# Per-listing feature table: one row per listing_id.
# Each entry maps output column -> (source column, aggregation).
feature_spec = {
    'availability_rate': ('available', 'mean'),   # mean of the 0/1 available flag
    'mean_price': ('price', 'mean'),
    'min_nights_median': ('minimum_nights', 'median'),
    'max_nights_median': ('maximum_nights', 'median'),
    'first_date': ('date', 'min'),
    'last_date': ('date', 'max'),
}

per_listing = calendar_csv.groupby('listing_id')
listing_summary = per_listing.agg(**feature_spec).reset_index()

listing_summary

Unnamed: 0,listing_id,availability_rate,mean_price,min_nights_median,max_nights_median,first_date,last_date
0,1419,0.000000,469.0,28.0,730.0,2025-03-03,2026-03-02
1,8077,0.000000,75.0,180.0,365.0,2025-03-03,2026-03-02
2,26654,0.339726,114.0,28.0,1125.0,2025-03-03,2026-03-02
3,27423,0.000000,75.0,365.0,365.0,2025-03-03,2026-03-02
4,30931,0.000000,100.0,180.0,365.0,2025-03-03,2026-03-02
...,...,...,...,...,...,...,...
21655,1366954101885217722,0.158904,80.0,28.0,90.0,2025-03-03,2026-03-02
21656,1366970333908472070,0.986301,221.0,2.0,365.0,2025-03-03,2026-03-02
21657,1366999672978459092,0.997260,150.0,28.0,365.0,2025-03-04,2026-03-03
21658,1367290343089381102,0.997260,90.0,4.0,1125.0,2025-03-04,2026-03-03


### Calendar Data Analysis
- I just realized that this calendar data is not a historical record of Airbnb bookings but rather a one-year forecast, which makes the data inherently unstable, as it is bound to change throughout the year.
    - At most, this dataset can be used to portray Airbnb pricing strategies, allowing us to see which types of properties tend to benefit from increasing or decreasing prices through the year (presumably on holidays or in low seasons).
    - Alternatively, if later used as the target in a supervised model, it could support an analysis of which features allow Airbnb listings to be booked well in advance.