In [None]:
import pandas as pd

reviews = pd.read_csv("reviews.csv")
model_df = pd.read_csv("model_ready_dataset.csv")


In [None]:
# Standardise dates and compute true monthly review counts for each hotel
reviews["ReviewDate"] = pd.to_datetime(reviews["ReviewDate"])
reviews["Month"] = reviews["ReviewDate"].dt.to_period("M").astype(str)

true_counts = (
    reviews
    .groupby(["HotelID", "Month"])
    .size()
    .reset_index(name="TrueReviewCount")
)


  reviews["ReviewDate"] = pd.to_datetime(reviews["ReviewDate"])


In [None]:
# Merge operational data with true review counts to assess alignment
merged_check = model_df.merge(
    true_counts,
    on=["HotelID", "Month"],
    how="left"
)


In [None]:
# Preview reported versus actual review counts
merged_check[["ReviewCount", "TrueReviewCount"]].head(10)


Unnamed: 0,ReviewCount,TrueReviewCount
0,3,3.0
1,5,5.0
2,2,3.0
3,2,2.0
4,1,2.0
5,2,4.0
6,4,2.0
7,3,1.0
8,7,1.0
9,0,2.0


In [None]:
# Identify mismatches to evaluate data consistency
mismatch = merged_check[
    merged_check["ReviewCount"] != merged_check["TrueReviewCount"]
]

len(mismatch), merged_check.shape[0]


(805, 962)

In [None]:
# Drop old ReviewCount
model_df_fixed = model_df.drop(columns=["ReviewCount"], errors="ignore")

# Merge correct counts
model_df_fixed = model_df_fixed.merge(
    true_counts,
    on=["HotelID", "Month"],
    how="left"
)

# Fill months with no reviews as 0
model_df_fixed["TrueReviewCount"] = model_df_fixed["TrueReviewCount"].fillna(0)

# Rename to ReviewCount for consistency
model_df_fixed = model_df_fixed.rename(columns={
    "TrueReviewCount": "ReviewCount"
})


In [None]:
check = model_df_fixed.merge(
    true_counts,
    on=["HotelID", "Month"],
    how="left",
    suffixes=("", "_check")
)

(check["ReviewCount"] == check["TrueReviewCount"]).mean()


np.float64(0.8565488565488566)

In [None]:
# Reconstruct ReviewCount using aggregated ground-truth values
# Drop old ReviewCount
model_df_fixed["Month"].unique()[:5]
true_counts["Month"].unique()[:5]


array(['2022-01', '2022-02', '2022-03', '2022-04', '2022-05'],
      dtype=object)

In [None]:
model_df_fixed["Month"] = pd.to_datetime(model_df_fixed["Month"]).dt.to_period("M").astype(str)
true_counts["Month"] = pd.to_datetime(true_counts["Month"]).dt.to_period("M").astype(str)


In [None]:
model_df_fixed.duplicated(subset=["HotelID", "Month"]).sum()


np.int64(0)

In [None]:
# Examine remaining mismatches for diagnostic insight
bad = check[check["ReviewCount"] != check["TrueReviewCount"]]
bad[["HotelID", "Month", "ReviewCount", "TrueReviewCount"]].head(20)


Unnamed: 0,HotelID,Month,ReviewCount,TrueReviewCount
12,1,2023-01,0.0,
13,1,2023-02,0.0,
15,1,2023-04,0.0,
21,1,2023-10,0.0,
26,2,2022-02,0.0,
28,2,2022-04,0.0,
33,2,2022-09,0.0,
49,3,2022-01,0.0,
62,3,2023-02,0.0,
68,3,2023-08,0.0,


In [None]:
# Reconfirm mismatch sample for verification
bad[["HotelID", "Month", "ReviewCount", "TrueReviewCount"]].head(20)


Unnamed: 0,HotelID,Month,ReviewCount,TrueReviewCount
12,1,2023-01,0.0,
13,1,2023-02,0.0,
15,1,2023-04,0.0,
21,1,2023-10,0.0,
26,2,2022-02,0.0,
28,2,2022-04,0.0,
33,2,2022-09,0.0,
49,3,2022-01,0.0,
62,3,2023-02,0.0,
68,3,2023-08,0.0,


In [None]:
# Hotel-months in reviews but NOT in model data
review_keys = set(zip(true_counts["HotelID"], true_counts["Month"]))
model_keys = set(zip(model_df_fixed["HotelID"], model_df_fixed["Month"]))

len(review_keys - model_keys)


22

In [None]:
# Inspect missing operational keys to understand coverage gaps
list(review_keys - model_keys)[:10]


[(9, '2024-01'),
 (10, '2022-01'),
 (33, '2024-01'),
 (2, '2024-01'),
 (15, '2024-01'),
 (37, '2022-10'),
 (4, '2024-01'),
 (28, '2024-01'),
 (4, '2023-05'),
 (19, '2024-01')]