# Setup

In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import matplotlib.pyplot as plt

In [2]:
# --- Setup ---
# Input and output both live under <working directory>/data/processed;
# only the file name differs between the raw extract and the cleaned copy.
current_dir = os.getcwd()
input_path, output_path = (
    os.path.join(current_dir, 'data', 'processed', file_name)
    for file_name in ('hotelrec_2013_2017.csv.gz', 'hotelrec_2013_2017_cleaned.csv.gz')
)

In [None]:
# Read the whole gzip-compressed CSV into memory   [~40s]
# NOTE: no usecols/dtype restriction is applied here, so every column is
# loaded with pandas' inferred dtypes.
df = pd.read_csv(
    input_path,
    compression='gzip',
)

print(f"Original shape: {df.shape}")

Original shape: (32957031, 13)


# Processing

## Remove hotels with fewer than 5 reviews

In [None]:
# Tally reviews per hotel, then keep the ids of hotels with at least 5 reviews.
print("\nFiltering hotels with less than 5 reviews...")
hotel_review_counts = df['hotel_id'].value_counts()
valid_hotels = hotel_review_counts.loc[lambda counts: counts >= 5].index

In [None]:
# Keep only the reviews that belong to a hotel in valid_hotels.
df = df.loc[df['hotel_id'].isin(valid_hotels)]

# Displayed as the cell output: (rows, columns) remaining after the filter.
df.shape

## Feature Engineering - Add Season

In [4]:
def month_to_season(month):
    """Map a calendar month number to a season name.

    Parameters
    ----------
    month : int or float
        Month number in the range 1-12. Integral floats such as ``12.0`` are
        accepted, because a month column that contains missing values may be
        stored as float.

    Returns
    -------
    str or None
        'Winter' for 12, 1, 2; 'Spring' for 3-5; 'Summer' for 6-8;
        'Fall' for 9-11. ``None`` when ``month`` is missing (None/NaN) or is
        not a valid month number.
    """
    if month in (12, 1, 2):
        return 'Winter'
    if month in (3, 4, 5):
        return 'Spring'
    if month in (6, 7, 8):
        return 'Summer'
    if month in (9, 10, 11):
        return 'Fall'
    # Anything else (None, NaN, 0, 13, ...) is not a month. Returning None keeps
    # such rows from being silently counted as 'Fall'.
    return None

In [None]:
# Extract month from date
# The month is taken from characters 5-6 of the date string (the "MM" of a
# "YYYY-MM..." value -- assumes an ISO-like layout; confirm against the data).
# Vectorised string slicing replaces a per-row Python lambda; rows with a
# missing date keep a missing month.
df['month'] = pd.to_numeric(df['date'].str[5:7])

# na_action='ignore' skips missing months, so rows without a usable date get
# no season instead of being handed to month_to_season and given a made-up one.
df['season'] = df['month'].map(month_to_season, na_action='ignore')

In [6]:
# Persist the cleaned frame as a gzip-compressed CSV (row index is not written).
print("\nSaving cleaned dataset...")
df.to_csv(path_or_buf=output_path, compression='gzip', index=False)
print(f"Cleaned dataset saved to: {output_path}")


Saving cleaned dataset...
Cleaned dataset saved to: /root/cmpe256/cmpe256_hotel_recommendation_system/data/processed/hotelrec_2013_2017_cleaned.csv.gz


In [None]:
# Quick Check: preview the first five rows of the cleaned frame.
df.head(n=5)

In [None]:
# Final (rows, columns) of the cleaned frame, shown as the cell output.
df.shape