In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Display settings
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

In [2]:
# Adjust the path if needed

df = pd.read_csv('data/raw2/hotelrec_full_combined.csv')

# Quick look
print(df.shape)
df.head()

(50264531, 13)


Unnamed: 0,hotel_id,hotel_name,hotel_location,author,date,rating,sentiment_score,sleep quality,value,rooms,service,cleanliness,location
0,1121769,Hotel Baltic,Giulianova Province of Teramo Abruzzo,violettaf340,2019-01,5.0,0.341327,,,,,,
1,1121769,Hotel Baltic,Giulianova Province of Teramo Abruzzo,Lagaiuzza,2016-01,5.0,0.272183,,,,,,
2,1121769,Hotel Baltic,Giulianova Province of Teramo Abruzzo,ashleyn763,2014-10,5.0,0.475,,5.0,,5.0,,5.0
3,1121769,Hotel Baltic,Giulianova Province of Teramo Abruzzo,DavideMauro,2014-08,5.0,0.623636,5.0,,,5.0,5.0,
4,1121769,Hotel Baltic,Giulianova Province of Teramo Abruzzo,Alemma11,2013-08,4.0,0.218607,3.0,4.0,4.0,5.0,3.0,4.0


In [3]:
# Overview
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50264531 entries, 0 to 50264530
Data columns (total 13 columns):
 #   Column           Dtype  
---  ------           -----  
 0   hotel_id         int64  
 1   hotel_name       object 
 2   hotel_location   object 
 3   author           object 
 4   date             object 
 5   rating           float64
 6   sentiment_score  float64
 7   sleep quality    float64
 8   value            float64
 9   rooms            float64
 10  service          float64
 11  cleanliness      float64
 12  location         float64
dtypes: float64(8), int64(1), object(4)
memory usage: 4.9+ GB


In [4]:
missing_values = df.isnull().sum()
missing_percent = (missing_values / len(df)) * 100
missing_df = pd.DataFrame({
    'Missing Count': missing_values,
    'Missing %': missing_percent
})
print(missing_df[missing_df['Missing Count'] > 0])

                Missing Count  Missing %
hotel_location             20   0.000040
author                  23889   0.047527
sleep quality        27672279  55.053292
value                23877794  47.504261
rooms                24847780  49.434023
service              14691088  29.227544
cleanliness          23858431  47.465739
location             24743058  49.225682


In [5]:
# Drop rows where either 'author' or 'hotel_location' is NaN
df_cleaned = df.dropna(subset=['author', 'hotel_location'])

In [6]:
print(f"Original shape: {df.shape}")
print(f"New shape after dropping rows with null author or hotel_location: {df_cleaned.shape}")

Original shape: (50264531, 13)
New shape after dropping rows with null author or hotel_location: (50240622, 13)


In [None]:
output_path = 'data/raw2/hotelrec_final.csv'
chunk_size = 500_000
write_header = True

for i in range(0, len(df_cleaned), chunk_size):
    chunk = df_cleaned.iloc[i:i+chunk_size].reset_index(drop=True)
    chunk.to_csv(output_path, mode='a', index=False, header=write_header)
    write_header = False
    print(f"Saved chunk {i//chunk_size + 1}: {chunk.shape[0]} rows") # 3m 14s

Saved chunk 1: 500000 rows
Saved chunk 2: 500000 rows
Saved chunk 3: 500000 rows
Saved chunk 4: 500000 rows
Saved chunk 5: 500000 rows
Saved chunk 6: 500000 rows
Saved chunk 7: 500000 rows
Saved chunk 8: 500000 rows
Saved chunk 9: 500000 rows
Saved chunk 10: 500000 rows
Saved chunk 11: 500000 rows
Saved chunk 12: 500000 rows
Saved chunk 13: 500000 rows
Saved chunk 14: 500000 rows
Saved chunk 15: 500000 rows
Saved chunk 16: 500000 rows
Saved chunk 17: 500000 rows
Saved chunk 18: 500000 rows
Saved chunk 19: 500000 rows
Saved chunk 20: 500000 rows
Saved chunk 21: 500000 rows
Saved chunk 22: 500000 rows
Saved chunk 23: 500000 rows
Saved chunk 24: 500000 rows
Saved chunk 25: 500000 rows
Saved chunk 26: 500000 rows
Saved chunk 27: 500000 rows
Saved chunk 28: 500000 rows
Saved chunk 29: 500000 rows
Saved chunk 30: 500000 rows
Saved chunk 31: 500000 rows
Saved chunk 32: 500000 rows
Saved chunk 33: 500000 rows
Saved chunk 34: 500000 rows
Saved chunk 35: 500000 rows
Saved chunk 36: 500000 rows
S

In [8]:
df_cleaned.head

<bound method NDFrame.head of           hotel_id                            hotel_name  \
0          1121769                          Hotel Baltic   
1          1121769                          Hotel Baltic   
2          1121769                          Hotel Baltic   
3          1121769                          Hotel Baltic   
4          1121769                          Hotel Baltic   
...            ...                                   ...   
50264526    240221  Chart House Suites On Clearwater Bay   
50264527    240221  Chart House Suites On Clearwater Bay   
50264528    240221  Chart House Suites On Clearwater Bay   
50264529    240221  Chart House Suites On Clearwater Bay   
50264530    240221  Chart House Suites On Clearwater Bay   

                                 hotel_location             author     date  \
0         Giulianova Province of Teramo Abruzzo       violettaf340  2019-01   
1         Giulianova Province of Teramo Abruzzo          Lagaiuzza  2016-01   
2         Gi

In [None]:
# Compress
df.to_csv('data/raw2/hotelrec_final.csv.gz', index=False, compression='gzip') #7m 39s