# L3 Data Visualization Notebook

This notebook contains the complete set of data visualizations for the L3 (Level 3) data of the Hotel Recommendation system.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from io import StringIO
import boto3

In [None]:
# Where the L3 text dataset lives inside the data-warehouse bucket
bucket_name = 'andorra-hotels-data-warehouse'
object_key = 'l3_data/text/lsth.parquet'
s3_file_path = f's3://{bucket_name}/{object_key}'

# Optional boto3 handle: the read below goes through the s3:// path directly,
# so this client is not required for it.
s3_client = boto3.client('s3', region_name='us-west-2')

# Read the Parquet object into a pandas DataFrame; anon=False asks for a
# non-anonymous (credentialed) connection.
df = pd.read_parquet(
    s3_file_path,
    engine='pyarrow',
    storage_options={'anon': False},
)

# Preview the first few rows, rendered through the Styler with hide() applied
df_head = df.head()
df_head.style.hide()

## 1. NLP: Review Text Features

The plot below visualizes the distribution of values for each feature extracted from the `review_text_features` column of the dataset.
Each subplot shows the distribution of one feature across all reviews, i.e. how frequently each value occurs.

In [None]:
# Extract the review text features
def extract_review_text_features(row):
    """Return the 'values' entry of a row's 'review_text_features' field.

    Args:
        row: Any subscriptable record (e.g. a DataFrame row) expected to hold
            a 'review_text_features' mapping.

    Returns:
        Whatever is stored under row['review_text_features']['values'], or an
        empty list when either lookup raises KeyError or TypeError (missing
        key, or a non-subscriptable value such as None).
    """
    try:
        values = row['review_text_features']['values']
    except (KeyError, TypeError):
        # Missing column/key or a null feature struct: treat as "no features"
        values = []
    return values

# Pull the raw feature values out of every row (rows without usable features yield [])
df['review_text_values'] = df.apply(extract_review_text_features, axis=1)

# Expand the per-review value sequences into one column per feature
review_text_df = pd.DataFrame(df['review_text_values'].tolist())

# Plot the distribution of each feature on a grid of histograms
if not review_text_df.empty:
    num_features = review_text_df.shape[1]
    num_cols = 4
    # Ceiling division: exactly enough rows to hold every feature. The former
    # `num_features // num_cols + 1` reserved a whole spare (blank) row
    # whenever num_features was an exact multiple of num_cols.
    num_rows = (num_features + num_cols - 1) // num_cols

    # Grow the figure height with the number of rows so subplots stay legible
    # for large feature counts; 10 remains the minimum height.
    # squeeze=False guarantees a 2-D axes array regardless of the grid shape.
    fig, axes = plt.subplots(
        num_rows,
        num_cols,
        figsize=(15, max(10, 2.5 * num_rows)),
        squeeze=False,
    )

    # Flatten axes for easy iteration
    axes = axes.flatten()

    for i, column in enumerate(review_text_df.columns):
        ax = axes[i]
        sns.histplot(review_text_df[column], bins=20, kde=True, ax=ax)
        ax.set_title(f'Feature {i+1}')
        ax.set_xlabel('Value')
        ax.set_ylabel('Frequency')

    # Remove the unused subplots at the end of the last row
    for unused_ax in axes[num_features:]:
        fig.delaxes(unused_ax)

    plt.tight_layout()
    plt.show()
else:
    print("No data available in review_text_df")