In [None]:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import sys

# Put the parent of the current working directory on the import path so the
# `scripts` package imported below can be resolved.
# NOTE(review): this assumes the kernel's working directory sits one level
# below the project root — confirm before running from another location.
sys.path.append(str(Path().resolve().parent))

from scripts.preprocessing import load_raw_reviews, preprocess_reviews, save_clean_reviews

In [None]:



# Load the raw reviews. The helper is called without arguments, so the input
# location is whatever `load_raw_reviews` defaults to (not visible here).
df_raw = load_raw_reviews()

# Derive the cleaned frame from the raw one; `df_raw` is kept under its own
# name so both stages stay available to later cells.
df_clean = preprocess_reviews(df_raw)

# Hand the cleaned frame to the project's save helper.
# NOTE(review): presumably this writes to disk, so re-running the cell would
# repeat the write — confirm against scripts/preprocessing.
save_clean_reviews(df_clean)

# Last expression of the cell: the notebook renders the first rows of the
# cleaned frame as a quick visual check.
df_clean.head()


In [None]:
# Apply seaborn's whitegrid styling to every figure drawn below.
sns.set(style='whitegrid')

# Each entry describes one bar chart of review counts:
# (column to count, palette name, figure title, x-axis label).
count_charts = [
    ('rating', 'viridis', 'Distribution of Ratings', 'Rating'),
    ('bank', 'Set2', 'Number of Reviews per Bank', 'Bank'),
]

# NOTE(review): recent seaborn releases warn when `palette` is given without
# `hue`; revisit this call once the project's seaborn version is pinned.
for column, palette_name, chart_title, x_label in count_charts:
    # A fresh 8x4 figure per chart, drawn through an explicit Axes handle.
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.countplot(data=df_clean, x=column, palette=palette_name, ax=ax)
    ax.set_title(chart_title)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Number of Reviews')
    plt.show()


In [None]:
# Row count of the cleaned frame.
total = df_clean.shape[0]

# Fraction of missing values in each column; the check below is driven by the
# worst (highest) column, expressed as a percentage.
missing_share_by_column = df_clean.isna().mean()
missing_percentage = missing_share_by_column.max() * 100

# Pass mark only when the worst column stays under the 5% threshold.
if missing_percentage < 5:
    status_mark = '✅'
else:
    status_mark = '❌'

print(f"✅ Total cleaned reviews: {total}")
print(f"🔍 Missing data <5%: {status_mark} ({missing_percentage:.2f}%)")
