# 1. Setup and Imports

In [1]:
import os
import sys
import pandas as pd

# Add src/ to path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..', 'src')))

from utils import log_step

# 2. Load Raw Data (from CSV or Oracle)

In [2]:
log_step("Loading scraped reviews...")
df_raw = pd.read_csv("../data/bank_reviews_raw.csv")
df_raw = df_raw.rename(columns={"date": "review_date"})
df_raw.head()

[2025-06-07 16:45:53] 🔹 Loading scraped reviews...


Unnamed: 0,review,rating,review_date,app_name
0,"""Why don’t your ATMs support account-to-accoun...",4,2025-06-06,Commercial Bank of Ethiopia
1,what is this app problem???,1,2025-06-05,Commercial Bank of Ethiopia
2,the app is proactive and a good connections.,5,2025-06-05,Commercial Bank of Ethiopia
3,I cannot send to cbebirr app. through this app.,3,2025-06-05,Commercial Bank of Ethiopia
4,good,4,2025-06-05,Commercial Bank of Ethiopia


# 3. Remove Duplicates and Nulls

In [3]:
log_step("Removing duplicates and missing values...")
initial_shape = df_raw.shape

df_cleaned = df_raw.drop_duplicates(subset=["review", "app_name"])
df_cleaned.dropna(subset=["review", "rating", "review_date"], inplace=True)

log_step(f"Removed {initial_shape[0] - df_cleaned.shape[0]} rows")

[2025-06-07 16:45:53] 🔹 Removing duplicates and missing values...
[2025-06-07 16:45:53] 🔹 Removed 182 rows


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned.dropna(subset=["review", "rating", "review_date"], inplace=True)


# 4. Normalize Date Format

In [4]:
log_step("Normalizing date format...")
df_cleaned["review_date"] = pd.to_datetime(df_cleaned["review_date"]).dt.date  # YYYY-MM-DD

[2025-06-07 16:45:53] 🔹 Normalizing date format...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned["review_date"] = pd.to_datetime(df_cleaned["review_date"]).dt.date  # YYYY-MM-DD


# 5. Validate Data

In [5]:
log_step("Summary of cleaned data:")
print(df_cleaned["app_name"].value_counts())
print(df_cleaned["rating"].value_counts())

[2025-06-07 16:45:53] 🔹 Summary of cleaned data:
app_name
Commercial Bank of Ethiopia    367
Dashen Bank                    367
BoA Mobile                     351
Name: count, dtype: int64
rating
5    667
1    244
4     75
3     56
2     43
Name: count, dtype: int64


# 6. Save Cleaned Data

In [6]:
log_step("Saving cleaned data to CSV...")
df_cleaned.to_csv("../data/bank_reviews_cleaned.csv", index=False)

[2025-06-07 16:45:53] 🔹 Saving cleaned data to CSV...


# Save DataFrame to Oracle

In [7]:
from database import get_engine, insert_dataframe

engine = get_engine()

insert_dataframe(df_cleaned, table_name='bank_reviews_cleaned', engine=engine)

Data inserted successfully into 'bank_reviews_cleaned'.
