In [None]:
import pandas as pd
import re
from collections import Counter
from sqlalchemy import create_engine
from sqlalchemy.exc import SQLAlchemyError
import matplotlib.pyplot as plt

# Define your connection string
# NOTE(review): the credentials are hard-coded in this URL; consider loading
# them from the environment or a secrets store before sharing this notebook.
engine = create_engine('oracle+oracledb://root:password@localhost:1521/?service_name=FREEPDB1')

# Substrings (matched case-insensitively) that mark a name as placeholder/test data.
invalid_keywords = ['test', 'placeholder', 'empty', 'admin', 'sample', 'restaurant', 'location']

# Inclusive bounds on the number of whitespace-separated words in a kept name.
MIN_NAME_WORDS = 3
MAX_NAME_WORDS = 4

# How many of the most frequent chains are reported and plotted.
TOP_N_CHAINS = 10


def _strip_and_drop_blank(frame):
    """Strip surrounding whitespace from ``name_val`` and drop null/blank rows.

    Args:
        frame: DataFrame with a ``name_val`` column of strings (may hold nulls).

    Returns:
        A new DataFrame; the input is not modified.
    """
    stripped = frame['name_val'].str.strip()
    return frame.assign(name_val=stripped)[stripped.notna() & (stripped != '')]


def _filter_and_clean(frame, keywords, min_words, max_words):
    """Drop placeholder and out-of-range names, then normalise the survivors.

    Args:
        frame: DataFrame whose ``name_val`` column holds non-null strings.
        keywords: Substrings that disqualify a name (case-insensitive).
        min_words: Smallest accepted word count (inclusive).
        max_words: Largest accepted word count (inclusive).

    Returns:
        A new DataFrame whose ``name_val`` contains only letters separated by
        single spaces; rows that end up empty are removed.
    """
    names = frame['name_val']

    # Escape each keyword so it is always matched literally, never as regex syntax.
    keyword_pattern = '|'.join(re.escape(word) for word in keywords)
    is_placeholder = names.str.contains(keyword_pattern, case=False, regex=True, na=False)

    # Word counts are taken on the raw (stripped) name, before characters are removed.
    word_counts = names.str.split().str.len()
    kept = names[~is_placeholder & word_counts.between(min_words, max_words)]

    # Remove everything that is neither a letter nor whitespace. Using \w (minus
    # digits and underscore) keeps accented letters, e.g. "Café" stays "Café".
    cleaned = kept.str.replace(r'[^\w\s]|[\d_]', '', regex=True)

    # Removing characters can leave runs of whitespace ("Joe & Sons" -> "Joe  Sons");
    # collapse them so identical chains always produce the identical key.
    cleaned = cleaned.str.replace(r'\s+', ' ', regex=True).str.strip()
    cleaned = cleaned[cleaned != '']

    # Build a fresh frame instead of assigning into a filtered slice, which
    # would raise pandas' SettingWithCopyWarning.
    return frame.loc[cleaned.index].assign(name_val=cleaned)


# Only the query itself can raise SQLAlchemyError, so keep the try body minimal.
try:
    with engine.connect() as connection:
        # Fetch all restaurant names
        query = "SELECT name_val FROM MERCHANT"
        df = pd.read_sql(query, con=connection)
except SQLAlchemyError as e:
    print(f"❌ Database Error: {e}")
else:
    # ✅ Debug: Print first few names
    print("Raw Data from Database:")
    print(df.head())

    # Ensure column exists and is not empty
    if df.empty or 'name_val' not in df.columns:
        print("⚠️ No restaurant names found or 'name_val' column is missing!")
    else:
        df = _strip_and_drop_blank(df)

        if df.empty:
            print("⚠️ No valid restaurant names found after cleaning!")
        else:
            df = _filter_and_clean(df, invalid_keywords, MIN_NAME_WORDS, MAX_NAME_WORDS)

            # The chain key is the full normalised name, lower-cased so that
            # "Taco Bell Downtown Grill" and "taco bell downtown grill" are
            # counted together.
            chain_list = [name.lower() for name in df['name_val']]

            # Count the frequency of each restaurant chain
            chain_counts = Counter(chain_list)

            if not chain_counts:
                # Nothing survived filtering: skip the empty table and chart.
                print("⚠️ No restaurant chains left after filtering!")
            else:
                print(f"\nTop {TOP_N_CHAINS} most popular restaurant chains")

                # Get the most frequent chains
                top_chains = chain_counts.most_common(TOP_N_CHAINS)

                # Convert to DataFrame for better visualization
                df_chains = pd.DataFrame(top_chains, columns=['Chain', 'Count'])

                # ✅ Debug: Print chain frequencies
                print(df_chains)

                # 🔹 Bar Graph
                plt.figure(figsize=(10, 6))
                plt.bar(df_chains['Chain'], df_chains['Count'], color='purple', edgecolor='black')
                plt.title(f'Top {TOP_N_CHAINS} Most Popular Restaurant Chains')
                plt.xlabel('Restaurant Chain')
                plt.ylabel('Count')
                plt.xticks(rotation=45, ha='right')
                plt.grid(axis='y', linestyle='--', alpha=0.7)
                # Rotated tick labels are otherwise clipped at the figure edge.
                plt.tight_layout()
                plt.show()
