In [3]:
from google.cloud import bigquery
import pandas
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
def read_bigquery_table(project_id, dataset_id, table_id):
    """Download an entire BigQuery table into a pandas DataFrame.

    Args:
        project_id: ID of the Google Cloud project that owns the dataset.
            It is also used as the client's project.
        dataset_id: ID of the dataset that contains the table.
        table_id: ID of the table to read.

    Returns:
        The result of ``RowIterator.to_dataframe()`` for every row of the
        table, i.e. the whole table is loaded into memory.
    """
    # Bind the client to the requested project instead of whatever default
    # project the environment happens to provide.
    client = bigquery.Client(project=project_id)

    # A fully qualified "project.dataset.table" ID makes sure the lookup
    # honours ``project_id``; previously the argument was silently ignored
    # and the table was resolved against the client's default project.
    table = client.get_table(f"{project_id}.{dataset_id}.{table_id}")

    return client.list_rows(table).to_dataframe()

def initial_stats_table(table, numeric_columns=None):
    """Print an exploratory summary of ``table`` and plot its numerical columns.

    The text report covers, in order: ``DataFrame.info()``, ``describe()``,
    per-column missing-value counts, the number of duplicated rows, and the
    number of unique values of every ``object`` column (the values themselves
    are listed when there are fewer than 10).

    The plots are: one histogram (with KDE) per numerical column, a pairplot
    of the numerical columns, and a heatmap of their correlation matrix.
    Plotting is skipped when there are no numerical columns.

    Args:
        table: pandas DataFrame to inspect.
        numeric_columns: optional iterable of column names to plot. When
            ``None`` (the default) every column selected by
            ``select_dtypes(include='number')`` is used.

    Returns:
        None. Everything is printed or shown as a figure.
    """
    print("DataFrame Info:")
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would append a stray "None" line to the output.
    table.info()

    print("Summary Statistics:")
    print(table.describe())

    print("Missing Values:")
    print(table.isnull().sum())

    print("Duplicate Rows:")
    print(table.duplicated().sum())

    print("Unique Values in Categorical Columns:")
    for col in table.select_dtypes(include='object').columns:
        # Count once and reuse the value for both the message and the check.
        n_unique = table[col].nunique()
        print(f"{col}: {n_unique} unique values")

        if n_unique < 10:
            print(table[col].unique())

    # Restrict every plot to numerical data: a hard-coded column name breaks
    # on any table that lacks it, and corr() on non-numeric columns raises in
    # recent pandas versions.
    if numeric_columns is None:
        numeric = table.select_dtypes(include='number')
    else:
        numeric = table[list(numeric_columns)]

    if numeric.shape[1] == 0:
        print("No numerical columns to plot.")
        return

    # Distribution of each numerical column.
    for col in numeric.columns:
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.histplot(data=numeric, x=col, bins=30, kde=True, ax=ax)
        ax.set_title(f"Distribution of {col}")
        plt.show()

    # Visualize relationships between numerical columns. pairplot returns a
    # grid of axes, so the title goes on the figure; plt.title would only
    # label the last subplot.
    grid = sns.pairplot(numeric)
    grid.figure.suptitle("Pairplot of Numerical Columns", y=1.02)
    plt.show()

    # Correlation heatmap of numerical columns.
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(numeric.corr(), annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
    ax.set_title("Correlation Heatmap of Numerical Columns")
    plt.show()

In [5]:
# Load the bookings table into a DataFrame for the exploratory report.
table = read_bigquery_table(
    project_id="ww-da-ingestion",
    dataset_id="v_extract1",
    table_id="bookings",
)

In [None]:
# Print the text summary and render the plots for the loaded table.
initial_stats_table(table)