# Create and/or add data to tvde_earnings_history table in the bronze layer

In [None]:
# Importing libraries
import os
import tkinter as tk
from tkinter import filedialog
import pandas as pd
import re
from datetime import datetime
from pyspark.sql import SparkSession
#from pyspark.sql.functions import col, lit, regexp_extract, when, to_date, unix_timestamp, from_unixtime, concat_ws, lpad, create_map
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, TimestampType



In [None]:
# Target location of the load: schema (layer), table and Spark application/catalog name
layer = "bronze"
table = "tvde_earnings_history"
catalog_name = "toll_reconciliation_tool"

# Define the desired warehouse location explicitly
project_dir = "C:/Users/renat/Documents/imgPdados-finance-uber/toll-reconciliation-tool/spark-warehouse"

# Build a well-formed file URI: os.path.abspath() returns backslashes on Windows
# and a leading "/" on POSIX, so normalise the separators and drop any leading
# slash before prefixing "file:///" (avoids "file:///C:\..." and "file:////home/...").
_warehouse_path = os.path.abspath(project_dir).replace(os.sep, "/").lstrip("/")
warehouse_location = "file:///" + _warehouse_path
print(f"Setting Spark warehouse to: {warehouse_location}")

# Change the current working directory so relative paths resolve inside the
# warehouse folder (raises if the folder does not exist).
os.chdir(project_dir)
print(f"Changed current working directory to: {os.getcwd()}")

In [None]:
# --- Functions (select_files, add_files_to_list, normalize_dataframe) ---
def select_files():
    """Open a file selection dialog and return the chosen paths.

    Returns:
        list: Paths of the selected files; empty when the dialog is cancelled.
    """
    root = tk.Tk()  # The dialog needs a Tk root window to attach to
    try:
        root.withdraw()  # Hide the empty root window; only the dialog is shown
        file_paths = filedialog.askopenfilenames(
            title="Select files",  # Window title
        )
    finally:
        # Always release the hidden root window, otherwise every call leaves
        # another Tk instance alive for the rest of the session.
        root.destroy()

    return list(file_paths)  # Convert the returned tuple to a list

def add_files_to_list(file_list):
    """Ask the user to pick files and append their paths to ``file_list``.

    The list is extended in place. The chosen paths are echoed to stdout, or a
    notice is printed when nothing was selected.
    """
    chosen_paths = select_files()

    # Nothing picked (dialog cancelled): report it and leave the list untouched
    if not chosen_paths:
        print("No files selected.")
        return

    file_list.extend(chosen_paths)
    print("Files added:")
    for chosen_path in chosen_paths:
        print(chosen_path)

def _first_captured_amount(patterns, text):
    """Return group 1 of the first matching pattern with ',' replaced by '.', else None."""
    for pattern in patterns:
        match = pattern.search(text)
        if match:
            return match.group(1).replace(',', '.')
    return None


def normalize_dataframe(df, filename):
    """Parse the free text of each row into separate trip-detail columns.

    Adds the columns ``Date``, ``Earnings``, ``Toll``, ``ServiceFee``, ``Tip``,
    ``StartTime``, ``TotalTime``, ``Distance`` and ``DataInput`` to ``df`` and
    fills them from the ``'Extracted Text'`` column using regular expressions.
    Values that cannot be found stay ``None``. Amounts and distances are stored
    as strings with ``,`` replaced by ``.``.

    Date handling: the year is the first four-digit run in ``filename`` and the
    month is the first Portuguese month abbreviation (jan, fev, ...) found in it.
    The day comes from a ``"dd de mmm"`` fragment in the row text. A row without
    such a fragment reuses the day of the closest previous row that had one;
    rows before the first dated row keep ``Date`` as ``None``.

    Args:
        df: pandas DataFrame with an ``'Extracted Text'`` column. It is modified
            in place; pass a copy to keep the original untouched.
        filename: Name of the source file, used only to derive year and month.

    Returns:
        The same ``df`` object, with the extra columns filled in.

    Raises:
        KeyError: If ``df`` has no ``'Extracted Text'`` column.
    """
    # Output columns start empty so unmatched fields end up as None
    for column in ('Date', 'Earnings', 'Toll', 'ServiceFee', 'Tip',
                   'StartTime', 'TotalTime', 'Distance', 'DataInput'):
        df[column] = None

    toll_words = ["pedágio"]
    service_fee_words = ["taxa de serviço"]
    tip_words = ["valor extra"]

    # Extract year and month from filename
    year_match = re.search(r'\d{4}', filename)
    month_match = re.search(r'(jan|fev|mar|abr|mai|jun|jul|ago|set|out|nov|dez)', filename, re.IGNORECASE)

    if year_match and month_match:
        year = year_match.group(0)
        month_number = {
            'jan': '01', 'fev': '02', 'mar': '03', 'abr': '04', 'mai': '05', 'jun': '06',
            'jul': '07', 'ago': '08', 'set': '09', 'out': '10', 'nov': '11', 'dez': '12'
        }[month_match.group(0).lower()]
    else:
        year = None
        month_number = None

    # Compile every pattern once instead of once per row. Keywords are escaped
    # so that a keyword containing regex metacharacters is matched literally.
    date_pattern = re.compile(
        r'(?:seg|ter|qua|qui|sex|sáb|dom|Mon|Tue|Wed|Thu|Fri|Sat|Sun)?,? (\d{2}) de [a-z]{3}',
        re.IGNORECASE,
    )
    earnings_pattern = re.compile(r'€\s*([\d,.]+)')
    toll_patterns = [
        re.compile(r'([\d,.]+)\s*' + re.escape(word), re.IGNORECASE) for word in toll_words
    ]
    service_fee_patterns = [
        re.compile(r'€\s*([\d,.]+)\s*' + re.escape(word), re.IGNORECASE) for word in service_fee_words
    ]
    tip_patterns = [
        re.compile(r'([\d,.]+)\s*' + re.escape(word), re.IGNORECASE) for word in tip_words
    ]
    start_time_pattern = re.compile(r'(\d{2}-\d{2}|\d{2}\.\d{2}|\d{2}\*\d{2})')
    total_time_pattern = re.compile(r'(\d+)\s*min\s*(\d+)\s*seg')
    distance_pattern = re.compile(r'([\d,.]+)\s*km')

    # One ingestion timestamp for the whole file (control column)
    ingested_at = datetime.now()

    # Day of the most recent dated row; None until the first date is seen.
    # Initialising it here avoids reading an unassigned variable when the
    # very first row carries no date.
    last_day = None

    for index, row in df.iterrows():
        text = row['Extracted Text']
        if not isinstance(text, str):
            # Missing cell (e.g. NaN): nothing to parse, but the row still gets
            # its control column and the carried-forward date.
            text = ''

        # include control columns
        df.at[index, 'DataInput'] = ingested_at

        # Date: take the day from this row, or carry the previous one forward
        date_match = date_pattern.search(text)
        if date_match:
            last_day = date_match.group(1)
        if last_day is not None:
            if year and month_number:
                df.at[index, 'Date'] = f"{year}/{month_number}/{last_day}"
            else:
                # Year/month unknown from the filename: keep only the day
                df.at[index, 'Date'] = f"{last_day}"

        # Earnings: first euro amount in the text
        earnings = _first_captured_amount([earnings_pattern], text)
        if earnings is not None:
            df.at[index, 'Earnings'] = earnings

        # Toll: amount immediately before a toll keyword
        toll = _first_captured_amount(toll_patterns, text)
        if toll is not None:
            df.at[index, 'Toll'] = toll

        # Service Fee: euro amount immediately before a service-fee keyword
        service_fee = _first_captured_amount(service_fee_patterns, text)
        if service_fee is not None:
            df.at[index, 'ServiceFee'] = service_fee

        # Tip: amount immediately before a tip keyword
        tip = _first_captured_amount(tip_patterns, text)
        if tip is not None:
            df.at[index, 'Tip'] = tip

        # Start Time: "hh-mm", "hh.mm" or "hh*mm" normalised to "hh:mm"
        start_time_match = start_time_pattern.search(text)
        if start_time_match:
            df.at[index, 'StartTime'] = start_time_match.group(1).replace('.', ':').replace('*', ':').replace('-', ':')

        # Total Time: "<m> min <s> seg" stored as "m:s"
        total_time_match = total_time_pattern.search(text)
        if total_time_match:
            df.at[index, 'TotalTime'] = f"{total_time_match.group(1)}:{total_time_match.group(2)}"

        # Distance: number immediately before "km"
        distance = _first_captured_amount([distance_pattern], text)
        if distance is not None:
            df.at[index, 'Distance'] = distance

    return df


In [None]:
# Start with empty containers: one for the chosen file paths, one for the
# pandas DataFrames that will be read from them.
filepath_list, pandas_dataframes_list = [], []

# Open the file dialog; the chosen paths are appended to filepath_list and printed
add_files_to_list(filepath_list)


In [None]:
# Read every selected CSV into its own pandas DataFrame and store it in
# 'pandas_dataframes_list'. Files that fail to load are reported and skipped.
pandas_dataframes_list.clear()  # Re-running this cell must not duplicate DataFrames
loaded_filepaths = []  # Paths that were read successfully, in load order

for filepath in filepath_list:
    try:
        df = pd.read_csv(filepath, encoding='utf-8')  # Use utf-8 encoding
    except FileNotFoundError:
        print(f"Error: File not found - {filepath}")
    except pd.errors.EmptyDataError:
        print(f"Error: Empty CSV file - {filepath}")
    except pd.errors.ParserError:
        print(f"Error: Parsing error in {filepath}. Check the file format.")
    except Exception as e:
        print(f"An unexpected error occurred while reading {filepath}: {e}")
    else:
        pandas_dataframes_list.append(df)
        loaded_filepaths.append(filepath)
        print(f"Successfully read {filepath}")

# Keep filepath_list aligned with pandas_dataframes_list. Later cells pair the
# two lists by position to derive year/month from the file name, so a skipped
# file must not leave its path behind (it would shift every following pairing).
filepath_list[:] = loaded_filepaths


# Preview every DataFrame that was loaded
if pandas_dataframes_list:  # Check if the list is not empty
    for i, df in enumerate(pandas_dataframes_list):
        print(f"Shape (rows, columns): {df.shape}")
        print(f"\nFirst few rows of DataFrame {i + 1}:")
        display(df.head())  # Show the first few rows of this DataFrame
else:
    print("\nNo CSV files were successfully read.")


In [None]:
# Normalize every loaded pandas DataFrame. Each DataFrame is paired by position
# with the path it was read from, because the file name supplies year and month.
if len(pandas_dataframes_list) != len(filepath_list):
    # A file that failed to load leaves the two lists out of step, so the
    # positional pairing below may use the wrong file name for some DataFrames.
    print(
        f"Warning: {len(pandas_dataframes_list)} DataFrames but {len(filepath_list)} file paths; "
        "dates derived from file names may be wrong."
    )

normalized_pandas_dataframes_list = [
    normalize_dataframe(df.copy(), os.path.basename(filepath))  # copy: keep the raw DataFrame intact
    for df, filepath in zip(pandas_dataframes_list, filepath_list)
]


# Now 'normalized_pandas_dataframes_list' contains the normalized DataFrames.
# Configure pandas display options
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.width', 1000)  # Increase terminal width (adjust as needed)

# Preview every normalized DataFrame
if normalized_pandas_dataframes_list:  # Check if the list is not empty
    for i, ndf in enumerate(normalized_pandas_dataframes_list):
        print(f"Shape (rows, columns): {ndf.shape}")
        print(f"\nFirst few rows of DataFrame {i + 1}:")
        display(ndf.head())  # Show the first few rows of each DataFrame
else:
    print("\nNo files were successfully read.")


In [None]:
# Explicitly convert the column dtypes in pandas
numeric_cols = ['Earnings', 'Toll', 'ServiceFee', 'Tip', 'Distance']
string_cols = ['Extracted Text', 'Date', 'StartTime', 'TotalTime']

# Column name -> conversion, in the order the conversions are applied:
# numeric columns first (unparseable values become NaN), then the ingestion
# timestamp, then the text columns.
column_casts = {name: (lambda series: pd.to_numeric(series, errors='coerce')) for name in numeric_cols}
column_casts['DataInput'] = pd.to_datetime
column_casts.update({name: (lambda series: series.astype(str)) for name in string_cols})

for frame in normalized_pandas_dataframes_list:
    for name, cast in column_casts.items():
        frame[name] = cast(frame[name])
    print(frame.dtypes)  # Check the dtypes after the conversion

In [None]:
# Create (or reuse) a Hive-enabled Spark session whose warehouse directory is
# the location configured above
spark = (
    SparkSession.builder
    .appName(catalog_name)
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.warehouse.dir", warehouse_location)
    .enableHiveSupport()
    .getOrCreate()
)


# Report whether a session object is available
if not spark:
    print("Failed to start Spark session.")
else:
    print("Spark session started successfully!")
    print(f"Application name: {spark.sparkContext.appName}")


In [None]:
# Attempt a basic Hive operation
try:
    # Show a list of layers/schemas/databases in the spark-warehouse
    spark.sql("SHOW DATABASES").show()
except Exception as e:
    print(f"Error: {e}")

# Check that the target layer's schema exists. The name comes from the `layer`
# setting so this check, the messages and the USE statement cannot drift apart.
schemas = [row[0] for row in spark.sql("SHOW SCHEMAS").collect()]

if layer not in schemas:
    print(f"Error: The '{layer}' schema does not exist. Please create it.")  # target layer not found
else:
    print(f"The '{layer}' schema exists.")
    # Switch to the target layer
    spark.sql(f"USE {layer}")



In [None]:

# Build one Spark DataFrame per normalized pandas DataFrame, keeping the list order
spark_dataframes_list = list(map(spark.createDataFrame, normalized_pandas_dataframes_list))


In [None]:

# Preview the first rows of every Spark DataFrame; report when there is nothing to show
if not spark_dataframes_list:
    print("\nNo files were successfully read.")
else:
    for i, spark_frame in enumerate(spark_dataframes_list):
        print(f"Visualizando DataFrame número: {i + 1} da lista")
        print("\nPrimeiras 5 linhas:")
        spark_frame.show(5)
        print("-" * 30)  # Separator line for readability

In [None]:

import traceback

# Append every Spark DataFrame to the target Delta table. Each DataFrame is
# paired by position with its source path (the same pairing used when the
# DataFrames were normalized) so the messages name the file actually written,
# instead of whatever path an earlier loop happened to leave in `filepath`.
loaded_count = 0
for sdf, source_path in zip(spark_dataframes_list, filepath_list):
    try:
        # Write the normalized Spark DataFrame to a Delta table
        sdf.write \
            .format('delta') \
            .mode('append') \
            .option('mergeSchema', 'true') \
            .saveAsTable(f'{layer}.{table}')
    except Exception as e:
        # Best effort: report the failure and continue with the next file
        print(f"An error occurred while processing {source_path}: {e}")
        traceback.print_exc()
    else:
        loaded_count += 1
        print(f"Successfully processed and loaded {source_path} into {layer}.{table}")

# Report how many writes actually succeeded rather than assuming all did
print(f"{loaded_count} of {len(spark_dataframes_list)} files loaded into '{layer}.{table}'.")


In [0]:
%sql
-- Show the detail metadata of the bronze earnings history table
DESC DETAIL bronze.tvde_earnings_history

In [0]:
%sql
-- Return every column and row of the bronze earnings history table
select *
from bronze.tvde_earnings_history

In [None]:
# Stop the Spark session created earlier in this notebook
spark.stop()