# Load_bronze_tvde.ipynb

In [None]:
# Importing libraries 
import pandas as pd
import os
import tkinter as tk
from tkinter import filedialog
import re
from datetime import datetime
from pyspark.sql import SparkSession



In [None]:
# Target layer and table for this load.
# `layer` is selected as the current Spark database (`USE {layer}`) and the
# data is saved to the table `{layer}.{table}` further down in the notebook.
layer = "bronze"
table = "tvde_earnings_history"

In [None]:
# --- Functions (select_files, add_files_to_list, normalize_dataframe) ---

def select_files():
    """Open a file selection dialog and return the chosen paths.

    Returns:
        list: Paths of the selected files; empty if the dialog was cancelled.
    """
    root = tk.Tk()  # A root window is required to host the dialog
    root.withdraw()  # Hide the root window; only the dialog should be visible

    try:
        file_paths = filedialog.askopenfilenames(
            title="Select files",  # Window title
        )
    finally:
        # Destroy the hidden root so repeated calls do not accumulate
        # orphaned Tk windows/interpreters in the notebook kernel.
        root.destroy()

    return list(file_paths)  # Convert the returned tuple to a list


def add_files_to_list(file_list):
    """Ask the user to pick files and append the chosen paths to ``file_list``.

    The list is modified in place. A short report of what was added (or that
    nothing was selected) is printed.
    """
    chosen_paths = select_files()

    # Nothing picked (dialog cancelled): report it and leave the list untouched
    if not chosen_paths:
        print("No files selected.")
        return

    file_list.extend(chosen_paths)
    print("Files added:")
    for chosen_path in chosen_paths:
        print(chosen_path)

# Portuguese month abbreviations expected in the source filename.
_MONTH_NUMBERS = {
    'jan': '01', 'fev': '02', 'mar': '03', 'abr': '04', 'mai': '05', 'jun': '06',
    'jul': '07', 'ago': '08', 'set': '09', 'out': '10', 'nov': '11', 'dez': '12',
}

# Columns added by normalize_dataframe, in the order they are appended.
_NORMALIZED_COLUMNS = (
    'Date', 'Earnings', 'Toll', 'ServiceFee', 'Tip',
    'StartTime', 'TotalTime', 'Distance', 'DataInput',
)

_TOLL_WORDS = ("pedágio",)
_SERVICE_FEE_WORDS = ("taxa de serviço",)
_TIP_WORDS = ("valor extra",)

# Patterns are compiled once instead of being rebuilt for every row.
_YEAR_RE = re.compile(r'\d{4}')
_MONTH_RE = re.compile(r'(jan|fev|mar|abr|mai|jun|jul|ago|set|out|nov|dez)', re.IGNORECASE)
_DATE_RE = re.compile(
    r'(?:seg|ter|qua|qui|sex|sáb|dom|Mon|Tue|Wed|Thu|Fri|Sat|Sun)?,? (\d{2}) de [a-z]{3}',
    re.IGNORECASE,
)
_EARNINGS_RE = re.compile(r'€\s*([\d,.]+)')
_TOLL_RES = [re.compile(r'([\d,.]+)\s*' + word, re.IGNORECASE) for word in _TOLL_WORDS]
_SERVICE_FEE_RES = [
    re.compile(r'€\s*([\d,.]+)\s*' + word, re.IGNORECASE) for word in _SERVICE_FEE_WORDS
]
_TIP_RES = [re.compile(r'([\d,.]+)\s*' + word, re.IGNORECASE) for word in _TIP_WORDS]
_START_TIME_RE = re.compile(r'(\d{2}-\d{2}|\d{2}\.\d{2}|\d{2}\*\d{2})')
_TOTAL_TIME_RE = re.compile(r'(\d+)\s*min\s*(\d+)\s*seg')
_DISTANCE_RE = re.compile(r'([\d,.]+)\s*km')


def _first_amount(patterns, text):
    """Return the amount captured by the first matching pattern, or None."""
    for pattern in patterns:
        match = pattern.search(text)
        if match:
            return match.group(1).replace(',', '.')
    return None


def _extract_fields(text, year, month_number):
    """Parse one 'Extracted Text' value into a dict of normalized fields."""
    fields = dict.fromkeys(_NORMALIZED_COLUMNS)

    # Missing cells arrive as NaN/None; there is nothing to parse in them.
    if not isinstance(text, str):
        return fields

    # Date: only set when a day is actually present in this row's text.
    date_match = _DATE_RE.search(text)
    if date_match:
        day = date_match.group(1)
        if year and month_number:
            fields['Date'] = f"{year}/{month_number}/{day}"
        else:
            fields['Date'] = f"{day}"

    fields['Earnings'] = _first_amount([_EARNINGS_RE], text)
    fields['Toll'] = _first_amount(_TOLL_RES, text)
    fields['ServiceFee'] = _first_amount(_SERVICE_FEE_RES, text)
    fields['Tip'] = _first_amount(_TIP_RES, text)

    # Start time: the separator may be '-', '.' or '*'; normalize it to ':'.
    start_time_match = _START_TIME_RE.search(text)
    if start_time_match:
        fields['StartTime'] = (
            start_time_match.group(1).replace('.', ':').replace('*', ':').replace('-', ':')
        )

    total_time_match = _TOTAL_TIME_RE.search(text)
    if total_time_match:
        fields['TotalTime'] = f"{total_time_match.group(1)}:{total_time_match.group(2)}"

    fields['Distance'] = _first_amount([_DISTANCE_RE], text)
    return fields


def normalize_dataframe(df, filename):
    """Add normalized columns parsed from the 'Extracted Text' column.

    The year (first 4-digit run) and month (Portuguese 3-letter abbreviation)
    are taken from ``filename``; the day and all other fields are parsed from
    each row's text. Values that cannot be found are left as None. In
    particular, a row without a recognizable date gets ``Date = None`` rather
    than reusing the day of a previous row.

    Args:
        df: DataFrame with an 'Extracted Text' column. It is modified in place.
        filename: Name of the source file, used to derive year and month.

    Returns:
        The same DataFrame with the columns Date, Earnings, Toll, ServiceFee,
        Tip, StartTime, TotalTime, Distance and DataInput added. Parsed values
        are strings; DataInput holds the timestamp at which the row was
        processed.

    Raises:
        KeyError: If ``df`` has no 'Extracted Text' column.
    """
    # Extract year and month from filename
    year_match = _YEAR_RE.search(filename)
    month_match = _MONTH_RE.search(filename)

    if year_match and month_match:
        year = year_match.group(0)
        month_number = _MONTH_NUMBERS[month_match.group(0).lower()]
    else:
        year = None
        month_number = None

    records = []
    for text in df['Extracted Text']:
        record = _extract_fields(text, year, month_number)
        record['DataInput'] = datetime.now()  # control column: processing time
        records.append(record)

    # Assign whole columns at once; dtype=object keeps None and the raw
    # string/datetime values exactly as parsed.
    for column in _NORMALIZED_COLUMNS:
        df[column] = pd.Series(
            [record[column] for record in records], index=df.index, dtype=object
        )

    return df

In [None]:
# Get the running Spark session, or create one named "LoadBronzeData"
session_builder = SparkSession.builder.appName("LoadBronzeData")
spark = session_builder.getOrCreate()

# Make the target layer the current database
spark.sql(f"USE {layer}")

# Report whether a session object is available
if not spark:
    print("Falha ao iniciar a sessão Spark.")
else:
    print("Sessão Spark iniciada com sucesso!")
    print(f"Nome da aplicação: {spark.sparkContext.appName}")

In [0]:
# Selecting files:
filepath_list = []  # Paths chosen by the user in the file dialog
add_files_to_list(filepath_list)  # Fills the list in place and prints what was added
dataframes_list = []  # DataFrames that were read successfully
loaded_filepaths = []  # Paths of those DataFrames, in the same order

for filepath in filepath_list:
    try:
        df = pd.read_csv(filepath, encoding='utf-8')  # Use utf-8 encoding
    except FileNotFoundError:
        print(f"Error: File not found - {filepath}")
    except pd.errors.EmptyDataError:
        print(f"Error: Empty CSV file - {filepath}")
    except pd.errors.ParserError:
        print(f"Error: Parsing error in {filepath}. Check the file format.")
    except Exception as e:
        print(f"An unexpected error occurred while reading {filepath}: {e}")
    else:
        dataframes_list.append(df)
        loaded_filepaths.append(filepath)
        print(f"Successfully read {filepath}")

# Keep only the paths that were actually loaded. The normalization step pairs
# dataframes_list with filepath_list position by position; if a failed file
# stayed in the list, every later DataFrame would be matched with the wrong
# filename (and therefore the wrong year/month).
filepath_list = loaded_filepaths

# 'dataframes_list' now holds one pandas DataFrame per entry of 'filepath_list'.

# Preview each DataFrame that was read
if dataframes_list:  # Check if the list is not empty
    for i, df in enumerate(dataframes_list):
        print(f"Number of rows in {df.shape}")
        print(f"\nFirst few rows of the {i+1}th DataFrame:")
        display(df.head())  # Show the first few rows of each DataFrame
else:
    print("\nNo CSV files were successfully read.")

In [0]:
# Normalize a copy of every loaded DataFrame, passing the base name of its
# source file so the year and month can be derived from it
normalized_dataframes_list = [
    normalize_dataframe(frame.copy(), os.path.basename(source_path))
    for frame, source_path in zip(dataframes_list, filepath_list)
]

# 'normalized_dataframes_list' now contains the normalized DataFrames.
# Pandas display options for the preview below
pd.set_option('display.max_columns', None)  # Do not truncate the column list
pd.set_option('display.width', 700)  # Wider output (adjust as needed)

# Preview the first rows of each normalized DataFrame
if not normalized_dataframes_list:
    print("\nNo files were successfully read.")
else:
    for i, df in enumerate(normalized_dataframes_list):
        print(f"Number of rows in {df.shape}")
        print(f"\nFirst few rows of the {i+1}th DataFrame:")
        display(df.head())


In [0]:
# From pandas to Spark and then to a Delta table
# Convert every normalized pandas DataFrame into a Spark DataFrame
spark_dataframes_list = [spark.createDataFrame(pandas_df) for pandas_df in normalized_dataframes_list]

# Write the DataFrames to the target table in Delta format: the first one
# replaces the table (data and schema), the remaining ones are appended
target_table = f'{layer}.{table}'
for i, df in enumerate(spark_dataframes_list):
    delta_writer = df.write.format('delta').option('mergeSchema', 'true')

    if i != 0:
        delta_writer.mode('append').saveAsTable(target_table)
        print(f"DataFrame {i+1} appended successfully to Spark table '{layer}.{table}'.")
        continue

    delta_writer.mode('overwrite').option('overwriteSchema', 'true').saveAsTable(target_table)
    print(f"DataFrame {i+1} (first) loaded successfully into Spark table '{layer}.{table}' (overwritten).")

print(f"All {len(spark_dataframes_list)} DataFrames processed and loaded into '{layer}.{table}'.")

In [0]:
%sql
DESC DETAIL bronze.tvde_earnings_history

In [0]:
%sql
select *
from bronze.tvde_earnings_history

In [None]:
# Stop the Spark session now that the load and the verification queries are done.
# Any later cell that uses `spark` will need a new session.
spark.stop()