In [None]:
# Importing libraries
import sqlite3
import pandas as pd
import os
import tkinter as tk
from tkinter import filedialog
import re

In [None]:
# Functions
def select_files():
    """Opens a file selection dialog and returns a list with the paths of the selected files.

    A hidden Tk root window is created only to host the dialog and is
    destroyed before returning, so repeated calls (e.g. re-running the cell)
    do not accumulate Tk instances in the kernel.

    Returns:
        list: The selected file paths; empty if the dialog was cancelled.
    """
    root = tk.Tk()  # The dialog needs a root window to attach to
    try:
        root.withdraw()  # Keep the empty root window off screen
        file_paths = filedialog.askopenfilenames(
            title="Select files",  # Window title
        )
    finally:
        # Release the Tk interpreter even if the dialog raised.
        root.destroy()

    return list(file_paths)  # Convert the returned tuple to a list


def add_files_to_list(file_list):
    """Prompts for files and appends the chosen paths to ``file_list`` in place.

    Prints the paths that were added, or a notice when nothing was selected.
    Returns nothing; the caller's list is mutated.
    """
    chosen_paths = select_files()

    # Guard clause: the selection came back empty.
    if not chosen_paths:
        print("No files selected.")
        return

    file_list.extend(chosen_paths)
    print("Files added:")
    for chosen_path in chosen_paths:
        print(chosen_path)

def _amount_before_keyword(text, keywords, prefix=''):
    """Returns the number written directly before the first matching keyword.

    Keywords are tried in order and matched case-insensitively. ``prefix`` is
    a regex fragment that must appear right before the number. Commas in the
    captured number are replaced with dots. Returns None when nothing matches.
    """
    for keyword in keywords:
        match = re.search(prefix + r'([\d,.]+)\s*' + keyword, text, re.IGNORECASE)
        if match:
            return match.group(1).replace(',', '.')
    return None


def normalize_dataframe(df, filename):
    """Normalizes a DataFrame according to specified rules.

    Parses each row's 'Extracted Text' with regular expressions and fills the
    columns Date, Earnings, Toll, ServiceFee, Tip, StartTime, TotalTime and
    Distance. Values are stored as strings; fields that are not found stay
    None.

    The year (first 4-digit run) and month (Portuguese 3-letter abbreviation)
    are taken from ``filename``. A row containing "<dd> de <mmm>" sets the
    current date, which is then carried forward to the following rows until
    the next such row. Rows that appear before any date row keep Date = None.
    When the filename yields no year/month, the date is just the day ("dd").

    Args:
        df: DataFrame with an 'Extracted Text' column. It is modified in
            place (pass a copy to keep the original untouched).
        filename: File name used to derive the year and month.

    Returns:
        The same DataFrame object, with the extra columns filled in.
    """
    for column in ('Date', 'Earnings', 'Toll', 'ServiceFee', 'Tip',
                   'StartTime', 'TotalTime', 'Distance'):
        df[column] = None

    toll_words = ["pedágio"]
    service_fee_words = ["taxa de serviço"]
    tip_words = ["valor extra"]

    month_numbers = {
        'jan': '01', 'fev': '02', 'mar': '03', 'abr': '04', 'mai': '05', 'jun': '06',
        'jul': '07', 'ago': '08', 'set': '09', 'out': '10', 'nov': '11', 'dez': '12'
    }

    # Extract year and month from filename
    year_match = re.search(r'\d{4}', filename)
    month_match = re.search(r'(jan|fev|mar|abr|mai|jun|jul|ago|set|out|nov|dez)', filename, re.IGNORECASE)

    if year_match and month_match:
        year = year_match.group(0)
        month_number = month_numbers[month_match.group(0).lower()]
    else:
        year = None
        month_number = None

    # Last date seen in a date row; None until the first one is found.
    current_date = None

    for index, row in df.iterrows():
        text = row['Extracted Text']
        if not isinstance(text, str):
            # Missing cells (NaN/None) would make re.search raise; treat them
            # as rows with nothing to parse.
            text = ''

        # Date
        date_match = re.search(r'(seg|ter|qua|qui|sex|sáb|dom|Mon|Tue|Wed|Thu|Fri|Sat|Sun)?,? \d{2} de [a-z]{3}', text, re.IGNORECASE)
        if date_match:
            # The weekday/comma part has no digits, so the first two-digit
            # run inside the match is the day of the month.
            day = re.search(r'\d{2}', date_match.group(0)).group(0)
            if year and month_number:
                current_date = f"{year}/{month_number}/{day}"
            else:
                current_date = f"{day}"
        if current_date is not None:
            # Date row or not, the row belongs to the most recent date seen.
            df.at[index, 'Date'] = current_date

        # Earnings
        earnings_match = re.search(r'€\s*([\d,.]+)', text)
        if earnings_match:
            df.at[index, 'Earnings'] = earnings_match.group(1).replace(',', '.')

        # Toll
        toll = _amount_before_keyword(text, toll_words)
        if toll is not None:
            df.at[index, 'Toll'] = toll

        # Service Fee (the amount must be preceded by the euro sign)
        service_fee = _amount_before_keyword(text, service_fee_words, prefix=r'€\s*')
        if service_fee is not None:
            df.at[index, 'ServiceFee'] = service_fee

        # Tip
        tip = _amount_before_keyword(text, tip_words)
        if tip is not None:
            df.at[index, 'Tip'] = tip

        # Start Time: "hh-mm", "hh.mm" or "hh*mm", rewritten as "hh:mm"
        start_time_match = re.search(r'(\d{2}-\d{2}|\d{2}\.\d{2}|\d{2}\*\d{2})', text)
        if start_time_match:
            df.at[index, 'StartTime'] = re.sub(r'[-.*]', ':', start_time_match.group(1))

        # Total Time, stored as "minutes:seconds"
        total_time_match = re.search(r'(\d+)\s*min\s*(\d+)\s*seg', text)
        if total_time_match:
            df.at[index, 'TotalTime'] = f"{total_time_match.group(1)}:{total_time_match.group(2)}"

        # Distance
        distance_match = re.search(r'([\d,.]+)\s*km', text)
        if distance_match:
            df.at[index, 'Distance'] = distance_match.group(1).replace(',', '.')

    return df

In [5]:
# Selecting files:
my_list = []  # Paths chosen by the user
add_files_to_list(my_list)  # Open the dialog; the selected paths are printed
dataframes = []  # One DataFrame per successfully read file
loaded_paths = []  # Paths that were read successfully, same order as `dataframes`

for filepath in my_list:
    try:
        df = pd.read_csv(filepath, encoding='utf-8')  # Use utf-8 encoding
    except FileNotFoundError:
        print(f"Error: File not found - {filepath}")
    except pd.errors.EmptyDataError:
        print(f"Error: Empty CSV file - {filepath}")
    except pd.errors.ParserError:
        print(f"Error: Parsing error in {filepath}. Check the file format.")
    except Exception as e:
        print(f"An unexpected error occurred while reading {filepath}: {e}")
    else:
        dataframes.append(df)
        loaded_paths.append(filepath)
        print(f"Successfully read {filepath}")

# The normalization cell pairs `dataframes` with `my_list` by position (zip),
# so files that failed to load must be dropped from `my_list`; otherwise every
# frame after a failure would be matched with the wrong filename.
my_list = loaded_paths

# Now 'dataframes' contains a list of DataFrames, one for each CSV file that
# was read, and 'my_list' holds the matching paths.

# You can access each DataFrame like this:
if dataframes:  # Check if the list is not empty
    print("\nFirst few rows of the first DataFrame:")
    print(dataframes[0].head())  # Print the first few rows of the first DataFrame
else:
    print("\nNo CSV files were successfully read.")

Files added:
C:/Users/renat/Documents/imgPdados-finance-uber/uber-2023-dezembro.csv
Successfully read C:/Users/renat/Documents/imgPdados-finance-uber/uber-2023-dezembro.csv

First few rows of the first DataFrame:
                                      Extracted Text
0                                    sex , 29 de dez
1                       € 2,60 1l-46 Taxa de serviço
2  €3,23 21-23 UberX 6 min 22 segundos 138 km 4 €...
3  €5,22 UberX Saver . 18 min 46 segundos 20-12 8...
4  €4,37 UberX Saver ' 18 min 1l segundos 19.45 6...


In [19]:
# Display settings first: they only affect how frames are printed below.
pd.set_option('display.width', 1000)  # Wide enough to keep a row on one line (adjust as needed)
pd.set_option('display.max_columns', None)  # Never elide columns

# Normalize a copy of every loaded frame; the file's base name is passed along
# so the year and month can be derived from it.
normalized_dataframes = [
    normalize_dataframe(frame.copy(), os.path.basename(source_path))
    for frame, source_path in zip(dataframes, my_list)
]

# Preview the first normalized frame (up to 40 rows), if there is one.
if len(normalized_dataframes) > 0:
    print(normalized_dataframes[0].head(40))

                                       Extracted Text        Date Earnings  Toll ServiceFee   Tip StartTime TotalTime Distance
0                                     sex , 29 de dez  2023/12/29     None  None       None  None      None      None     None
1                        € 2,60 1l-46 Taxa de serviço  2023/12/29     2.60  None       None  None      None      None     None
2   €3,23 21-23 UberX 6 min 22 segundos 138 km 4 €...  2023/12/29     3.23  None       None  0.50     21:23      6:22      138
3   €5,22 UberX Saver . 18 min 46 segundos 20-12 8...  2023/12/29     5.22  None       None  None     20:12     18:46     8.22
4   €4,37 UberX Saver ' 18 min 1l segundos 19.45 6...  2023/12/29     4.37  None       None  None     19:45      None     6.89
5   €0,00 19.37 UberX Saver 0 usuario cancelou rrá...  2023/12/29     0.00  None       None  None     19:37      None     None
6   €4,88 UberX Saver . 16 min 58 segundos 19.26 6...  2023/12/29     4.88  None       None  None     19:26    