### Helper Functions

#### This notebook provides the common Excel-style functions used in the transformations. It is called from other notebooks.

In [2]:
def function_name_decorator(func):
    """
    Wrap a function so that a message is printed before and after each call.

    Parameters:
    - func (callable): The function to trace.

    Returns:
    - callable: A wrapper that prints the start/end messages, forwards every
      argument to ``func`` and returns its result unchanged.
    """
    # Imported locally so the decorator does not rely on a notebook-level import.
    import functools

    # functools.wraps copies __name__, __doc__, etc. onto the wrapper so that
    # decorated helpers keep their own name and docstring (help(), tracebacks).
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        print(f"FUNCTION CALLED: {func.__name__}")
        result = func(*args, **kwargs)
        # Only reached when func returns normally; exceptions propagate as-is.
        print(f"FUNCTION EXECUTION ENDED: {func.__name__}")
        return result

    return wrapper

In [16]:
@function_name_decorator
def read_report_file(file_path):
    """
    Read a report file as CSV (falling back to Excel) and detect whether it has a header row.

    The first two rows are read to sniff the format and the header. A file is
    treated as having no header when any of its first-row labels is purely numeric.

    Parameters:
    - file_path (str): Path of the CSV or Excel file to read.

    Returns:
    - tuple: ``(df, has_no_header)``. ``df`` is the loaded DataFrame and
      ``has_no_header`` is True/False. When the file does not exist a warning
      is issued and ``("", "NA")`` is returned instead.
    """
    print(f"==READING IN THE FILE {file_path}")

    # Read only the first two rows of the CSV or EXCEL file.
    is_excel = False
    try:
        df_preview = pd.read_csv(file_path, nrows=2)
        print("Format: CSV")

    except ValueError:
        # pandas' CSV parsing/decoding errors are ValueError subclasses, so a
        # failure here is taken to mean the file is an Excel workbook.
        print("Format: Excel")
        is_excel = True
        df_preview = pd.read_excel(file_path, nrows=2, engine="openpyxl")

    except FileNotFoundError as e:
        print(f"\nFile with filename: {file_path} is not in the path.\nPython error: {e}")
        df_preview = None

    print("STEP 1: Checking if the file has a header or not")
    if df_preview is None:
        print(f"This error is as a result of the file with filename {file_path} not being found")
        warnings.warn(f"File with name {file_path} could not be read")
        return "", "NA"

    # Excel headers can be real numbers rather than strings; cast to str first so
    # the .str accessor works and numeric labels are recognised as data.
    header_labels = pd.Series(df_preview.columns).astype(str)
    has_no_header = bool(header_labels.str.isnumeric().any())

    def _load_full(**header_kwargs):
        # Re-use the format detected on the preview instead of sniffing again.
        if is_excel:
            return pd.read_excel(file_path, engine="openpyxl", **header_kwargs)
        return pd.read_csv(file_path, **header_kwargs)

    if has_no_header:
        print("STEP 2: File has no header")
        # header=None for both formats so the first data row is not consumed
        # as column names.
        df = _load_full(header=None)
    else:
        print(f"STEP 2: File has headers. They are: {df_preview.columns}")
        df = _load_full()

    return df, has_no_header

In [3]:
@function_name_decorator
def get_latest_folders(main_folder_path):
    """
    Return the entry of a directory whose name is the largest number, followed by "/".

    Entries whose names are not integers (for example ".ipynb_checkpoints")
    are ignored. The entry's own name is returned, so leading zeros are kept.

    Parameters:
    - main_folder_path (str): Directory whose entries are named with numbers.

    Returns:
    - str: Name of the entry with the highest numeric value, with a trailing "/".

    Raises:
    - ValueError: If the directory contains no numerically named entry.
    """
    print("==GETTING THE LATEST FOLDER PATH==")

    print("STEP 1: Creating a list for holding the various components of a file path")
    numbered_entries = []

    print(f"STEP 2: Looping through each folder in the path: {main_folder_path}. We extract the latest day folder")
    for entry_name in os.listdir(main_folder_path):
        try:
            numbered_entries.append((int(entry_name), entry_name))
        except ValueError:
            # Not a numbered entry; skip it rather than abort the whole scan.
            continue

    if not numbered_entries:
        raise ValueError(f"No numerically named folders found in: {main_folder_path}")

    # Compare on the numeric value only; computed once instead of per iteration.
    _, latest_folder = max(numbered_entries, key=lambda item: item[0])

    print(f"STEP 3: Latest folder is: {latest_folder}")
    return latest_folder + "/"

In [4]:
@function_name_decorator
def excel_delete(dframe, cols):
    """
    Mimic Excel's "delete column": drop columns and let the rest shift left.

    The dropped columns are replaced by filler columns appended on the right
    (every cell set to "shifted left"), and the original header labels are
    then re-applied in their original order.

    Parameters:
    - dframe (pd.DataFrame): Source DataFrame; it is not modified.
    - cols (list): Labels of the columns to delete.

    Returns:
    - pd.DataFrame: A new DataFrame with the same labels as the input, whose
      trailing columns hold the "shifted left" marker.
    """
    original_labels = list(dframe.columns)

    shifted = dframe.copy().drop(columns=cols)

    # One filler column per deleted column keeps the column count unchanged.
    for removed in cols:
        shifted[f"deleted{removed}"] = "shifted left"

    # Positional relabel: data that moved left now sits under the old headers.
    shifted.columns = original_labels
    return shifted

In [5]:
@function_name_decorator
def excel_move_column(dframe, moved_col, shifted_col):
    """
    Mimic Excel's column move: relocate one column's data and keep the header labels in place.

    Parameters:
    - dframe (pd.DataFrame): Source DataFrame; it is not modified.
    - moved_col (str): Label of the column whose data is moved.
    - shifted_col (str): Label of the column marking the target position.

    Returns:
    - pd.DataFrame: A new DataFrame with the data reordered and the original
      label sequence re-applied positionally.
    """
    working = dframe.copy()

    original_labels = list(working.columns)
    reordered_labels = list(original_labels)

    # The target position is taken before the moved label is removed.
    target_position = reordered_labels.index(shifted_col)
    reordered_labels.remove(moved_col)
    reordered_labels.insert(target_position, moved_col)

    working = working[reordered_labels]
    # Headers stay where they were; only the data underneath has moved.
    working.columns = original_labels
    return working

In [6]:
@function_name_decorator
def concat_cols(dframe, *cols):
    """
    Concatenate the given columns into the first completely empty column.

    Parameters:
    - dframe (pd.DataFrame): Source DataFrame; it is not modified.
    - cols (str): Labels of the columns to concatenate, in order.

    Returns:
    - tuple: ``(df, concat_col)`` where ``df`` is the modified copy and
      ``concat_col`` is the label of the column that received the result.

    Raises:
    - ValueError: If the DataFrame has no column whose values are all null.
    """
    df = dframe.copy()

    # The first column whose values are all null becomes the destination.
    for col in df.columns:
        if df[col].isnull().all():
            print(f"Found an empty column! Column {col}")
            concat_col = col
            break
    else:
        # Fail with a clear message instead of a NameError further down.
        raise ValueError("concat_cols needs a completely empty column to hold the result, but none was found")

    for chosen_col in cols:
        if df[concat_col].isnull().all():
            # Destination still empty: seed it with this column's values.
            df.loc[:, concat_col] = df[chosen_col]
        else:
            df.loc[:, concat_col] += df[chosen_col]

        print(f"Column {chosen_col} has been concatenated")
        print(df[concat_col])

    return df, concat_col

In [7]:
@function_name_decorator
def divide_column(dframe, divisor, *divided_cols):
    """
    Divide the given columns by a divisor, writing "-" where the value is zero.

    Parameters:
    - dframe (pd.DataFrame): Source DataFrame; it is not modified.
    - divisor (number): Value each column is divided by.
    - divided_cols (str): Labels of the columns to divide.

    Returns:
    - pd.DataFrame: A copy of the input with the listed columns replaced.
    """
    result = dframe.copy()

    for name in divided_cols:
        as_float = result[name].astype(float)
        # NOTE(review): mixing "-" with floats in np.where presumably yields
        # string values for the whole column - confirm callers expect that.
        result[name] = np.where(as_float == 0, "-", as_float / divisor)

    return result

In [8]:
@function_name_decorator
def excel_vlookup(maintable, lookuptable, main_col, lookup_col, value_col, destination_col):
    """
    Look values up in a second DataFrame and write them into the first free column.

    Starting at ``destination_col`` and moving right, the first column that is
    entirely null or entirely "shifted left" receives the looked-up values.
    If no such column exists the main table is returned unchanged.

    Parameters:
    - maintable (pd.DataFrame): The main DataFrame; it is not modified.
    - lookuptable (pd.DataFrame): The DataFrame searched for matches.
    - main_col (str): Column of the main DataFrame holding the keys.
    - lookup_col (str): Column of the lookup DataFrame holding the keys.
    - value_col (str): Column of the lookup DataFrame holding the values to fetch.
    - destination_col (str): Column of the main DataFrame where the search for a free column starts.

    Returns:
    - pd.DataFrame: A copy of the main DataFrame with the lookup values inserted.
    """
    main_table = maintable.copy()
    lookup_table = lookuptable.copy()

    # Keep only the lookup rows whose key occurs in the main table.
    wanted_keys = main_table[main_col].to_list()
    is_wanted = lookup_table[lookup_col].isin(wanted_keys)
    matches = lookup_table.loc[is_wanted, :][[lookup_col, value_col]]

    # key -> value mapping built from the matching rows.
    lookup_dict = matches.set_index(lookup_col).to_dict()[value_col]

    all_labels = list(main_table.columns)
    candidates = all_labels[all_labels.index(destination_col):]

    row_count = len(main_table)
    for col in candidates:
        shifted_mask = main_table[col] == "shifted left"
        if main_table[col].isnull().sum() == row_count or shifted_mask.sum() == row_count:
            main_table[col] = main_table[main_col].map(lookup_dict)
            print(f"Lookup values inserted in Column {col}")
            break

        print(f"Destination column chosen, Column {destination_col} is not empty. Searching for an empty column")

    return main_table


In [9]:
@function_name_decorator
def dataframe_update(dframe, updater_col, updated_col, overwrite=False):
    """
    Fill or overwrite one column of a DataFrame from another column or Series.

    "-" entries on the preferred side are treated as missing. Without
    ``overwrite`` the existing values of ``updated_col`` win and gaps are
    filled from the updater; with ``overwrite`` the updater's values win and
    its gaps are filled from the existing column.

    Parameters:
    - dframe (pd.DataFrame): Source DataFrame; it is not modified.
    - updater_col (str or pd.Series): Column label, or a Series, supplying the values.
    - updated_col (str): Label of the column to update.
    - overwrite (bool): If True, the updater takes precedence over existing values.

    Returns:
    - pd.DataFrame: The updated copy. It is returned unchanged when
      ``updater_col`` is neither a string nor a Series.
    """
    result = dframe.copy()

    given_as_label = isinstance(updater_col, str)
    if not given_as_label and not isinstance(updater_col, pd.Series):
        return result

    def updater_values():
        # Resolved lazily so columns are looked up in the same order as before.
        return result[updater_col] if given_as_label else updater_col

    def dashes_as_missing(series):
        return series.replace("-", np.nan)

    if overwrite:
        merged = dashes_as_missing(updater_values()).combine_first(result[updated_col])
    else:
        merged = dashes_as_missing(result[updated_col]).combine_first(updater_values())

    result.loc[:, updated_col] = merged
    return result


In [10]:
@function_name_decorator
def _generate_excel_columns(n):
    """
    Convert a 1-based column number into its Excel-style letter label.

    Parameters:
    - n (int): Column number (1 -> "A", 26 -> "Z", 27 -> "AA").

    Returns:
    - str: The letter label; an empty string when ``n`` is not positive.
    """
    label = ""
    remaining = n
    while remaining > 0:
        # Bijective base 26: shift by one so that 26 maps to "Z", not "A0".
        remaining, offset = divmod(remaining - 1, 26)
        label = chr(ord("A") + offset) + label
    return label



@function_name_decorator
def _create_inner_columns(dframe, num_cols, insertion_col):
    """
    Append zero-filled, letter-named columns to a copy of a DataFrame.

    Parameters:
    - dframe (pd.DataFrame): Source DataFrame; a copy is modified and returned.
    - num_cols (int): Number of columns to try to append.
    - insertion_col (str): Label looked up in the column list on every iteration after the first.

    Returns:
    - pd.DataFrame: The copy with the new columns set to 0.
    """
    df = dframe.copy()
    
    columns = list(df.columns)
    
    # New labels are generated for positions len(columns)+1 .. len(columns)+num_cols.
    for i, num in enumerate(range(len(columns)+1, len(columns) + num_cols + 1)):
        letter_col = _generate_excel_columns(num)
        columns.append(letter_col)
        
        if i > 0:
            # Advance insertion_col to the label that follows it in the list.
            # NOTE(review): this does not appear to influence which columns are
            # created, other than stopping the loop on IndexError - confirm intent.
            insertion_idx = columns.index(insertion_col)
            try:
                insertion_col = columns[insertion_idx + 1]
                
            except IndexError as e:
                print(f"Expected index error:\n{e}")
                break
            
        # An existing column with the same label would be overwritten with 0 here.
        df[letter_col] = 0
    #print("==END OF FUNCTION: CREATE INNER COLUMNS==\n")    
    return df



@function_name_decorator
def generate_inner_columns(dframe, num_col_spaces, insertion_col):
    """
    Insert zero-filled columns in front of a column, keeping header labels in place.

    The new columns are first appended by ``_create_inner_columns``, then moved
    to the position of ``insertion_col``. Finally the widened label sequence is
    re-applied positionally, so existing data shifts right under the headers.

    Parameters:
    - dframe (pd.DataFrame): Source DataFrame; it is not modified.
    - num_col_spaces (int): Number of columns to insert. Values of 0 or less add nothing.
    - insertion_col (str): Label of the column in front of which the new columns go.

    Returns:
    - pd.DataFrame: A new DataFrame containing the inserted columns.
    """
    inner_df = _create_inner_columns(dframe.copy(), num_col_spaces, insertion_col)

    columns = list(inner_df.columns)
    insertion_idx = columns.index(insertion_col)

    # Split by explicit position rather than columns[-n:], which selects every
    # column when n == 0 and used to corrupt the resulting order.
    split_at = len(columns) - num_col_spaces
    existing_cols = columns[:split_at]
    inserted_cols = columns[split_at:]

    new_order = existing_cols[:insertion_idx] + inserted_cols + existing_cols[insertion_idx:]

    new_df = inner_df[new_order]

    # Relabel positionally so the headers keep their original left-to-right order.
    new_df.columns = columns
    return new_df


In [5]:
@function_name_decorator
def convert_string_numbers(num, mute = False):
    """
    Convert a string representation of a number to a float.

    Thousands separators (",") are removed before conversion.

    Parameters:
    - num (str): The string representation of the number.
    - mute (bool): If True, suppress the success/failure messages.

    Returns:
    - Union[float, str]: The converted number, or "Not a number" when the
      value cannot be converted or is NaN.
    """
    cleaned = str(num).replace(",", "")

    try:
        converted = float(cleaned)
    except ValueError as e:
        if not mute:
            print(f"The string value you are trying to convert doesn't contain any numbers.\nPython error: {e}")
        return "Not a number"

    # float() parses every spelling of NaN ("nan", "NaN", "-nan", ...). NaN is
    # the only float that differs from itself, so this catches them all.
    if converted != converted:
        return "Not a number"

    if not mute:
        print(f"Number successfully converted. New format: {converted}")
    return converted


def _convert_string_numbers(num, mute = False):
    
    """
    Convert a string representation of a number to a float.

    Parameters:
    - num (str): The string representation of the number.

    Returns:
    - Union[float, str]: The converted number or a message if conversion fails.
    """
    print(f"CONVERTING THE STRING {num} TO A FLOAT NUMBER")
    num = str(num)
    
    num = num.replace(",", "")
    
    try:
        if num == "nan":
            return "Not a number"
        num = float(num)
        
    except ValueError as e:
        if mute:
            return "Not a number"
        else:
            print(f"The string value you are trying to convert doesn't contain any numbers.\nPython error: {e}")
            return "Not a number"
        
    else:
        if mute:
            return num
        else:
            print(f"Number successfully converted. New format: {num}")
            return num
   # print("==END OF FUNCTION: CONVERT STRING NUMBERS==\n")
    return num


In [12]:
@function_name_decorator
def _generate_alphabet_mapping(n):
    """
    Map the column positions 1..n to Excel-style letter labels.

    Positions above 26 continue as "AA", "AB", ... in the way spreadsheet
    columns do, instead of running into the punctuation characters that
    follow "Z" in the character table.

    Parameters:
    - n (int): Number of positions to label.

    Returns:
    - dict: ``{position: label}`` for every position from 1 to n; empty when n < 1.
    """
    alphabet_mapping = {}
    for position in range(1, n + 1):
        label = ""
        remaining = position
        while remaining > 0:
            # Bijective base 26: shift by one so that 26 maps to "Z".
            remaining, offset = divmod(remaining - 1, 26)
            label = chr(ord("A") + offset) + label
        alphabet_mapping[position] = label

    return alphabet_mapping


In [13]:
@function_name_decorator
def add_letter_columns_to_df(dframe):
    """
    Relabel the columns of a DataFrame with letters, in positional order.

    Parameters:
    - dframe (pd.DataFrame): Source DataFrame; it is not modified.

    Returns:
    - pd.DataFrame: A copy whose column labels are the values produced by
      ``_generate_alphabet_mapping`` for its number of columns.
    """
    relabelled = dframe.copy()

    letter_by_position = _generate_alphabet_mapping(len(relabelled.columns))
    relabelled.columns = list(letter_by_position.values())

    return relabelled


In [6]:
@function_name_decorator
def read_pdf(file_path, mute = False):
    """
    Extract the text of every page of a PDF file.

    Parameters:
    - file_path (str): Path of the PDF file.
    - mute (bool): If True, stay silent when the file does not exist.

    Returns:
    - str: The text of all pages joined together; an empty string when the
      file is not found.
    """
    import PyPDF2

    text = ""
    try:
        with open(file_path, 'rb') as file:
            print(file_path)
            pdf_reader = PyPDF2.PdfReader(file)

            # Walk the pages in order and collect their text.
            for page in pdf_reader.pages:
                text += page.extract_text()

    except FileNotFoundError as e:
        if not mute:
            print(f"\nAn error occurred.\nPython error: {e}")

    return text