In [1]:
import pandas as pd
import os
from modules.Date2 import find_starting_point, find_specific_date, convert_date
from modules.variables2 import *
from modules.Data_Quality import *
from modules.Cost_centre_structure import *

In [2]:
end_month = 'Apr'
end_year = '2023/24'

files_info =[]
dataframes = []

sheet_name = 'Leeds'
root = './'
input_folder = root+'Input files'
output_folder = root+'Prepared Files/'
combined_folder = root+'Combined files/'
directory = os.fsencode(input_folder)
num = 0

In [5]:
def process_file(file_path, division):
    global num
    xls = pd.ExcelFile(file_path)
    subregion_dict = division_to_subregion.get(division)

    if not subregion_dict:
        print(f"No subregion dictionary found for division {division}")
        return

    for sheet_name in xls.sheet_names:
        if any(sheet_name in cost_centres for cost_centres in subregion_dict.values()):
            num += 1
            # Load the data from the sheet, specifying no header to properly identify the row
            data = pd.read_excel(file_path, sheet_name=sheet_name, header=None)
            start_month, start_year, row, start_col = find_starting_point(data)
            end_row, end_col = find_specific_date(data, end_month, end_year)
            dates = convert_date(data, row, start_col, end_col)
            print(division, sheet_name)
            extracted_data = extract_var_columns(data, start_col, end_col)
            extracted_data, region, division, cost_centre = get_metadata(extracted_data, division, sheet_name)
            final_df = pd.DataFrame({'Date': dates})
            final_df = pd.concat([final_df, extracted_data.reset_index(drop=True)], axis=1)
            num_col = len(final_df.columns)
            has_missing, rows_with_missing = check_missing_values(final_df, exclude_column)
            has_secured_missing = check_missing_secured(final_df)
            duplicated = check_duplicates(final_df)
            incorrect_relationship = check_relationships(final_df)

            var_name = f"{os.path.basename(file_path)}_{sheet_name}_missing"
            globals()[var_name] = rows_with_missing

            file_info = {'file_name': os.path.basename(file_path).split('.')[0],
                         'num': num,
                         'region': region,
                         'division': division,
                         'cost_centre': cost_centre,
                         'num_col': num_col,
                         'Start month': start_month,
                         'Start year': start_year,
                         'row': row,
                         'End column': end_col,
                         'missing': has_missing,
                         'secured missing': has_secured_missing,
                         'duplicated': duplicated,
                         'incorrect_relationship': incorrect_relationship}

            files_info.append(file_info)
            dataframes.append(final_df)
            output_file = f"{division}_{sheet_name.replace(' ', '_')}"
            prepared_file_path = os.path.join(output_folder, f'{output_file}_ok.csv')
            final_df.to_csv(prepared_file_path, index=False)

In [6]:
# Main loop to process all files
for file in os.listdir(directory):
    filename = os.fsdecode(file)
    if filename.endswith(".xlsx") or filename.endswith(".xls") or filename.endswith(".xlsm"):
        file_path = os.path.join(input_folder, filename)

        # Extract division from filename
        base_name = os.path.splitext(os.path.basename(file_path))[0]
        name_parts = base_name.split('-')
        if len(name_parts) > 1:
            division = name_parts[1].strip().lower()
            process_file(file_path, division)

files_info_df = pd.DataFrame(files_info)
files_info_df.to_csv('files_info.csv', index=False)

advisory AFM
advisory Pro
advisory R&H
advisory NZ
advisory Proc
advisory Disp
advisory SHQUK
advisory SHQTel
advisory Suiko
advisory AdvCo
amcl Group
amcl UK&I
amcl AMA
amcl Vertex
amcl IES
cost management Edinburgh
cost management Glasgow
cost management Belfast
cost management NorthEast
cost management Leeds
cost management NorthWest
cost management Sheffield
cost management Birmingham
cost management Nottingham
cost management Bristol
cost management Cambridge
cost management CLGov
cost management HE
cost management Occupier
cost management HiTech
cost management Developer
cost management alinea
cost management Co
infrastructure CM Scotland
infrastructure PM Scotland
infrastructure PMO Scotland
infrastructure CM NW
infrastructure PM NW
infrastructure PMO NW
infrastructure Def Nth
infrastructure CM YNE
infrastructure PM YNE
infrastructure PMO YNE
infrastructure CM Midlands
infrastructure PM Midlands
infrastructure PMO Midlands
infrastructure CM SE
infrastructure PM SE
infrastructure

In [7]:
len(dataframes)

79

In [14]:
excluded_divisions = ['amcl', 'advisory']  # Add the names of divisions to exclude
excluded_cost_centres = ['PM_RE_Digital', 'CM_alinea', 'INF_Def_Nth', 'INF_Def_SE','INF_T&U_SW',
                         'INF_Def_CM_SW','INF_Def_P3M_SW', 'INF_Digital', 'PM_Pcon_Sth', 'PM_Co']  # Add the names of cost centres to exclude

# Filter dataframes to exclude problematic divisions and cost centres
filtered_dataframes = []
for df in dataframes:
    if df['Division'].iloc[0].lower() not in excluded_divisions and df['Cost_Centre'].iloc[0] not in excluded_cost_centres:
        filtered_dataframes.append(df)

In [15]:
len(filtered_dataframes)

54

In [16]:
combined_df = pd.concat(filtered_dataframes, ignore_index=True)

# Save the combined dataframe to a CSV file
combined_csv_path = combined_folder + 'combined_file_1.csv'
combined_df.to_csv(combined_csv_path, index=True)

excluded_dataframes = [df for df in dataframes if df['Division'].iloc[0] in excluded_divisions or df['Cost_Centre'].iloc[0] in excluded_cost_centres]
for df in excluded_dataframes:
    print(f"Excluded Division: {df['Division'].iloc[0]}, Cost Centre: {df['Cost_Centre'].iloc[0]}")

Excluded Division: Cost management, Cost Centre: CM_alinea
Excluded Division: Infrastructure, Cost Centre: INF_Def_Nth
Excluded Division: Infrastructure, Cost Centre: INF_Def_SE
Excluded Division: Infrastructure, Cost Centre: INF_T&U_SW
Excluded Division: Infrastructure, Cost Centre: INF_Def_CM_SW
Excluded Division: Infrastructure, Cost Centre: INF_Def_P3M_SW
Excluded Division: Infrastructure, Cost Centre: INF_Digital
Excluded Division: Project management, Cost Centre: PM_Pcon_Sth
Excluded Division: Project management, Cost Centre: PM_RE_Digital
Excluded Division: Project management, Cost Centre: PM_Co
