<a href="https://colab.research.google.com/github/SamBrudell/report.clean/blob/V1/0.3.0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Imports used by this cell. A fresh Colab runtime has none of these names
# bound, so the cell has to bring them in itself before using them.
import re

import gdown
import pandas as pd
from google.colab import drive

# Mount Google Drive so results can be written under /content/drive
drive.mount('/content/drive')  # Will prompt for authentication

# Google Drive file ID of the source CSV (replace with your own file's ID)
file_id = '1pR8Jp7zOGV5OmWVa_5VuFBQlppK15Hi4'
url = f'https://drive.google.com/uc?id={file_id}'

# Download the file from Google Drive into the runtime's working directory
output_file_path = 'downloaded_file.csv'
gdown.download(url, output_file_path, quiet=False)
print(f"File downloaded to: {output_file_path}")

# Read the CSV file, choosing the encoding from the file's own bytes.
# ISO-8859-1 assigns a character to every possible byte, so decoding with it
# can never raise UnicodeDecodeError; it is therefore only usable as the LAST
# resort, not as the first attempt with other encodings queued behind it.
with open(output_file_path, 'rb') as raw_file:
    leading_bytes = raw_file.read(2)

if leading_bytes in (b'\xff\xfe', b'\xfe\xff'):
    # A UTF-16 byte-order mark (little- or big-endian) identifies the file.
    df = pd.read_csv(output_file_path, encoding='utf-16')
else:
    try:
        # 'utf-8-sig' also swallows an optional UTF-8 BOM so it does not end
        # up glued to the first column name.
        df = pd.read_csv(output_file_path, encoding='utf-8-sig')
    except UnicodeDecodeError:
        print("File is not valid UTF-8, trying 'ISO-8859-1' encoding...")
        df = pd.read_csv(output_file_path, encoding='ISO-8859-1')
print("File read successfully!")

# Report up front whether the 'Cost Centre' column is present; this is only a
# diagnostic message, processing continues either way.
if 'Cost Centre' in df.columns:
    print("'Cost Centre' column exists! Proceeding with splitting...")
else:
    print("Error: Column 'Cost Centre' not found in the dataset!")
    print("Available columns:", df.columns.tolist())  # Debugging aid

# Columns that are not needed in the output; any that are absent are skipped
columns_to_delete = [
    'Entity',
    'GL Date',
    'Intercompany',
    'Source',
    'Category',
    'Event Class',
    'Journal Batch',
    'Journal',
    'Currency',
    'Currency Amount',
    'Total Value',
]
# errors='ignore' makes drop() silently skip labels that are not in the frame
df = df.drop(columns=columns_to_delete, errors='ignore')

# Function to pull the first number out of a value's text representation
def extract_numeric_value(text):
    """Return the first decimal number found in ``str(text)`` as a float.

    The number is a run of digits, optionally followed by a dot and further
    digits.  Returns None when the text contains no digit at all.
    """
    found = re.search(r'(\d+\.?\d*)', str(text))
    if found is None:
        return None
    return float(found.group(1))

# If 'Account' column exists, keep only the rows whose account number is listed
if 'Account' in df.columns:
    # List of specific account numbers to keep
    keep_numbers = [
        1518, 2410, 2646, 2689, 2681, 2682, 2341, 3092, 2388, 2322, 2644, 2412,
        2391, 2841, 2688, 2852, 2385, 2321, 2384, 2352, 2683, 2735, 2405, 2377,
        2374, 2362, 2823, 2685, 2643, 2373, 2733, 2351, 2371, 3064, 2342, 2379,
        2731, 2851, 2762, 2642, 2411, 2425, 2361, 2858, 2418, 2842, 2421, 2684,
        2737, 2736, 2381, 2413, 2734, 2952, 2822, 2378, 2380, 2430, 2395, 1519,
        2404
    ]

    # Build the numeric view as a standalone Series rather than a scratch
    # column: nothing is written into the frame, so an input column that
    # happens to be called 'Account_numeric' is neither overwritten nor
    # dropped, and no in-place edit is made on a filtered result.
    account_numbers = df['Account'].apply(extract_numeric_value)

    # Keep only rows whose extracted account number is in keep_numbers;
    # rows with no number in 'Account' never match and are removed.
    df = df[account_numbers.isin(keep_numbers)]

# Column to split data by (update if the export uses a different header)
column_name = "Cost Centre"  # Make sure this matches exactly

# Ensure column exists before proceeding
if column_name not in df.columns:
    print(f"Error: Column '{column_name}' not found in the dataset!")
else:
    def sanitize_sheet_name(name):
        """Return ``str(name)`` reduced to a legal Excel worksheet name.

        Characters Excel rejects in sheet names (square brackets, colon,
        asterisk, question mark, slash, backslash), plus the other punctuation
        this script has always replaced (double quote, angle brackets, pipe),
        become underscores.  The result is cut to Excel's 31-character limit,
        does not begin or end with an apostrophe, and is never empty.
        """
        cleaned = re.sub(r'[\[\]\\/:*?"<>|]', '_', str(name))[:31]
        # Leading/trailing apostrophes are rejected by the writer as well;
        # fall back to a placeholder if nothing usable is left.
        return cleaned.strip("'") or '_'

    # Save filtered data into separate Excel sheets
    output_excel_path = "/content/drive/My Drive/split_data.xlsx"  # Save to Google Drive

    # Sheet names must be unique ignoring case, and two different values can
    # collapse to the same name after sanitising/truncation, so remember the
    # lower-cased names already written.
    used_sheet_names = set()

    with pd.ExcelWriter(output_excel_path, engine="xlsxwriter") as writer:
        # NOTE: groupby leaves out rows whose key is missing (NaN) by default,
        # so such rows appear in the CSV below but in no sheet.
        for value, subset in df.groupby(column_name):
            base_name = sanitize_sheet_name(value)
            sheet_name = base_name
            duplicate_index = 2
            while sheet_name.lower() in used_sheet_names:
                # Make room for the suffix so the name stays within 31 chars
                suffix = f"_{duplicate_index}"
                sheet_name = base_name[:31 - len(suffix)] + suffix
                duplicate_index += 1
            used_sheet_names.add(sheet_name.lower())
            subset.to_excel(writer, sheet_name=sheet_name, index=False)

    print(f"Excel file '{output_excel_path}' created successfully!")

# Save final filtered CSV file to Google Drive (Do not use ExcelWriter for CSV)
output_csv_path_filtered = "/content/drive/My Drive/filtered_file.csv"
df.to_csv(output_csv_path_filtered, index=False)
print(f"Filtered CSV file saved to Google Drive at: {output_csv_path_filtered}")