In [2]:
import os
import pandas as pd
import re
from difflib import get_close_matches

In [3]:
# Romanized month spellings grouped by month number (1-12, in list order).
# A group with more than one entry lists alternative spellings that map to
# the same number. Presumably these are Bikram Sambat month names as they
# appear in the workbook file names -- TODO confirm against the data folder.
_MONTH_SPELLINGS = [
    ('vaishaakh',),
    ('jessth',),
    ('asaadh',),
    ('shraavnn',),
    ('bhaadr',),
    ('aashvin', 'asoj'),
    ('kaartik',),
    ('mangshir', 'maargsmm'),
    ('pauss',),
    ('maagh',),
    ('phaagun',),
    ('caitr',),
]

# Lookup table: spelling -> month number.
months = {
    spelling: number
    for number, spellings in enumerate(_MONTH_SPELLINGS, start=1)
    for spelling in spellings
}

In [4]:
# Discover the monthly workbooks in `base_dir` and order them chronologically.
# Expected file name shape: "<4-digit year>-<month name>.xlsx".
base_dir = 'data/fy-2077-78'
files = os.listdir(base_dir)
file_details = []
for file in files:
    # fullmatch + escaped dot: the previous `re.match` with a bare `.` also
    # accepted names such as "2077-asaadh.xlsx.bak" or "2077-asaadh_xlsx".
    matches = re.fullmatch(r'(\d{4})-(\w+)\.xlsx', file)
    if not matches:
        # Anything that is not a monthly workbook is silently skipped.
        continue

    year, month_name = matches.groups()
    # Fuzzy lookup tolerates small spelling differences in the file name.
    # The keys of `months` are lowercase, so normalise the case first.
    month_match = get_close_matches(month_name.lower(), months.keys())
    if not month_match:
        raise ValueError("unable to parse month from title", month_name)
    month = months[month_match[0]]

    file_details.append({
        'path': os.path.join(base_dir, file),
        'year': year,    # kept as the 4-digit string taken from the file name
        'month': month,  # month number looked up in `months`
    })

# `year` is a fixed-width digit string, so sorting it as text orders the
# same way as sorting it numerically.
file_details = sorted(file_details, key=lambda x: (x['year'], x['month']))

In [5]:
def contains(x: str, desired=('electric',)):
    """Tell whether `x` is a string containing any of the `desired` keywords.

    The comparison is case-insensitive. Non-string values (e.g. the NaN that
    pandas uses for empty cells) never match, which makes the function safe
    to use with ``Series.apply`` on a column that has missing values.

    Parameters
    ----------
    x : Any
        Value to inspect; only ``str`` instances (including subclasses)
        can match.
    desired : str or iterable of str, default ('electric',)
        Keyword(s) to look for as substrings of `x`.

    Returns
    -------
    bool
        True if at least one keyword occurs in `x`, otherwise False.
    """
    # isinstance (rather than an exact type comparison) also accepts str
    # subclasses such as numpy.str_.
    if not isinstance(x, str):
        return False
    if isinstance(desired, str):
        # A bare string would otherwise be iterated character by character.
        desired = (desired,)
    lowered = x.lower()
    return any(word.lower() in lowered for word in desired)

In [6]:
def get_df(path: str, sheet_name=5):
    """Read one sheet of an Excel workbook, locating its header row.

    The sheet is parsed repeatedly with ``header=0, 1, 2, ...`` until none of
    the resulting column labels starts with "Unnamed". The first parse that
    satisfies this is returned. There is no upper bound on the number of
    header rows tried.

    Parameters
    ----------
    path : str
        Path of the workbook, passed to ``pd.read_excel``.
    sheet_name : int or str, default 5
        Sheet to read, passed to ``pd.read_excel`` unchanged.

    Returns
    -------
    pandas.DataFrame
        The sheet parsed with the first header row that produced no
        "Unnamed..." column labels.
    """
    header_row = 0
    while True:
        df = pd.read_excel(path, sheet_name=sheet_name, header=header_row)
        has_placeholder = df.columns.str.contains('^Unnamed').sum() > 0
        if not has_placeholder:
            return df
        # At least one label is still an "Unnamed..." placeholder:
        # try the next row as the header.
        header_row += 1

In [15]:
# Columns handled as running totals: for every workbook after the first,
# the previous workbook's values are subtracted from them.
cumulated_cols = ['Quantity', 'Imports_Value','Imports_Revenue']


def _monthly_from_cumulative(current, previous, cols, key='Description'):
    """Return `current` with `cols` replaced by `current - previous`.

    Both frames are indexed by `key`, and only the first row per key value is
    kept so that the subtraction aligns one-to-one. Keys missing from
    `previous` are treated as 0 there (``fill_value=0``). Keys that exist
    only in `previous` do not appear in the result. `key` is returned as a
    regular (leading) column again.

    NOTE(review): rows sharing a `key` value are collapsed to the first one;
    if distinct items can share a description this drops data -- confirm.
    """
    current_by_key = current.set_index(key)
    # .copy() so the column assignment below works on an independent frame
    # instead of a slice (avoids SettingWithCopyWarning).
    current_by_key = current_by_key[~current_by_key.index.duplicated(keep='first')].copy()

    previous_by_key = previous.set_index(key)
    previous_by_key = previous_by_key[~previous_by_key.index.duplicated(keep='first')]

    # The assignment aligns on current_by_key's index, so keys present only
    # in the previous frame are discarded here.
    current_by_key[cols] = current_by_key[cols].subtract(previous_by_key[cols], fill_value=0)
    return current_by_key.reset_index()


dfs = []
raw_df = None
for i, file_info in enumerate(file_details):
    # Carry the untouched frame of the preceding file forward instead of
    # parsing that workbook a second time.
    prev_df, raw_df = raw_df, get_df(file_info['path'])

    if i > 0:
        df = _monthly_from_cumulative(raw_df, prev_df, cumulated_cols)
    else:
        # The first file is used as-is (nothing to subtract).
        # NOTE(review): unlike later files, duplicate descriptions are NOT
        # dropped here -- confirm whether that asymmetry is intended.
        df = raw_df

    # Keep only the rows whose description matches the keyword filter.
    df = df[df['Description'].apply(contains)].copy()

    df['year'] = file_info['year']
    df['month'] = file_info['month']
    dfs.append(df)

if not dfs:
    raise ValueError("no input workbooks were found; nothing to merge")

merged_df = pd.concat(dfs, ignore_index=True)
merged_df.to_csv('test.csv', index=False)