In [None]:
import os
import pandas as pd
import re
from difflib import get_close_matches

In [None]:
# Lookup table: romanised month name -> month number (1-12).
# Some months are listed under two spellings that map to the same number.
# NOTE(review): the numbering presumably follows the Bikram Sambat calendar
# (vaishaakh = 1 ... caitr = 12) -- confirm against the data source.
months = {
    "vaishaakh": 1,
    "jetth": 2, "jessth": 2,
    "asaadh": 3,
    "shraavnn": 4,
    "bhaadr": 5,
    "aashvin": 6, "asoj": 6,
    "kaartik": 7,
    "mangshir": 8, "maarg": 8,
    "pauss": 9,
    "maagh": 10,
    "phaagun": 11,
    "caitr": 12,
}

In [None]:
def _fy_start_year(dir_name):
    """Return the integer found in the second '-'-separated field of ``dir_name``.

    Returns ``None`` when the name has no '-' or when that field is not an
    integer, so stray entries in the data folder (hidden files, checkpoint
    folders, ...) can be skipped instead of crashing the cell.
    """
    parts = dir_name.split('-')
    if len(parts) < 2:
        return None
    try:
        return int(parts[1])
    except ValueError:
        return None


# Keep only the entries of ``data/`` whose second '-'-separated field is an
# integer greater than 2073, in sorted order.
# NOTE(review): that field is presumably the starting year of the fiscal year
# encoded in the folder name -- confirm against the folder naming scheme.
fy_dirs = sorted(
    name for name in os.listdir('data')
    if (_fy_start_year(name) or 0) > 2073
)
fy_dirs

In [None]:
def prepare_file_details(fy_dir):
    """List the monthly workbooks of one fiscal-year folder in processing order.

    Every file in ``data/<fy_dir>`` whose name has the form
    ``<4-digit year>-<month name>.xlsx`` is described by a dict with keys
    ``path``, ``year`` (string, as found in the file name), ``month``
    (number looked up in ``months`` via fuzzy matching) and ``fy``.
    Files that do not follow the naming pattern are ignored.

    Args:
        fy_dir: Name of a sub-folder of ``data``.

    Returns:
        list[dict]: File descriptions sorted by year and then by the month
        sequence 4, 5, ..., 12, 1, 2, 3.

    Raises:
        ValueError: If the month part of a matching file name is not close
            to any key of ``months``.
    """
    base_dir = os.path.join('data', fy_dir)
    file_details = []
    for file in os.listdir(base_dir):
        # The dot is escaped and the whole name must match, so names such as
        # "2074-asoj.xlsx.bak" or "2074-asojxxlsx" are no longer picked up.
        matches = re.fullmatch(r'(\d{4})-(\w+)\.xlsx', file)
        if not matches:
            continue

        year, month_name = matches.group(1), matches.group(2)
        # Keys of ``months`` are lower-case; normalise before fuzzy matching.
        month_match = get_close_matches(month_name.lower(), months.keys())
        if not month_match:
            raise ValueError("unable to parse month from title", month_name)

        file_details.append({
            'path': os.path.join(base_dir, file),
            'year': year,
            'month': months[month_match[0]],
            # Replace '-' by '/' and drop the first three characters.
            # NOTE(review): presumably strips a 3-character prefix such as
            # "FY-" from the folder name -- confirm.
            'fy': fy_dir.replace('-', '/')[3:]
        })

    # Month 3 was missing from this sequence, which made sorting raise
    # ValueError as soon as a file for month 3 was present.
    custom_month_order = [4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3]
    position = {month: rank for rank, month in enumerate(custom_month_order)}
    file_details.sort(key=lambda item: (item['year'], position[item['month']]))
    return file_details

In [None]:
def get_df(path: str):
    """Read the data table of a workbook, skipping leading non-header rows.

    The sheet at position 5 is read; when it yields exactly 5 columns the
    sheet at position 4 is read instead (with the same header row). The
    header row index starts at 0 and is advanced one row at a time until no
    column label starts with "Unnamed" (the label pandas gives to blank
    header cells).

    Args:
        path: Path of the Excel workbook.

    Returns:
        pd.DataFrame: The first parse whose column labels contain no
        "Unnamed..." entries.
    """
    header_row = 0
    while True:
        frame = pd.read_excel(path, sheet_name=5, header=header_row)
        if len(frame.columns) == 5:
            frame = pd.read_excel(path, sheet_name=4, header=header_row)

        unnamed_count = frame.columns.str.contains('^Unnamed').sum()
        # Written as "not (count > 0)" so a NaN count ends the loop, too.
        if not (unnamed_count > 0):
            return frame
        header_row += 1

In [None]:
def filter_rows(df: pd.DataFrame, prefix: str = '87'):
    """Keep the rows whose ``hscode``, converted with ``str()``, starts with ``prefix``.

    Args:
        df: Frame with an ``hscode`` column (any dtype; each value is
            converted with ``str()`` before the comparison).
        prefix: Leading characters to select. Defaults to ``'87'``, the
            value that used to be hard-coded.

    Returns:
        pd.DataFrame: The matching rows of ``df`` with their original index.
    """
    keep = df['hscode'].apply(lambda code: str(code).startswith(prefix))
    return df[keep]

In [None]:
def _subtract_previous(current, previous, cumulated_cols):
    """Return ``current`` with ``cumulated_cols`` reduced by the matching rows of ``previous``.

    Rows are matched on ``description``; for repeated descriptions only the
    first row of each frame is kept. Rows missing on one side are treated
    as 0 in the subtraction, and only the rows of ``current`` are returned.

    NOTE(review): because duplicated descriptions are dropped, rows that
    share a description are lost for every file after the first one --
    confirm that ``description`` is the intended key.
    """
    current_by_desc = current.set_index('description')
    previous_by_desc = previous.set_index('description')

    # Drop duplicate values from the index so the frames can be aligned.
    current_by_desc = current_by_desc[~current_by_desc.index.duplicated(keep='first')].copy()
    previous_by_desc = previous_by_desc[~previous_by_desc.index.duplicated(keep='first')]

    current_by_desc[cumulated_cols] = current_by_desc[cumulated_cols].subtract(
        previous_by_desc[cumulated_cols], fill_value=0
    )
    return current_by_desc.reset_index()


def merge_monthwise_files(file_details):
    """Combine the workbooks of ``file_details`` into one frame of per-file differences.

    Each file is loaded with ``get_df`` and its six columns are renamed to
    ``hscode, description, unit, quantity, value, revenue``. For every file
    after the first, the ``quantity``/``value``/``revenue`` figures of the
    preceding file are subtracted. The rows are then passed through
    ``filter_rows``, tagged with the file's ``month`` and ``fy``, stacked,
    and rows containing missing values are dropped.

    Each workbook is read exactly once; the previous file's frame is kept
    in memory instead of being parsed again.

    Args:
        file_details: Ordered list of dicts with keys ``path``, ``month``
            and ``fy``.

    Returns:
        pd.DataFrame: Columns ``fy, month, hscode, description, unit,
        quantity, value, revenue``. Empty (with those columns) when
        ``file_details`` is empty.
    """
    cumulated_cols = ['quantity', 'value', 'revenue']
    raw_cols = ['hscode', 'description', 'unit', *cumulated_cols]
    output_cols = ['fy', 'month', *raw_cols]

    dfs = []
    previous_df = None
    for file_info in file_details:
        print('Processing file:', file_info['path'])
        current_df = get_df(file_info['path'])
        current_df.columns = raw_cols

        if previous_df is None:
            monthly_df = current_df
        else:
            monthly_df = _subtract_previous(current_df, previous_df, cumulated_cols)
        # Remember the untouched frame: the next file is subtracted from it.
        previous_df = current_df

        # Copy so that adding columns never touches the remembered frame.
        monthly_df = filter_rows(monthly_df).copy()
        monthly_df['month'] = file_info['month']
        monthly_df['fy'] = file_info['fy']
        dfs.append(monthly_df)

    if not dfs:
        # pd.concat raises on an empty list; return an empty, well-formed frame.
        return pd.DataFrame(columns=output_cols)

    merged_df = pd.concat(dfs, ignore_index=True).dropna()
    return merged_df[output_cols]

In [11]:
def remove_negatives(df: pd.DataFrame):
    """Return a cleaned copy of ``df`` without negative quantity/value/revenue.

    A row is kept only if ``quantity``, ``value`` and ``revenue`` are all
    greater than -1 (a missing value fails this comparison) and no column of
    the row is missing. Remaining values between -1 and 0 are raised to 0.

    The input frame is left untouched; earlier versions modified it in place.

    NOTE(review): the ``> -1`` threshold combined with the clamp at 0 means
    small negatives are zeroed while values <= -1 drop the row -- confirm
    this is intended rather than ``>= 0``.

    Args:
        df: Frame with numeric ``quantity``, ``value`` and ``revenue`` columns.

    Returns:
        pd.DataFrame: New frame holding the surviving rows with their
        original index labels.
    """
    numeric_cols = ['quantity', 'value', 'revenue']

    # NaN compares as False, so rows with a missing figure are excluded here.
    above_threshold = (df[numeric_cols] > -1).all(axis=1)
    cleaned = df[above_threshold].dropna().copy()

    cleaned[numeric_cols] = cleaned[numeric_cols].clip(lower=0)
    return cleaned

In [None]:
# Build one merged frame per fiscal-year folder listed in ``fy_dirs``.
yearly_dfs = []
for fy_dir in fy_dirs:
    file_details = prepare_file_details(fy_dir)
    yearly_df = merge_monthwise_files(file_details)
    yearly_dfs.append(yearly_df)

# Stack all years, convert every hscode with int(), and write the result to disk.
final_df = pd.concat(yearly_dfs, ignore_index=True)
final_df = final_df.assign(hscode=final_df['hscode'].apply(int))
final_df.to_csv('extracted.csv', index=False)

In [12]:
# Reload the extracted table from disk, pass it through remove_negatives,
# and overwrite the same CSV file with the result (no index column).
negative_df = pd.read_csv('extracted.csv')
positive_df = remove_negatives(negative_df)
positive_df.to_csv('extracted.csv', index=False)