In [2]:
import os
import pandas as pd
import re
from difflib import get_close_matches

In [67]:
# Month spellings that workbook file names are fuzzy-matched against,
# mapped to a month number (1-12). Variant spellings of one month share
# the same number.
# NOTE(review): these look like romanized Nepali month names with
# vaishaakh as month 1 - confirm against the data source.
months = dict(
    vaishaakh=1,
    jetth=2,
    jessth=2,
    asaadh=3,
    shraavnn=4,
    bhaadr=5,
    aashvin=6,
    asoj=6,
    kaartik=7,
    mangshir=8,
    maargsmm=8,
    pauss=9,
    maagh=10,
    phaagun=11,
    caitr=12,
)

In [80]:
base_dir = 'data/fy-2079-80'
files = os.listdir(base_dir)

# Collect one record per monthly workbook found in base_dir.
# Workbooks are expected to be named "<4-digit year>-<month name>.xlsx".
file_details = []
for file in files:
    # The dot is escaped and the whole name must match, so look-alikes such as
    # "2079-asoj.xlsx.bak" or "2079-asojXxlsx" are skipped instead of parsed.
    matches = re.fullmatch(r'(\d{4})-(\w+)\.xlsx', file)
    if not matches:
        continue
    year, month_name = matches.group(1), matches.group(2)

    # Spellings vary between files, so resolve the name to the closest key of
    # `months`. Every key is lower case, hence the lower() before comparing.
    month_match = get_close_matches(month_name.lower(), months.keys(), n=1)
    if not month_match:
        raise ValueError("unable to parse month from title", month_name)

    file_details.append({
        'path': os.path.join(base_dir, file),
        'year': year,  # kept as the string captured from the file name
        'month': months[month_match[0]],
    })

# Order by year, then by month number within the year.
file_details = sorted(file_details, key=lambda x: (x['year'], x['month']))

In [81]:
def get_df(path: str, sheet_name=5, max_header_rows: int = 50) -> pd.DataFrame:
    """Read a worksheet, skipping leading rows until a complete header row is found.

    The sheet is read with ``header=0``, then ``header=1``, and so on, until
    none of the resulting column labels start with ``Unnamed`` (the
    placeholder pandas assigns when a header cell is empty).

    Args:
        path: Location of the Excel workbook.
        sheet_name: Sheet to read, passed unchanged to ``pd.read_excel``.
            Defaults to 5 (integers are zero-based sheet positions).
        max_header_rows: Number of candidate header rows to try before
            giving up. Each attempt re-reads the sheet, so this also bounds
            the cost of a workbook that has no clean header row.

    Returns:
        The DataFrame produced by the first header row without ``Unnamed`` labels.

    Raises:
        ValueError: If no such header row is found within ``max_header_rows``
            attempts.
    """
    for header_row in range(max_header_rows):
        df = pd.read_excel(path, sheet_name=sheet_name, header=header_row)
        # str() keeps the check working when a label is not a string
        # (e.g. a numeric header cell).
        if not any(str(label).startswith('Unnamed') for label in df.columns):
            return df
    raise ValueError(
        f"no header row without 'Unnamed' columns in the first "
        f"{max_header_rows} rows of sheet {sheet_name!r} in {path}"
    )

In [84]:
def filter_rows(df: pd.DataFrame, categories=None):
    """Print and return the rows whose first-column value starts with each category code.

    Args:
        df: Frame whose first column holds the codes to match. Each value is
            converted with ``str()`` before the prefix test, so numeric codes
            such as ``87112011.0`` still match the prefix ``'8711'``.
        categories: Optional mapping of category name to code prefix. When
            omitted, the built-in motorcycle / public vehicle / car /
            induction prefixes are used.

    Returns:
        A dict mapping each category name to the matching slice of ``df``,
        in the same order as ``categories``. The same slices are printed.
    """
    if categories is None:
        categories = {
            'motorcycles': '8711',
            'public_vehicle': '8702',
            'cars': '8703',
            'induction': '8516'
        }

    # Convert the code column to text once instead of once per category.
    code_text = [str(value) for value in df[df.columns[0]]]

    selected = {}
    for category, code in categories.items():
        print('-------------------------')
        print(category)
        # An explicit boolean Series on df's own index also behaves as a mask
        # when the frame is empty.
        mask = pd.Series(
            [text.startswith(code) for text in code_text],
            index=df.index,
            dtype=bool,
        )
        filtered = df[mask]
        print(filtered)
        selected[category] = filtered
    return selected

In [85]:
# Build one frame per monthly workbook, in the order given by file_details.
# For every file after the first, the columns in cumulated_cols have the
# previous file's values subtracted from them, i.e. they are treated as
# running totals that need to be turned into per-file deltas.
cumulated_cols = ['Quantity', 'Imports_Value','Imports_Revenue']
dfs = []
for i, file_info in enumerate(file_details):
    df = get_df(file_info['path'])

    if i > 0:
        # The previous workbook is loaded again from disk on every iteration.
        prev_df = get_df(file_details[i-1]['path'])
        # Rows of the two files are paired up by their 'Description' text.
        df_temp = df.set_index('Description')
        prev_df_temp = prev_df.set_index('Description')

        # Drop duplicate values from the index
        # Only the first row of each repeated Description is kept, so later
        # rows sharing that text are absent from the result.
        # NOTE(review): if descriptions are not unique per product, this loses
        # rows - confirm whether another column (e.g. the code column) should
        # be the key instead.
        df_temp = df_temp[~df_temp.index.duplicated(keep='first')]
        prev_df_temp = prev_df_temp[~prev_df_temp.index.duplicated(keep='first')]

        # fill_value=0 treats a Description missing from one side as 0, so a
        # row that is new in this file keeps its own values.
        df_temp[cumulated_cols] = df_temp[cumulated_cols].subtract(prev_df_temp[cumulated_cols], fill_value=0)
        df_temp.reset_index(inplace=True)
        df = df_temp

    # Prints the rows of each category for inspection.
    filter_rows(df)

    # NOTE(review): this unconditional break ends the loop after the first
    # file. The i > 0 branch above is therefore never taken, the statements
    # below are unreachable, and dfs stays empty. Presumably left in while
    # debugging - confirm before relying on dfs.
    break
    # df = df[df.apply(contains)]
    df = df.copy()

    # Tag every row with the year and month number parsed from the file name.
    df['year'] = file_info['year']
    df['month'] = file_info['month']
    dfs.append(df)

# merged_df = pd.concat(dfs, ignore_index=True)
# merged_df.to_csv('test.csv', index=False)

-------------------------
motorcycles
          HSCode                                        Description Unit  \
3408  87112011.0  Unassembled Motorcycles with piston engine of ...  PCS   
3409  87112012.0  Unassembled Motorcycles with piston engine of ...  PCS   
3410  87112019.0  Unassembled Motorcycles with piston engine of ...  PCS   
3411  87112091.0  Motorcycles with piston engine of capacity exc...  PCS   
3412  87112092.0  Motorcycles with piston engine of capacity exc...  PCS   
3413  87113090.0  Motorcycle wiht piston engine exceeding 250cc ...  PCS   
3414  87116010.0           Cycle with electric motor for propulsion  PCS   
3415  87116090.0  Others (except cycle) motorcycle & scooter wit...  PCS   

      Quantity  Imports_Value  Imports_Revenue  
3408     911.0   7.717600e+04        56951.761  
3409    2324.0   2.512709e+05       186560.128  
3410     675.0   9.092399e+04        93445.283  
3411   13392.0   1.051124e+06      1286230.075  
3412    1382.0   1.778423e+05   