In [1]:
# import statements
import pandas as pd
import numpy as np
import time

In [2]:
def clean_report(source_data,report,report_filename):
    '''Reads in billing report for month, and source data of division and department,
    then cleans it
    Input : 
    source_data(string) : data showing division and department
    report(string) : billing report from imagic
    report_filename(string) : name of cleaned output file
    
    Output :
    clean_report : cleaned billing month report for month
    
    '''
    # Read in billing report & source_data
    source_data = pd.read_excel(source_data,index_col=None)
    report = pd.read_excel(report, skiprows = 7,index_col=None)
    
    #remove unnamed columns in report
    report = report[report.columns[~report.columns.str.contains('Unnamed:')]]
    
    #list of unused columns
    unused_cols = ['%','Avg Cost','Cost_2','Difference','% Difference','%.1','Avg Duration']
    
    #drop  unused columns
    report.drop(columns= unused_cols,inplace=True)
    
    
    # Create New columns from Extension by striping extension from name
    report[['Number','Name']] = report['Extension'].str.extract(r'(\d+)\s*(.*)', expand=True)
    
    # drop Extension column
    report.drop(['Extension'],axis= 1,inplace=True)
    
    # rename Number to Extension
    report.rename(columns = {'Number':'Extension'}, inplace = True)
                       
    # fill null values in extension with zero
    report['Extension'] = report['Extension'].fillna(0)
                   
    # convert Extension column to appropriate datatype
    convert_dict = {'Extension': int} 
    report = report.astype(convert_dict) 
                   
    #Rename telephoneNumber as Extension to match report.
    source_data.rename(columns = {'telephoneNumber':'Extension'}, inplace = True)
                   
    # join reports df with the source_data df on the Extension
    merged_df = pd.merge(report, source_data, on ='Extension',how ='inner') 
                   
    # drop unused columns
    merged_df.drop(['Present Functional Name'],axis= 1,inplace=True)
                   
    # Change order of columns in dataframe
    report_clean = merged_df[['Extension','Name','Division','Department','Duration','Calls','Cost']]
                   
    # drop duplicate rows using extension
    report_clean = report_clean.drop_duplicates(subset='Extension', keep="first")
    
    # Output clean df
    report_clean.to_excel(report_filename + ".xlsx",sheet_name='main',index = False)
    
    # output null rows to dataframe with index
    null_values = report_clean[report_clean['Division'].isna()]
    null_values.to_excel(report_filename+ "_null_values.xlsx",sheet_name='null_values',index = False) 
                   

In [None]:
start = time.time()

# Run october billing report
#clean_report('source_data.xlsx','sept_2020.xls','Tucn_sept_2020');
#clean_report('source_data.xlsx','oct_2020_desk.xls','Tucn_oct_2020');
#clean_report('source_data.xlsx','nov_2020.xls','Tucn_nov_2020');
#clean_report('source_data.xlsx','dec_2020.xls','Tucn_dec_2020');

end = time.time()
print(end - start)


In [12]:
def clean_gsm(mtn,glo,report_filename):
    '''Reads in gsm billing reports for month, and source data of division and department,
    then cleans it
    Input : 
    mtn(string) : billing report path for mtn
    glo(string) : billing report path for glo
    
    Output :
    clean_report : cleaned & combined gsm billing month report for month
    
    '''
    # Read in billing reports
    mtn_bills = pd.read_excel(mtn,index_col=None)
    
    # read and merge glo excel sheets files
    workbook_url = pd.ExcelFile(glo)
    glo_bills = pd.concat(pd.read_excel(workbook_url, sheet_name=None), ignore_index=True)
    
    #remove unnamed columns in report
    mtn_bills = mtn_bills[mtn_bills.columns[~mtn_bills.columns.str.contains('Unnamed:')]]
    
    # list of mtn cols
    cols = list(mtn_bills.columns)
    
    # add new columns to match 
    glo_bills = glo_bills.reindex(columns=[*glo_bills.columns.tolist(), 'Name', 'Package', 'Previous Balance', 'Payment',
                                           'Adjustment and Discounts','Data Usage','Amount Due (NGN)'], fill_value=0)
    
    
    # create national calls col
    sum_column = glo_bills["CALLS_TO_GLO_GSM"] + glo_bills["CALLS_TO_OTHER_GSM_NETWORKS"] + glo_bills["CALLS_TO_GLO_BROAD_ACCESS"]
    glo_bills["National Calls/SMS"] = sum_column
    
    # renaming cols to match mtn
    glo_bills.rename(columns = {'MONTHLY_ACCESS':'Subscription and VAS','MSISDN':'Service ID',
                                'INTERNATIONAL_CALL':'International Call/SMS','ROAMING_USAGE':'Roaming Calls/SMS',
                                'TOTAL':'Total Charge'}, inplace = True)
    
    # Change order of columns in glo dataframe to match mtn format
    glo_bills = glo_bills[cols]
    
    # Merge gsm reports
    gsm = pd.concat([mtn_bills, glo_bills])
    
    # Output clean df
    gsm.to_excel(report_filename + ".xlsx",sheet_name='gsm',index = False)
    

In [13]:
# clean gsm oct
clean_gsm('mtn_oct.xlsx','glo_oct.xlsx','Gsm_oct_2020')
# clean gsm dec
clean_gsm('mtn_dec.xlsx','glo_dec.xlsx','Gsm_dec_2020')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


