In [1]:
import pandas as pd
import numpy as np
import polars as pl
import os
pd.set_option('display.max_columns',None)

#### If We Have Multiple Master File Then First Append Them By Asking Path of Each

In [2]:
#### If We Have Multiple Master File Then First Append Them By Asking Path of Each
def append_files(n: int):
    collect_df = []
    for i in range(n):
        while True:
            _format = input('Enter the File Format (.excel/.csv): ').lower().strip()

            if not _format:
                print('Format Input Cannot Be Blank')
            elif _format not in ['.csv', '.excel']:
                print('Invalid File Format Provided, please choose (.excel/.csv)')
            else:
                file_path = input('Enter the Absolute Path Of the File: ').strip()

                if not file_path:
                    print("File Path Cannot Be Empty")
                else:
                    try:
                        if _format == '.csv':
                            df=pd.read_csv(file_path)
                        elif _format == '.excel':
                            df=pd.read_excel(file_path)

                        # Assuming `collect_df` is a list to store DataFrames
                        collect_df.append(df)
                        break
                    except Exception as e:
                        print(f'{e} Error Raised While Opening File')
            
                
                
            

    if collect_df:
        master_df = pd.DataFrame(columns=collect_df[0].columns)
        for df in collect_df:
            master_df = pd.concat([master_df, df], axis=0)
        print("All the Data Is Appended In Master DF: ")
        return master_df
    else:
        print('No Valid Files were Provided. ')

#### Here We Are Now taking input of our mapping column from Users

In [3]:
#### Here We Are Now taking input of our mapping column from Users
def creating_mapping(df,mapped_columns):
    columns_we_want=[]
    mapping_dict={}
    print(f'We Will Be Creating Mapping For {mapped_columns.keys()}, Enter None If We Dont Have A Mapping Column For A Specific Column')
    for column in mapped_columns:
        #   ----------------------------if column name is description type--------------------------------
        if column=='Description':
            while True:
                while True:
                    invalid=[]
                    value=input(f'Enter the mapping column for {column}:  ')
                        
                    values=[col.strip() for col in value.split(',')]
                    for value in values:
                        if value not in df.columns:
                            invalid.append(value)
                    if len(invalid)!=0:
                        print(f'{invalid} not present in master file')
                        print(f'Choose From {df.columns}')
                    else:
                        break
                mapping_dict[column]=values
                columns_we_want.append(values)
                break

    #   --------------------------------if column name is not description type--------------------------------      
        else:
            while True:
                value=input(f'Enter the mapping column for {column},(None if no mapping column): ')
                if not value.strip():
                    print('Column Name Cannot Be Empty')
                    print(f'Choose from {df.columns}')
                else:
                    if value.lower().strip()=='none':
                        mapping_dict[column]=None
                        break
                    
                    elif value not in df.columns:
                        print(f'{value} not present in Master file')
                        print(f'Choose From {df.columns}')

                    else:
                        mapping_dict[column]=value
                        columns_we_want.append(value)
                        break

                    
    cols_we_want=[]
    for column in columns_we_want:
        if isinstance(column,list):
            for col in column:
                cols_we_want.append(col)
        else:
            cols_we_want.append(column)
        
        
    return mapping_dict,cols_we_want

#### Beginning the analysis of the columns

In [4]:
#### Beginning the analysis of the columns
def begin_analysis(mapping_dict,cols_we_want,df,total_transactions):
    result={}
    for key in mapping_dict:
        # analysis of the description columns-------------->
        if key=='Description':
            desc_cols=mapping_dict[key]
            for column in  desc_cols:
                nas=pd.to_numeric(df[column],errors='coerce').notnull().sum()
                percentage=((total_transactions-nas)/total_transactions)*100
                result[column]={'Percentage Population':percentage,'NA Count':nas,'Comment':None,
                               'Column Type':'Important'}

        #: analysis of non description type columns--------------->
        else:
            value=mapping_dict[key]
            if value is None:
                result[key]={'Percentage Population':None,'NA Count':None,'Comment':None,
                               'Column Type':'Important'}
            else:
                value=mapping_dict[key]
                nas=df[value].isna().sum()
                percentage=((total_transactions-nas)/total_transactions)*100
                result[key]={'Percentage Population':percentage,'NA Count':nas,'Comment':None,
                               'Column Type':'Important'}


#: Analysis of all the Good To have columns
    for column in df.columns:
        if column not in cols_we_want:
            nas=df[column].isna().sum()
            percentage=((total_transactions-nas)/total_transactions)*100
            result[column]={'Percentage Population':percentage,'NA Count':nas,'Comment':None,
                               'Column Type':'Good To Have'}

        
        

    return result

In [5]:
def cleaning_data(df):
    invalids=["#N/A",'N/A','N/A','NA','NULL','NONE','NOT ASSIGNED','NOT AVAILABLE'," "]
    obj_cols=df.select_dtypes(include=['object']).columns
    for col in obj_cols:
        df[col]=df[col].replace(invalids,None)
    return df

In [7]:
def main():
    analyzing_cols = ['Date',
                  "Spend",
                  "Currency",
                  "Invoice Number",
                  "Invoice Line Number",
                  "Supplier Name",
                 'Description']
    mapped_columns={}
    for col in analyzing_cols:
        mapped_columns[col]=None

    # ---------------------------------taking the input of the path-----------------------------------------
    print('taking the input of the files:--->')
    while True:
        try:
            no_files = int(input('Enter the Number of Master File We Have: '))

            if isinstance(no_files,int):
                    break

        except Exception as e:
            print(f'{e} error raised in the input, Integer Value is Expected')



    if no_files == 1:
        while True:
            file_path = input('Enter the file path for master file: ')
            if not file_path.strip():
                print('File Path Cannot Be Empty')
            else:
                
                try:
                    if file_path.strip().endswith('.csv'):
                        df = pl.read_csv(file_path)
                        df=df.to_pandas()
                        break
                        
                    else:
                        df = pl.read_excel(file_path)
                        df=df.to_pandas()
                        break
                        
                except Exception as e:
                    print(f'{e} Error While Opening File File: ')
    else:
        df=append_files(no_files)

    print('--------------------------Initiating the Mapping Process----------------------------------------')               

    mapping_dict,columns_we_want=creating_mapping(df,mapped_columns)
    
    

    print('-------------------------Initiating the analysis-----------------------------------------------')
    print(f'Total Number of Transactions are:  {df.shape[0]}')
    result=begin_analysis(mapping_dict,columns_we_want,df,df.shape[0])
    df2=pd.DataFrame.from_records(result)
    df2=df2.transpose()
    df2.reset_index(inplace=True)
    print(df2.head(20))

    print('-----------------------Analysis End Below Are The Results-----------------------------------')
    while True:
        file_name=input('Enter a file name to save the results in: ->')
        if not file_name.strip():
            print('File Name Cannot Be Empty')
        else:
            folder='ResultsFolder'
            os.makedirs(folder,exist_ok=True)
            file_path=os.path.join(folder,f'{file_name}.csv')
            df2.to_csv(file_path,index=False)
            print('Results has been saved to the folder...........')
            break
    print("Saving the Concated Data in another Files.....")
    
    while True:
        file_name=input('Enter a file name to save the results in: ->')
        if not file_name.strip():
            print('File Name Cannot Be Empty')
        else:
            folder='Appended File Folder'
            os.makedirs(folder,exist_ok=True)
            file_path=os.path.join(folder,f'{file_name}.csv')
            df2=pl.from_pandas(df)
            df2.to_csv(file_path,index=False)
            print('Results has been saved to the folder...........')
            break
        

In [8]:
if __name__=="__main__":
    main()

taking the input of the files:--->


Enter the Number of Master File We Have:  1
Enter the file path for master file:  C:\Users\AnkitS-Simfoni\Users\ankit-Simfoni\Automation Tasks\Task 1 - By Ashish - Finalized\Sunsource Mapping Raw data\Sunsource Invoice_Concur August consolidation.xlsx


--------------------------Initiating the Mapping Process----------------------------------------
We Will Be Creating Mapping For dict_keys(['Date', 'Spend', 'Currency', 'Invoice Number', 'Invoice Line Number', 'Supplier Name', 'Description']), Enter None If We Dont Have A Mapping Column For A Specific Column


Enter the mapping column for Date,(None if no mapping column):  Document Date
Enter the mapping column for Spend,(None if no mapping column):  Spend
Enter the mapping column for Currency,(None if no mapping column):  Document Currency
Enter the mapping column for Invoice Number,(None if no mapping column):  Document Number
Enter the mapping column for Invoice Line Number,(None if no mapping column):  Document Line Number
Enter the mapping column for Supplier Name,(None if no mapping column):  Supplier Name
Enter the mapping column for Description:   .


['.'] not present in master file
Choose From Index(['SrNo', 'ActualSrNo', 'Data Source', 'File Name', 'Source System',
       'Entity Code', 'Entity Name', 'Entity City', 'Entity State',
       'Entity Country', 'Entity Region', 'Document Number',
       'Document Line Number', 'Document Date', 'Document Header Description',
       'Document Line Description', 'Buyer Name', 'Payment Terms Code',
       'Payment Terms Description', 'Supplier Document Number',
       'Supplier Code', 'Supplier Name', 'Supplier Name (Normalized)',
       'Supplier City', 'Supplier State', 'Supplier Country',
       'Supplier Region', 'Supplier Tax ID', 'Document Unit price',
       'Document Quantity', 'Document UOM', 'Document Currency',
       'Amount in Document Currency', 'FX Rate', 'Spend', 'Cost Center Code',
       'Cost Center Description', 'General Ledger Code',
       'General Ledger Description', 'Material Code', 'Material Description',
       'Material Group Code', 'Material Group Description'

Enter the mapping column for Description:   Document Header Description,Document Line Description,Payment Terms Description,Cost Center Description,General Ledger Description,Material Description,Material Group Description,


[''] not present in master file
Choose From Index(['SrNo', 'ActualSrNo', 'Data Source', 'File Name', 'Source System',
       'Entity Code', 'Entity Name', 'Entity City', 'Entity State',
       'Entity Country', 'Entity Region', 'Document Number',
       'Document Line Number', 'Document Date', 'Document Header Description',
       'Document Line Description', 'Buyer Name', 'Payment Terms Code',
       'Payment Terms Description', 'Supplier Document Number',
       'Supplier Code', 'Supplier Name', 'Supplier Name (Normalized)',
       'Supplier City', 'Supplier State', 'Supplier Country',
       'Supplier Region', 'Supplier Tax ID', 'Document Unit price',
       'Document Quantity', 'Document UOM', 'Document Currency',
       'Amount in Document Currency', 'FX Rate', 'Spend', 'Cost Center Code',
       'Cost Center Description', 'General Ledger Code',
       'General Ledger Description', 'Material Code', 'Material Description',
       'Material Group Code', 'Material Group Description',

Enter the mapping column for Description:   Document Header Description,Document Line Description,Payment Terms Description,Cost Center Description,General Ledger Description,Material Description,Material Group Description


-------------------------Initiating the analysis-----------------------------------------------
Total Number of Transactions are:  22774
                          index Percentage Population NA Count Comment  \
0                    ActualSrNo                 100.0        0    None   
1           Addressability Flag                   0.0    22774    None   
2   Amount in Document Currency                 100.0        0    None   
3                      Batch ID                   0.0    22774    None   
4                    Buyer Name             25.950645    16864    None   
5              Category Level 0                   0.0    22774    None   
6              Category Level 1                   0.0    22774    None   
7              Category Level 2                   0.0    22774    None   
8              Category Level 3                   0.0    22774    None   
9              Category Level 4                   0.0    22774    None   
10             Category Level 5                  

Enter a file name to save the results in: -> SunSource Results


Results has been saved to the folder...........
Saving the Concated Data in another Files.....


Enter a file name to save the results in: -> results


AttributeError: 'DataFrame' object has no attribute 'to_csv'