# Introduction
When preparing an analysis for a client, it should be clear which data in a DataFrame was added or modified.

Example:

For convenience, you might want to replace anything that should be a null (like "-", "?" or "No value") with a true null (np.nan) in a pandas DataFrame. You might also want to change the names of countries or regions to make them compatible with other databases, or add missing countries.


With this module you can show which columns are new and, if you wish, restore the original values in the original columns.

In [1]:
import pandas as pd
import numpy as np
from collections import OrderedDict

# Function

In [2]:
def multi_index_export(new_df,
                       initial_columns,
                       initial_df = None,
                       init_name = 'Originale Spalten',
                       new_name = 'Tripicchio Spalten'):
    """Return a copy of a DataFrame with two-level columns that show which columns are new.

    Every column listed in initial_columns is placed under the top-level label init_name,
    every other column under new_name. The original columns come first, followed by the
    new ones; inside each group the column order of new_df is kept.
    You can also undo modifications in the original columns by passing a DataFrame with
    the original values in the argument initial_df.

        Args:
            new_df: DataFrame to export. It is copied and never modified.
            initial_columns: labels of the columns that belong to the original data.
            initial_df: optional DataFrame with the unmodified values. When given, the
                columns named in initial_columns are overwritten with its values.
            init_name: top-level label for the original columns.
            new_name: top-level label for the added columns.

        Returns:
            A new DataFrame whose columns are a two-level MultiIndex
            (group label, column label). A group without columns is simply absent.

        Raises:
            ValueError: if initial_df is given and its index differs from the index of new_df.
            KeyError: if a label in initial_columns is not a column of the exported frame.

        Side effects:
            Prints the set of new column labels.

        Instructions:
            Pass the modified DataFrame as new_df and a copy of the original DataFrame as
            initial_df in order to put the original values back into the exported DataFrame.

            Save to .xlsx or to a file type that is capable of displaying hierarchical columns!

        Usage:
            original_df = df.copy(deep = True)
            initial_columns = original_df.columns # if you want to rename columns do it before this line
            # ... add or modify columns in df ...
            df_export = multi_index_export(df, initial_columns, initial_df = original_df)

    """
    df_export = new_df.copy(deep = True)
    # Materialize once so that any iterable of labels can be traversed several times.
    initial_columns = list(initial_columns)

    if initial_df is not None:
        # Index.equals compares length, order and values in one step, so indexes of
        # different length are reported with the message below instead of failing inside
        # an element-wise comparison, and NaN labels at the same position count as equal.
        if not df_export.index.equals(initial_df.index):
            raise ValueError('The index differs from the one of the original DataFrame!')
        df_export[initial_columns] = initial_df[initial_columns]

    missing = [col for col in initial_columns if col not in df_export.columns]
    if missing:
        raise KeyError(missing)

    initial_set = set(initial_columns)
    new_columns = set(df_export.columns) - initial_set
    print('New columns:',new_columns)

    # Label every column with its group while keeping the current column order.
    is_initial = [col in initial_set for col in df_export.columns]
    labelled = [(init_name if flag else new_name, col)
                for flag, col in zip(is_initial, df_export.columns)]
    df_export.columns = pd.MultiIndex.from_tuples(labelled)

    # Original columns first, then the new ones. Reordering by position (instead of
    # selecting both top-level labels) also works when one of the groups is empty.
    order = [pos for pos, flag in enumerate(is_initial) if flag]
    order += [pos for pos, flag in enumerate(is_initial) if not flag]
    df_export = df_export.iloc[:, order]

    return df_export

# Usage
Convert the cell below to a code cell to test the function.

# Sample data: four rows identified by 'ID', with one missing postal code
data = {
    'Region': ['BW', 'Bayern', 'Nordrhein-Westfalen', 'Florida'],
    'Ort': ['Freiburg', 'Köln', 'Nürnberg', 'New York City'],
    'PLZ': ['71264', '50677', '90402', np.nan],
    'ID': [1212, 121, 323, 32],
}
df_test = pd.DataFrame(data).set_index('ID')

# Keep an untouched deep copy and remember which columns existed from the start
df_test_original = df_test.copy(deep=True)
initial_columns = df_test_original.columns

# Columns added during the analysis -> expected under "Tripicchio Spalten"
df_test['Region (harmonized)'] = ['Baden-Württemberg', 'Bayern', 'Nordrhein-Westfalen', 'Florida']
df_test['Region (corrected)'] = ['Baden-Württemberg', 'Nordrhein-Westfalen', 'Bayern', 'New York']
df_test['Country'] = ['Deutschland'] * 3 + ['Vereinigte Staaten']

# Change a value in an original column; the export should restore the original value
df_test.loc[1212, 'Ort'] = 'Test-Test'

df_test_export = multi_index_export(
    df_test,
    initial_columns=initial_columns,
    initial_df=df_test_original,
)
df_test_export