In [1]:
import os
import pandas as pd
DATA_PATH = os.getenv("DATA_PATH")

In [2]:
def list_and_compare_files(path_old, path_new):
    try:
        # lijst met alle nieuwe en oude filenames, check dat het enkel csv's zijn
        filenames_old = [f for f in os.listdir(path_old) if f.endswith('.csv')]
        filenames_new = [f for f in os.listdir(path_new) if f.endswith('.csv')]

        # gelijke filenames
        common_filenames = set(filenames_old).intersection(filenames_new)

        return filenames_old, filenames_new, common_filenames
    except Exception as e:
        return f"An error occurred: {e}"

# oude en nieuwe folders
path_old = os.path.join(DATA_PATH, "old")  
path_new = os.path.join(DATA_PATH, "new")  

# resultaten
filenames_old, filenames_new, common_filenames = list_and_compare_files(path_old, path_new)

In [3]:
import pandas as pd

# dicts voor kolommen in df
data = {'Old Filenames': [], 'New Filenames': []}

# gelijke filenamen toevoegen
for file in sorted(common_filenames):
    data['Old Filenames'].append(file)
    data['New Filenames'].append(file)

# Add remaining files from each list
for file in filenames_old:
    if file not in common_filenames:
        data['Old Filenames'].append(file)
        data['New Filenames'].append(None)

for file in filenames_new:
    if file not in common_filenames:
        data['Old Filenames'].append(None)
        data['New Filenames'].append(file)

# Create a DataFrame
df = pd.DataFrame(data)

df

Unnamed: 0,Old Filenames,New Filenames
0,Account activiteitscode.csv,Account activiteitscode.csv
1,Account.csv,Account.csv
2,Activiteit vereist contact.csv,Activiteit vereist contact.csv
3,Activiteitscode.csv,Activiteitscode.csv
4,Afspraak betreft contact_cleaned.csv,Afspraak betreft contact_cleaned.csv
5,Afspraak_account_gelinkt_cleaned.csv,Afspraak_account_gelinkt_cleaned.csv
6,CDI mailing.csv,CDI mailing.csv
7,CDI sent email clicks.csv,CDI sent email clicks.csv
8,CDI visits.csv,CDI visits.csv
9,Campagne.csv,Campagne.csv


In [4]:
#inschrijvingen en campagnes zijn tussentijdse files

In [5]:
import pandas as pd
import os

def load_csv(file_path):
    #hardcode delimiters en encodings
    delimiters = [',', ';']  
    encodings = ['utf-8', 'latin1', 'utf-8-sig']  

    for encoding in encodings:
        for delimiter in delimiters:
            try:
                return pd.read_csv(file_path, encoding=encoding, delimiter=delimiter, low_memory=False), None
            except (UnicodeDecodeError, pd.errors.ParserError):
                continue  #probeer volgende combinatie

    return None, "Unable to read file with given encodings and delimiters"


def vergelijkHeaders(oldfile, newfile, path_old, path_new):
    try:
        # laad data via bovenstaande functie
        df1, error1 = load_csv(os.path.join(path_old, oldfile))
        df2, error2 = load_csv(os.path.join(path_new, newfile))

        # Check for loading errors
        if error1 or error2:
            return f"Error loading files: {error1 or ''} {error2 or ''}", None

        # Create lists of headers
        headers1 = list(df1.columns)
        headers2 = list(df2.columns)

        # Check if all headers are the same
        if set(headers1) == set(headers2):
            return True, None 
        else:
            # Create a dictionary for columns in DataFrame
            data = {'Columns in ' + oldfile: [], 'Columns in ' + newfile: []}

            # Add matching headers
            for col in headers1:
                if col in headers2:
                    data['Columns in ' + oldfile].append(col)
                    data['Columns in ' + newfile].append(col)

            # Add unique headers, with None in the other column
            for col in headers1:
                if col not in headers2:
                    data['Columns in ' + oldfile].append(col)
                    data['Columns in ' + newfile].append(None)

            for col in headers2:
                if col not in headers1:
                    data['Columns in ' + oldfile].append(None)
                    data['Columns in ' + newfile].append(col)

            # Create a DataFrame for comparison
            comparison_df = pd.DataFrame(data)

            return False, comparison_df
    except Exception as e:
        return f"An error occurred: {e}", None




In [6]:
def checkHeaders(df, path_old, path_new):
    for row in range(len(df)):
        result, comparison_df = vergelijkHeaders(df.iloc[row,0], df.iloc[row,1], path_old, path_new)

        if not result:
            print(df.iloc[row,0], df.iloc[row,1])
            print("Headers are not equal. Comparison:")
            print(comparison_df)
        # else:
        #     print(df.iloc[row,0], df.iloc[row,1])
        #     print("Headers are equal.")

In [7]:
# df om overal waar overeenkomst is te checken of de headers kloppen
dfCorrect = df[df['Old Filenames'] == df['New Filenames']]
print(dfCorrect.head(20))

# df om overal waar geen overeenkomst is te checken of de headers kloppen
renamedDict = [
  {"old": "Account financiële data.csv", "new": "Financiële data.csv"},
  {"old": "Campagnes.csv", "new": "Campagne.csv"},
  {"old": "cdi_pageviews.csv", "new": "cdi pageviews.csv"},
  {"old": "Inschrijvingen.csv", "new": "Inschrijving.csv"},
  {"old": "Contact functie.csv", "new": "Contac functie.csv"},
  #afspraak betreft account nemen we niet op in de db
  #{"old": "Afspraak_betreft_account_cleaned.csv", "new": "Afspraak betreft account_cleaned.csv"},
  #afspraak alle nemen we ook niet op in de db
  #{"old": "Afspraak_alle.csv", "new": "Afspraak alle.csv"},
]

renamedDf = pd.DataFrame(renamedDict)
print(renamedDf.head(20))


                           Old Filenames                         New Filenames
0            Account activiteitscode.csv           Account activiteitscode.csv
1                            Account.csv                           Account.csv
2         Activiteit vereist contact.csv        Activiteit vereist contact.csv
3                    Activiteitscode.csv                   Activiteitscode.csv
4   Afspraak betreft contact_cleaned.csv  Afspraak betreft contact_cleaned.csv
5   Afspraak_account_gelinkt_cleaned.csv  Afspraak_account_gelinkt_cleaned.csv
6                        CDI mailing.csv                       CDI mailing.csv
7              CDI sent email clicks.csv             CDI sent email clicks.csv
8                         CDI visits.csv                        CDI visits.csv
9                           Campagne.csv                          Campagne.csv
10                           Contact.csv                           Contact.csv
11                           Functie.csv            

In [8]:
checkHeaders(dfCorrect, path_old, path_new)

Inschrijving.csv Inschrijving.csv
Headers are not equal. Comparison:
            Columns in Inschrijving.csv
0     crm_Inschrijving_Aanwezig_Afwezig
1     crm_Inschrijving_Aanwezig_Afwezig
2                 crm_Inschrijving_Bron
3                 crm_Inschrijving_Bron
4         crm_Inschrijving_Contactfiche
5         crm_Inschrijving_Contactfiche
6   crm_Inschrijving_Datum_inschrijving
7   crm_Inschrijving_Datum_inschrijving
8         crm_Inschrijving_Inschrijving
9         crm_Inschrijving_Inschrijving
10   crm_Inschrijving_Facturatie_Bedrag
11   crm_Inschrijving_Facturatie_Bedrag
12                                 None
13            crm_Inschrijving_Campagne
14                                 None
15      crm_Inschrijving_Campagne_Naam_


In [9]:
checkHeaders(renamedDf, path_old, path_new)

cdi_pageviews.csv cdi pageviews.csv
Headers are not equal. Comparison:
           Columns in cdi_pageviews.csv        Columns in cdi pageviews.csv
0   crm CDI_PageView[Anonymous Visitor]                                None
1             crm CDI_PageView[Browser]                                None
2            crm CDI_PageView[Campaign]                                None
3             crm CDI_PageView[Contact]                                None
4            crm CDI_PageView[Duration]                                None
5    crm CDI_PageView[Operating System]                                None
6           crm CDI_PageView[Page View]                                None
7       crm CDI_PageView[Referrer Type]                                None
8                crm CDI_PageView[Time]                                None
9          crm CDI_PageView[Page Title]                                None
10               crm CDI_PageView[Type]                                None
11               