In [1]:
import pandas as pd
import glob
import numpy as np

from collections import defaultdict

# Datasets

In [20]:
# create_df_list: loads every CSV file in the given directory and returns them as a list of DataFrames


def create_df_list(path):
    """
    Load every CSV file found directly inside a directory.

    Parameters
    ----------
    path : str
        Directory to search; every file matching ``<path>/*.csv`` is read.

    Returns
    -------
    list of pandas.DataFrame
        One DataFrame per CSV file, ordered by file path. The list is empty
        when no CSV file matches.
    """
    # glob yields matches in arbitrary, OS-dependent order. Sorting keeps the
    # list positions stable between runs, which matters for callers that
    # identify a dataset by its index in this list.
    csv_files = sorted(glob.glob(path + '/*.csv'))

    return [pd.read_csv(csv_file) for csv_file in csv_files]

# Programm

## FD Finder

In [6]:
# fastFD: generates FD candidates
# each candidate has a single-attribute right-hand side taken from the given attribute list
# and a left-hand side of at most `lenght_lhs` attributes

def fastFD(df, lenght_lhs, null_attributes):
    """
    Enumerate candidate functional dependencies (FDs) ``LHS -> RHS``.

    No dependency is checked against the data here; only the column names of
    ``df`` are used. The candidates are meant to be validated afterwards
    (see ``verify_FDs``).

    Every candidate has this shape:

    * RHS is a single attribute taken from ``null_attributes``.
    * LHS starts from one column of ``df`` and may be extended with further
      attributes from ``null_attributes``.
    * RHS is never part of LHS, and LHS has at most ``lenght_lhs`` attributes.

    Parameters
    ----------
    df : pandas.DataFrame
        Only ``df.columns`` is read.
    lenght_lhs : int
        Maximum number of attributes on the left-hand side.
    null_attributes : iterable
        Attributes allowed on the right-hand side (and as LHS extensions).

    Returns
    -------
    list of (list, list)
        ``(lhs_attributes, [rhs_attribute])`` pairs without duplicates, in a
        deterministic order (by LHS size, then column / attribute order).
    """
    # dict.fromkeys de-duplicates while keeping the given order.
    columns = list(dict.fromkeys(df.columns))
    rhs_pool = list(dict.fromkeys(null_attributes))

    candidates = []

    # One level holds all LHS of the same size. The frozenset key
    # de-duplicates an LHS reached via different extension orders; the list
    # value keeps its attributes in the order they were added.
    level = {frozenset([col]): [col] for col in columns}
    lhs_size = 1

    while level and lhs_size <= lenght_lhs:
        # Stop extending once the size limit is reached: the number of longer
        # LHS grows exponentially with len(null_attributes) and none of them
        # may be returned.
        can_grow = lhs_size < lenght_lhs
        next_level = {}

        for lhs_key, lhs_attrs in level.items():
            for rhs in rhs_pool:
                if rhs in lhs_key:
                    # A dependency on an attribute already in the LHS is trivial.
                    continue
                candidates.append((list(lhs_attrs), [rhs]))
                if can_grow:
                    next_level.setdefault(lhs_key | {rhs}, lhs_attrs + [rhs])

        level = next_level
        lhs_size += 1

    return candidates

def compute_equivalence_classes(df):
    """
    Map every distinct row of the DataFrame to the set of values it contains.

    Rows are read as tuples without the index. Identical rows collapse into a
    single entry. The result is a ``defaultdict(set)``, so looking up a row
    that is not present yields (and stores) an empty set.
    """
    return defaultdict(
        set,
        ((record, set(record)) for record in df.itertuples(index=False)),
    )

def closure(X, fds):
    """
    Return the attribute closure of ``X`` under the dependencies in ``fds``.

    ``fds`` is an iterable of ``(lhs, rhs)`` pairs of attribute sets. Starting
    from ``X``, the right-hand side of every dependency whose left-hand side
    is already covered is added, until a full pass adds nothing new. ``X``
    itself is not modified; a new ``set`` is returned.
    """
    reached = set(X)
    grew = True
    while grew:
        grew = False
        for lhs, rhs in fds:
            # Apply the dependency only if it is applicable and adds something.
            if reached.issuperset(lhs) and not reached.issuperset(rhs):
                reached.update(rhs)
                grew = True
    return reached

In [7]:
# verify_FDs: verifiziert Kandidaten

def verify_FDs(df, candidates):
    """
    Keep the candidates that hold in ``df`` and drop the non-minimal ones.

    A candidate ``lhs -> rhs`` is accepted when, after grouping ``df`` by the
    LHS columns, every group has exactly one distinct value in each RHS
    column as counted by ``nunique``. With pandas' defaults, rows with NaN in
    an LHS column are left out of the grouping and NaN is not counted, so a
    group whose RHS values are all NaN counts 0 and rejects the candidate.

    Among the accepted dependencies, one is dropped if another accepted
    dependency has the same RHS and an LHS that is a proper subset of its own.

    Parameters
    ----------
    df : pandas.DataFrame
    candidates : iterable of (lhs, rhs)
        Collections of column names, e.g. as produced by ``fastFD``.

    Returns
    -------
    list of (set, set)
    """
    confirmed = []
    for lhs, rhs in candidates:
        distinct_per_group = df.groupby(list(lhs))[list(rhs)].nunique()
        # First .all() reduces over groups (per RHS column), second over columns.
        if distinct_per_group.eq(1).all().all():
            confirmed.append((set(lhs), set(rhs)))

    minimal = []
    for lhs, rhs in confirmed:
        # `lhs > other_lhs` is the proper-superset test on sets.
        has_smaller_lhs = any(
            rhs == other_rhs and lhs > other_lhs
            for other_lhs, other_rhs in confirmed
        )
        if not has_smaller_lhs:
            minimal.append((lhs, rhs))

    return minimal

In [21]:
# find_FDs: gibt zu Dataframe FDs zurück

def find_FDs(df, max_lhs_size=4):
    """
    Return the verified functional dependencies of ``df`` that can fill NULLs.

    Only columns containing at least one NULL are considered as right-hand
    sides. Candidates are generated by ``fastFD`` and then checked against
    the data by ``verify_FDs``.

    Parameters
    ----------
    df : pandas.DataFrame
    max_lhs_size : int, default 4
        Maximum number of attributes on the left-hand side of a candidate.

    Returns
    -------
    list
        The dependencies returned by ``verify_FDs``; empty if none hold.
    """
    null_attributes = df.columns[df.isnull().any()].tolist()
    candidates = fastFD(df, max_lhs_size, null_attributes)
    return verify_FDs(df, candidates)

## Null Replacer

In [9]:
# replace_null: Ersetzt Null-Value

def replace_Null(df, fds):
    """
    Fill NULLs in ``df`` in place using functional dependencies.

    For every dependency ``lhs -> rhs`` and every RHS column, a row whose RHS
    value is NULL receives the first non-NULL RHS value (in row order) found
    among the rows that agree with it on *all* LHS columns. Rows with a NULL
    in any LHS column, and groups without any non-NULL RHS value, are left
    untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Modified in place.
    fds : iterable of (lhs, rhs)
        Collections of column names, e.g. the ``(set, set)`` pairs returned
        by ``verify_FDs``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    for lhs, rhs in fds:
        lhs_cols = list(lhs)
        if not lhs_cols:
            # Nothing to group by.
            continue

        for rhs_col in rhs:
            missing = df[rhs_col].isna()
            # A row can only be matched to a group if its whole LHS is known.
            has_key = df[lhs_cols].notna().all(axis=1)
            if not (missing & has_key).any():
                continue

            keyed = df.loc[has_key]
            # 'first' skips NULLs, so each row gets the first known RHS value
            # of its LHS group (or NULL if the group has none).
            donor = keyed.groupby(lhs_cols, sort=False)[rhs_col].transform('first')

            fill = donor[missing[has_key] & donor.notna()]
            if not fill.empty:
                df.loc[fill.index, rhs_col] = fill

    return df

In [16]:
# replacer: fills NULLs via the discovered FDs and reports, among other things, the replacement rate

def replacer(df):
    """
    Fill NULLs in ``df`` via its functional dependencies and report the effect.

    Returns the string 'There are no Functional Dependencies.' when
    ``find_FDs`` yields nothing. Otherwise ``df`` is filled in place and a
    tuple ``(df, replace_rate, null_befor, null_after)`` is returned, where
    ``replace_rate`` is the percentage of NULLs removed, rounded to two
    decimals (0 if there were no NULLs to begin with).
    """
    fds = find_FDs(df)
    null_befor = df.isnull().sum().sum()

    if fds == []:
        return 'There are no Functional Dependencies.'

    replace_Null(df, fds)
    null_after = df.isnull().sum().sum()

    # Guard against division by zero when the frame had no NULLs.
    replace_rate = (
        0 if null_befor == 0
        else round((1 - (null_after / null_befor)) * 100, 2)
    )
    return df, replace_rate, null_befor, null_after

## Main

In [11]:
# main: converts the other NULL markers to NaN, then runs the replacer if any NULLs are present

def main(df, null_values):
    """
    Normalise NULL markers in ``df`` and run the NULL replacement.

    Every value listed in ``null_values`` is replaced by NaN in place. If the
    frame then contains no NULL at all, the string "There are no NULL-Values"
    is returned; otherwise the result of ``replacer(df)`` is returned.
    """
    df.replace(null_values, np.nan, inplace=True)

    if not df.isnull().any().any():
        return "There are no NULL-Values"

    return replacer(df)

In [None]:
# dataset_iterator: iterates over the datasets and returns the final summary

def dataset_iterator(df_list, null_values):
    """
    Run ``main`` on a copy of every DataFrame and collect the outcomes.

    Returns ``(number_of_datasets, results)``. Each entry of ``results`` is
    ``[index, message]`` when ``main`` returned a string, and otherwise
    ``[index, outcome[1], outcome[2], outcome[3]]`` taken from the tuple
    ``main`` returned. ``index`` is the position of the dataset in
    ``df_list``. The input frames are not modified.
    """
    summary = []
    for dataset_id, frame in enumerate(df_list):
        # Work on a copy so the loaded data stays untouched.
        outcome = main(frame.copy(), null_values)
        if isinstance(outcome, str):
            summary.append([dataset_id, outcome])
        else:
            summary.append([dataset_id, outcome[1], outcome[2], outcome[3]])
    return len(df_list), summary


# Strings that mark a missing value in the raw CSV files; main() turns them into NaN.
null_values = [
    'None', '--', 'NaN', 'Null', 'NA',
    'undefined', 'Inf', 'inf', 'NULL',
]

# NOTE(review): absolute path on the author's machine - adjust before running elsewhere.
df_list = create_df_list(
    "C:/Users/ilove/Downloads/result_null_filtered/result_nan_filtered/Used Datasets/test"
)

# Last expression of the cell: displays (number of datasets, per-dataset results).
dataset_iterator(df_list, null_values)

# Evaluation

In [None]:
import time

# Strings that mark a missing value in the raw CSV files.
null_values = ['None', '--', 'NaN', 'Null', 'NA', 'undefined', 'Inf', 'inf', 'NULL']

# Optional: evaluate a different directory instead of the `df_list` loaded earlier.
# df_list = create_df_list("C:/Users/ilove/Downloads/result_null_filtered/result_nan_filtered/Neuer Ordner")

start_time = time.time()

# Run the whole pipeline inside the timed region; `evaluation` is
# (number of datasets, per-dataset results).
evaluation = dataset_iterator(df_list, null_values)

rate_list = []
counter_null = 0   # datasets without any NULL value
counter_FD = 0     # datasets with NULLs but without usable FDs
counter_rate = 0   # datasets for which a replacement rate was computed
for entry in evaluation[1]:
    if len(entry) == 2:
        # [index, message]: no replacement was attempted.
        if entry[1] == 'There are no NULL-Values':
            counter_null += 1
        else:
            counter_FD += 1
    else:
        # [index, replacement rate, NULLs before, NULLs after]
        counter_rate += 1
        rate_list.append(entry)

end_time = time.time()

running_time = end_time - start_time

count = [evaluation[0], counter_rate, counter_null, counter_FD, running_time]

df_rate = pd.DataFrame(rate_list, columns=['ID', 'replacement rate', 'Null befor', 'Null after'])
df_rate["Null difference"] = df_rate["Null befor"] - df_rate["Null after"]
# Uncomment to persist the per-dataset results:
#df_rate.to_csv('evaluation_Dataset.csv', index=False)