In [3]:
import os
import polars as pl
import re
# Raise the number of table rows polars prints to 100.
pl.Config.set_tbl_rows(100)  

# Relative paths of the CSV data folders used by this notebook.
csv_raw_path = "../../data/csv_raw"
csv_no_dots = "../../data/csv_no_dots"
csv_cleaned_path ="../../data/csv_cleaned"

In [8]:
def _collect_column_files(csv_folder_path: str) -> tuple[dict[str, list[str]], int]:
    """Map every column name to the CSV files of a folder that contain it.

    Only regular files whose name ends in ``.csv`` (case-insensitive) are
    considered, so stray entries returned by ``os.listdir`` (sub-folders such
    as ``.ipynb_checkpoints``, ``.DS_Store``, notes, ...) are neither parsed
    nor counted as files a column would have to appear in.

    Args:
        csv_folder_path: Folder containing the CSV files to inspect.

    Returns:
        A ``(column_files, n_files)`` tuple: ``column_files`` maps a column
        name to the list of file names whose header contains it (columns are
        kept in order of first appearance), and ``n_files`` is the number of
        CSV files that were inspected.
    """
    # Sorted so that the order of the reported columns does not depend on the
    # arbitrary order in which the file system lists the folder.
    csv_filenames = sorted(
        name
        for name in os.listdir(csv_folder_path)
        if name.lower().endswith(".csv")
        and os.path.isfile(os.path.join(csv_folder_path, name))
    )

    column_files: dict[str, list[str]] = {}
    for filename in csv_filenames:
        # Polars can infer types, but as the columns may not be clean we force
        # it to read every column as a string.
        df = pl.read_csv(os.path.join(csv_folder_path, filename), infer_schema=False)
        for col in df.columns:
            column_files.setdefault(col, []).append(filename)

    return column_files, len(csv_filenames)


def get_perfect_match_columns(csv_folder_path: str) -> list[str]:
    """Return the columns that are present in every CSV file of a folder.

    Args:
        csv_folder_path: Folder containing the CSV files to inspect.

    Returns:
        The column names found in all CSV files, in order of first
        appearance. Empty when the folder contains no CSV file.
    """
    column_files, n_files = _collect_column_files(csv_folder_path)
    return [col for col, files in column_files.items() if len(files) == n_files]


def show_columns(csv_folder_path: str) -> None:
    """Print which columns are shared by all CSV files of a folder.

    Two lists are printed: the columns present in every CSV file, then the
    columns present in only some of them together with the number of files
    that contain each one.

    Args:
        csv_folder_path: Folder containing the CSV files to inspect.
    """
    column_files, n_files = _collect_column_files(csv_folder_path)

    print("Perfect match: ")
    for col, files in column_files.items():
        if len(files) == n_files:
            print(f" - {col}")

    print("\nOnly partial match")
    for col, files in column_files.items():
        if len(files) != n_files:
            print(f" - {col}: {len(files)}")


# Print which columns are shared by all files under csv_cleaned_path and which are not.
show_columns(csv_cleaned_path)


Perfect match: 
 - nom_rue
 - nom_rue_htr_corr
 - nom_rue_norm
 - no_maison
 - chef_prenom
 - chef_prenom_htr_corr
 - chef_prenom_norm
 - chef_nom
 - chef_nom_htr_corr
 - chef_nom_norm
 - epouse_nom
 - epouse_nom_htr_corr
 - epouse_nom_norm
 - enfants_chez_parents_prenom
 - enfants_chez_parents_prenom_htr_corr
 - enfants_chez_parents_prenom_norm
 - chef_origine
 - chef_origine_htr_corr
 - chef_origine_norm
 - chef_vocation
 - chef_vocation_htr_corr
 - chef_vocation_norm
 - observations
 - Page

Only partial match
 - proprietaire_nom: 59
 - proprietaire_nom_corr: 57
 - proprietaire_nom_htr_corr: 59
 - proprietaire_nom_norm: 59
 - chef_nom_corr: 69
 - chef_annee_naissance: 59
 - epouse_nom_corr: 69
 - epouse_annee_naissance: 59
 - enfants_annee_naissance: 52
 - chef_origine_corr: 69
 - chef_annee_arrivee: 22
 - chef_vocation_top_terms: 71
 - chef_recepisse: 52
 - pensionnaires_prenom: 65
 - pensionnaires_prenom_htr_corr: 65
 - pensionnaires_prenom_norm: 65
 - pensionnaires_nom: 65
 - pen

In [5]:
def replace_entries_with_placeholder(df: pl.DataFrame, column_name: str, pattern: str) -> pl.DataFrame:
    """Delete every match of a regex from a column and null out emptied cells.

    The column is cast to string, surrounding whitespace is stripped, every
    match of ``pattern`` is removed, and cells that end up empty become null.
    Despite the function's name no placeholder text is inserted: matches are
    simply deleted.

    Args:
        df: Frame holding the column to clean.
        column_name: Name of the column to clean; it is replaced in the result.
        pattern: Regular expression whose matches are removed.

    Returns:
        A new frame in which ``column_name`` holds the cleaned string values.
    """
    cleaned = (
        df[column_name]
        .cast(pl.Utf8)
        .str.strip_chars()
        .str.replace_all(pattern, "")
        # Removing matches can expose new leading/trailing whitespace
        # (e.g. "a ·" -> "a ", "·  ·" -> "  "); strip again so such cells are
        # tidy and whitespace-only leftovers are turned into null below.
        .str.strip_chars()
        .replace("", None)
    )
    return df.with_columns(cleaned)

def clean_non_numeric_entries(df: pl.DataFrame, column_name: str) -> pl.DataFrame:
    """Keep only digits and dots in the values of a column.

    Delegates to ``replace_entries_with_placeholder``, which deletes every
    character matched by the pattern and turns emptied cells into null.

    Args:
        df: Frame holding the column to clean.
        column_name: Name of the column to clean.

    Returns:
        A new frame with the cleaned column.
    """
    # Matches any single character that is neither an ASCII digit nor a dot.
    not_digit_or_dot = r"[^0-9.]"
    return replace_entries_with_placeholder(
        df=df,
        column_name=column_name,
        pattern=not_digit_or_dot,
    )

def clean_dots(df: pl.DataFrame, column_name: str) -> pl.DataFrame:
    """Remove every middle dot (``·``) from the values of a column.

    Delegates to ``replace_entries_with_placeholder``, which deletes every
    character matched by the pattern and turns emptied cells into null.

    Args:
        df: Frame holding the column to clean.
        column_name: Name of the column to clean.

    Returns:
        A new frame with the cleaned column.
    """
    # Character class containing only the middle dot.
    middle_dot = r"[·]"
    return replace_entries_with_placeholder(
        df=df,
        column_name=column_name,
        pattern=middle_dot,
    )

def separated_with(
    df: pl.DataFrame, column_name: str, separators: list[str]
) -> pl.DataFrame:
    """Turn a string column into a list column by splitting on separators.

    Every occurrence of one of ``separators`` (together with the whitespace
    around it) is first normalised to ``"|"``; the value is then split on
    ``"|"`` and each resulting element is stripped of surrounding whitespace.
    A literal ``"|"`` in the data therefore always acts as a separator, even
    when it is not listed in ``separators``.

    Args:
        df: Frame holding the column to split.
        column_name: Name of the column to split; it is replaced in the result.
        separators: Literal separator strings (they are regex-escaped).

    Returns:
        A new frame in which ``column_name`` is a list-of-strings column.

    Raises:
        ValueError: If ``separators`` is empty or contains an empty string.
            Either would build a pattern that matches at every position and
            silently split each value into single characters.
    """
    if not separators or "" in separators:
        raise ValueError("separators must be a non-empty list of non-empty strings")

    # Regex alternation matching any of the given (escaped) separators.
    alternation = "|".join(map(re.escape, separators))

    as_lists = (
        df[column_name]
        .cast(pl.Utf8)
        # Collapse "<spaces><separator><spaces>" into the canonical delimiter.
        .str.replace_all(rf"\s*({alternation})\s*", "|")
        .str.split("|")  # Split into lists
        # Trim whitespace left at the outer ends of the original value.
        .list.eval(pl.element().str.strip_chars())
    )
    return df.with_columns(as_lists)

In [None]:
# Names of all entries found in the csv_cleaned_path folder.
csv_filenames = os.listdir(csv_cleaned_path)

# Columns whose cells are split into lists in the loop below.
childrens = ["enfants_chez_parents_prenom",
 "enfants_chez_parents_prenom_htr_corr",
 "enfants_chez_parents_prenom_norm"]

# Collects one transformed DataFrame per file that was processed without error.
dfs = []

for filename in csv_filenames:
    file_path = os.path.join(csv_cleaned_path, filename)
    try:
        # infer_schema=False: read every column as a string instead of
        # letting polars infer the types.
        df = pl.read_csv(file_path, infer_schema=False)
        for col in  childrens:
            # Split the cell on "|" so the column becomes a list column.
            df = separated_with(df, col, ["|"])
        dfs.append(df)
    except Exception as e:
        # Best effort: report the failing file and carry on with the next one;
        # a file that fails is not added to dfs.
        # NOTE(review): catching Exception also hides programming errors
        # (e.g. a missing column) - consider narrowing it.
        print(f"Error processing {filename}: {e}")
        


shape: (3_939,)
Series: 'enfants_chez_parents_prenom' [list[str]]
[
	["anais allax"]
	["emile", "louis"]
	["isalette"]
	["marie"]
	["auguste"]
	null
	["edouard", "willem"]
	["francois"]
	["rosine"]
	["elise"]
	["emma"]
	["edouard", "blanche", "emma"]
	["elise", "charlotte"]
	null
	["eugene", "iraac"]
	["constance", "francois", "samuel"]
	null
	["marguerite"]
	null
	["francois", "louis"]
	["louisa"]
	["marie", "louise", "louis"]
	["francois", "jaques"]
	["louis"]
	null
	["jaques"]
	["marie"]
	["david"]
	["charles"]
	["charles"]
	["cecile", "david"]
	["elisa", "osear"]
	["jean", "louis", "henri"]
	["louis", "eugene"]
	["gustave", "charles", "caroline"]
	null
	null
	null
	null
	["maurice", "emile", "henriette"]
	["louis", "fanny", "emma"]
	["charles", "armand"]
	["marius", "helene", … "auguste"]
	["sophie", "fanny", "emma"]
	["blanche"]
	["elise", "mary"]
	["mathilde"]
	["louise"]
	["sophie", "charles", "philippine"]
	["eugenie", "paul", "elise"]
	…
	null
	null
	null
	null
	null
	null
	nu