In [4]:
import polars as pl
from pathlib import Path

def suffix_dupes(df, col_name):
    """Number the duplicated values of ``col_name`` so they become distinct.

    Every value that occurs more than once gets a running counter appended,
    starting at 0 (``"a", "a", "b"`` becomes ``"a0", "a1", "b"``). Values
    that occur only once are left unchanged.

    Args:
        df: polars DataFrame containing ``col_name``.
        col_name: Name of the column to process. The counter is appended
            with ``value + str(n)``, so duplicated values must be strings.

    Returns:
        A DataFrame in which ``col_name`` is replaced by the suffixed values.
    """
    values = df.select(col_name).to_series()

    # One counter per duplicated value. Membership is tested against this
    # dict (constant time) instead of against a Series, which would have to
    # be searched again for every row.
    duplicated = values.filter(values.is_duplicated()).unique()
    next_suffix = {value: 0 for value in duplicated}

    renamed = []
    for value in values:
        if value in next_suffix:
            renamed.append(value + str(next_suffix[value]))
            next_suffix[value] += 1
        else:
            renamed.append(value)

    # A Series carrying the same name replaces the existing column.
    return df.with_columns(pl.Series(col_name, renamed))


def shuffix_col(df, col_name):
    """Append a running per-value counter to every value of ``col_name``.

    Unlike ``suffix_dupes``, values that occur only once are numbered too
    (``"a", "a", "b"`` becomes ``"a0", "a1", "b0"``).

    Args:
        df: polars DataFrame containing ``col_name``.
        col_name: Name of the column to process. The counter is appended
            with ``value + str(n)``, so the values must be strings.

    Returns:
        A DataFrame in which ``col_name`` is replaced by the numbered values.
    """
    values = df.select(col_name).to_series()

    seen = {}  # value -> how many times it has been emitted so far
    numbered = []
    for value in values:
        occurrence = seen.get(value, 0)
        numbered.append(value + str(occurrence))
        seen[value] = occurrence + 1

    # A Series carrying the same name replaces the existing column.
    return df.with_columns(pl.Series(col_name, numbered))


def create_combination_columns(df, column):
    """Flatten ``df`` into a single row keyed by the values of ``column``.

    For each row, every column other than ``column`` is renamed to
    ``"<row's column value>_<original column name>"`` and the renamed
    one-row frames are placed side by side.

    Args:
        df: polars DataFrame to flatten.
        column: Name of the column whose values become the name prefixes.
            The prefix is built with ``+``, so the values must be strings.
            NOTE(review): the values presumably need to be unique, since
            adding a column whose name already exists replaces it - confirm.

    Returns:
        A one-row DataFrame with one column per (row, other column) pair,
        or ``None`` when ``df`` has no rows.
    """
    items = df.select(column).to_series()
    # The frame without `column` is identical for every row; build it once
    # instead of re-selecting inside the loop.
    others = df.select(pl.all().exclude(column))

    new_df = None
    for i, item in enumerate(items):
        item_row = others.slice(i, 1)
        # TODO: could this renaming be done with an expression-level
        # prefix/suffix helper instead?
        item_row.columns = [item + "_" + name for name in item_row.columns]

        if new_df is None:
            new_df = item_row
        else:
            new_df = new_df.with_columns(item_row)

    return new_df


def column_to_feature(df, uq_col, featured_col):
    """Turn each ``uq_col`` group of ``df`` into one wide row.

    Within a group, the values of ``featured_col`` are numbered with
    ``shuffix_col`` and the columns from ``featured_col`` onwards are
    flattened with ``create_combination_columns``. The columns to the left
    of ``featured_col`` are taken from the group's first row only. The
    per-group rows are then stacked, with columns missing from a group
    filled with nulls.

    Args:
        df: Long-format polars DataFrame.
        uq_col: Name of the column identifying a group; one output row is
            produced per distinct value, in order of first appearance.
            Rows whose key is null form a group of their own.
        featured_col: Name of the column whose values become column-name
            prefixes. It also marks the split between the left columns and
            the columns that are flattened.

    Returns:
        A DataFrame with one row per group, or ``None`` when ``df`` is empty.
    """
    if df.is_empty():
        return None

    records = []
    # partition_by splits the frame in one pass, keeps a stable group order
    # and, unlike an `==` filter, does not lose rows whose key is null.
    for group in df.partition_by(uq_col, maintain_order=True):
        group = shuffix_col(group, featured_col).sort(featured_col)

        split_at = group.get_column_index(featured_col)
        df_left = group.select(group.columns[:split_at])
        df_right = group.select(group.columns[split_at:])

        # The left-hand columns repeat the same data on every row of the
        # group, so only the first row is kept.
        df_left_chara = df_left[0]
        df_right_chara = create_combination_columns(df_right, featured_col)

        records.append(
            pl.concat([df_left_chara, df_right_chara], how="horizontal")
        )

    # Groups may produce different column sets; a diagonal concat takes the
    # union of the columns. Concatenating once avoids re-copying the
    # accumulated frame on every iteration.
    return pl.concat(records, how="diagonal")





In [None]:
# Input CSV locations (placeholder values).
path_ndat = Path("xxx")
path_tdat = Path("xxx")
path_catch = Path("xxx")

df_ndat = pl.read_csv(path_ndat, infer_schema_length=1000)
df_tdat = pl.read_csv(path_tdat, infer_schema_length=1000)
df_catch = pl.read_csv(path_catch, infer_schema_length=1000)

# Reshape both long-format tables to one wide row per key.
df_ndat = column_to_feature(df_ndat, "xxx", "xxx")
df_tdat = column_to_feature(df_tdat, "xxx", "xxx")

# Aligned combination of the two wide tables; only used for the non-null
# report written at the end.
df = pl.concat([df_ndat, df_tdat], how="align")

# All three tables are joined on the first two columns of df_ndat.
# NOTE(review): newer polars releases spell this join as
# how="full", coalesce=True - confirm against the installed version.
join_keys = df_ndat.columns[:2]
df_total = df_ndat.join(df_tdat, on=join_keys, how='outer_coalesce')
df_total = df_total.join(df_catch, on=join_keys, how='outer_coalesce')

# NOTE(review): the three output paths below are identical placeholders;
# with real paths they must differ or the later writes overwrite the earlier.
df_ndat.write_csv("xxx.csv")
df_tdat.write_csv("xxx.csv")
df_total.write_csv("xxx.csv")

# Dump the per-column non-null counts of `df` without table truncation.
# The rendered table contains non-ASCII box-drawing characters, so the file
# encoding is fixed to UTF-8 instead of depending on the platform locale.
with open("./tmp.txt", "w", encoding="utf-8") as f, pl.Config() as cfg:
    cfg.set_tbl_rows(-1)
    cfg.set_tbl_cols(-1)

    non_null_counts = pl.DataFrame(
        {col: [df[col].is_not_null().sum()] for col in df.columns}
    )
    print(non_null_counts, file=f)
        
            
        




