# In [8]:
import numpy as np
import pandas as pd

# In [9]:
def midpoint(value):
    """
    Convert a scalar or an 'a-b' range string to a single float.

    A value that ``float()`` accepts (numbers, numeric strings, including
    negative numbers such as '-5' or exponents such as '1e-5') is returned
    as a float. A string of the form 'a-b' is returned as the midpoint
    ``(a + b) / 2``. Anything else (non-numeric text, None, malformed
    ranges such as '1-2-3') is returned unchanged.
    """
    # Try the plain conversion first so a leading minus sign or an exponent
    # is not mistaken for a range separator. TypeError covers None and other
    # non-numeric objects that float() rejects outright.
    try:
        return float(value)
    except (TypeError, ValueError):
        pass

    if isinstance(value, str) and '-' in value:
        try:
            low, high = map(float, value.split('-'))
        except ValueError:
            # Wrong number of parts or a non-numeric part: leave untouched.
            return value
        return (low + high) / 2

    return value

def midpoint_column(df, features=['length', 'wspan', 'weight','eggs']):
    """
    Return a copy of ``df`` in which every column named in ``features``
    has been passed element-wise through ``midpoint``.

    The input frame is left untouched. Every name in ``features`` must be
    an existing column.
    """
    converted = df.copy()
    for column in features:
        series = converted[column]
        converted[column] = series.apply(midpoint)
    return converted

def add_bmi(df):
    """
    Return a copy of ``df`` extended with a 'bmi' column.

    bmi is computed row-wise as weight divided by the square of length;
    the input frame is not modified.
    """
    result = df.copy()
    length_squared = result["length"] ** 2
    result['bmi'] = result["weight"] / length_squared
    return result


def add_wsi(df):
    """
    Return a copy of ``df`` extended with a 'wsi' column.

    wsi is computed row-wise as wspan divided by length; the input frame
    is not modified.
    """
    result = df.copy()
    wingspan = result["wspan"]
    body_length = result["length"]
    result['wsi'] = wingspan / body_length
    return result

def binary_features(df, features=['diver', 'long-billed', 'webbed-feet', 'long-legs', 'wading-bird', 'plunge-dives', 'sim']):
    """
    Turn yes/no columns into item labels on a copy of ``df``.

    For each name in ``features`` that is a column of ``df``, the values
    are stripped and lower-cased, then:

    * 'sim' column: 'yes' -> 'similar', 'no' -> 'nonsimilar'
    * any other column: 'yes' -> the column name, 'no' -> None

    Values other than 'yes'/'no' are kept (in their stripped, lower-cased
    form). Names in ``features`` that are not columns are skipped. The
    input frame is not modified.
    """
    df = df.copy()
    for feature in features:
        if feature not in df.columns:
            continue

        # Exactly one mapping is applied per column. 'sim' keeps a label
        # for both outcomes, so it must not also go through the generic
        # yes/no mapping.
        if feature == 'sim':
            mapping = {'yes': 'similar', 'no': 'nonsimilar'}
        else:
            mapping = {'yes': feature, 'no': None}

        normalized = df[feature].str.strip().str.lower()
        df[feature] = normalized.replace(mapping)
    return df

def numerical_features_std(df, features=['bmi', 'wsi', 'AR', 'wload', 'eggs']):
    """
    Discretise numeric columns into 'high-<name>' / 'low-<name>' labels.

    For each name in ``features`` that is a column of ``df``, the column's
    mean and standard deviation (``Series.mean`` / ``Series.std``) are
    computed. Values at or above mean + std become 'high-<name>', values
    at or below mean - std become 'low-<name>', and everything else
    becomes None. Works on a copy; the input frame is not modified.
    """
    out = df.copy()
    present = [name for name in features if name in out.columns]

    for name in present:
        column = out[name]
        center = column.mean()
        spread = column.std()
        upper = center + spread
        lower = center - spread

        def bucket(x):
            # The upper bound is checked first, matching the priority used
            # when both bounds coincide.
            if x >= upper:
                return f'high-{name}'
            if x <= lower:
                return f'low-{name}'
            return None

        out[name] = column.apply(bucket)
    return out

def multivalue_features(df, features=[]):
    """
    Suffix categorical values with their column name on a copy of ``df``.

    For each name in ``features`` that is a column of ``df``, spaces are
    removed from every value and the result becomes '<value>-<name>'.
    Missing cells (NaN/None), empty strings and the literal text 'nan'
    (any case) become None. Names that are not columns are skipped. The
    input frame is not modified.

    NOTE(review): for a comma-separated cell such as 'a,b' the suffix is
    appended once to the whole string ('a,b-<name>'), so only the last
    item sits next to the suffix. Confirm whether each item should be
    tagged individually.
    """
    df = df.copy()
    for feature in features:
        if feature not in df.columns:
            continue

        cleaned = df[feature].str.replace(" ", "", regex=False)

        def tag(x, feature=feature):
            # Missing cells come back from the .str accessor as float NaN,
            # which is truthy and has no .lower(); only real, non-empty
            # strings are tagged.
            if isinstance(x, str) and x and x.lower() != 'nan':
                return f"{x}-{feature}"
            return None

        df[feature] = cleaned.apply(tag)
    return df

def feature_extraction(df):
    """
    Run the full feature pipeline on ``df`` and return the result.

    Steps, in order: range midpoints, derived 'bmi' and 'wsi' columns,
    removal of the raw 'weight', 'length', 'wspan' and 'species' columns,
    yes/no labelling, high/low labelling of numeric columns, and
    column-name suffixing of the multi-value categorical columns.
    """
    dropped_columns = ['weight', 'length', 'wspan','species']
    multivalue_columns = ['belly','back', 'ftype', 'billcol', 'legcol', 'incub', 'ccare']
    return (
        df.pipe(midpoint_column)
        .pipe(add_bmi)
        .pipe(add_wsi)
        .drop(columns=dropped_columns)
        .pipe(binary_features)
        .pipe(numerical_features_std)
        .pipe(multivalue_features, features=multivalue_columns)
    )

# In [10]:
# Read the semicolon-separated CSV and run it straight through the
# feature-extraction pipeline.
df = feature_extraction(pd.read_csv("birds2025ext.csv", sep=';'))

def create_transaction_file(df, filename="features.txt"):
    """
    Write ``df`` to ``filename`` as one space-separated transaction per row.

    Each non-missing cell is converted to text, has its spaces removed and
    is split on commas; every non-empty piece becomes one item of the
    row's transaction. Missing cells (None, NaN, NA) contribute nothing.
    Rows that yield no items are not written.
    """
    with open(filename, "w") as f:
        for _, row in df.iterrows():
            transaction = []
            for value in row:
                # `value != None` lets NaN through, which would emit a
                # bogus 'nan' item for every missing cell; pd.isna covers
                # None and NaN alike.
                if pd.isna(value):
                    continue
                text = str(value).replace(" ", "")
                parts = (part.strip() for part in text.split(','))
                transaction.extend(part for part in parts if part)
            if transaction:
                f.write(" ".join(transaction) + "\n")
                
# Write the extracted features out as a transaction file.
create_transaction_file(df, filename='features.txt')
