In [25]:
import numpy as np
import pandas as pd

In [None]:
def month_to_num(month_str):
    """Translate a month name, or a 'month-month' range, into a month number.

    Parameters
    ----------
    month_str : object
        Expected to be an English month name ("March") or two month names
        joined by '-' ("March-April"). Case and surrounding whitespace are
        ignored.

    Returns
    -------
    int, float or None
        * the month number (1-12) for a single recognised month,
        * the arithmetic mean of the two month numbers for a range,
        * None for non-string input, blank strings, unknown month names,
          or anything that does not split into one or two non-empty parts.
    """
    month_names = (
        "january", "february", "march", "april", "may", "june",
        "july", "august", "september", "october", "november", "december",
    )
    lookup = {name: position for position, name in enumerate(month_names, start=1)}

    # Guard clauses: only non-blank strings can be parsed.
    if not isinstance(month_str, str):
        return None
    if month_str.strip() == '':
        return None

    # Split on '-' and keep only the non-empty, normalised pieces.
    tokens = []
    for piece in month_str.split('-'):
        cleaned = piece.strip()
        if cleaned:
            tokens.append(cleaned.lower())

    if len(tokens) == 1:
        return lookup.get(tokens[0])

    if len(tokens) == 2:
        first = lookup.get(tokens[0])
        second = lookup.get(tokens[1])
        # Both ends must be recognised months for the range to be usable.
        if first and second:
            return (first + second) / 2

    return None


def migration_times(df):
    """Replace the 'arrives' and 'leaves' month columns with categorical labels.

    Works on a copy of ``df``. Each cell is first converted with
    ``month_to_num`` and then classified:

    * 'arrives': month <= 3.5 -> "early-arrival", month >= 5 -> "late-arrival"
    * 'leaves' : month <= 8   -> "early-leave",   month >= 10 -> "late-leave"

    Values between the two thresholds, and values that could not be converted
    to a month number, become None.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'arrives' and 'leaves'.

    Returns
    -------
    pandas.DataFrame
        A new frame with both columns replaced by their labels.
    """

    def classify(month_num, early_max, late_min, early_label, late_label):
        # One shared classifier; the thresholds and labels are the only
        # things that differ between arrival and departure.
        if month_num is None:
            return None
        if month_num <= early_max:
            return early_label
        if late_min <= month_num:
            return late_label
        # In-between months (and NaN, which fails both comparisons) get no label.
        return None

    result = df.copy()

    arrival_months = result["arrives"].apply(month_to_num)
    result["arrives"] = arrival_months.apply(
        lambda m: classify(m, 3.5, 5, "early-arrival", "late-arrival")
    )

    departure_months = result["leaves"].apply(month_to_num)
    result["leaves"] = departure_months.apply(
        lambda m: classify(m, 8, 10, "early-leave", "late-leave")
    )

    return result



def midpoint(value):
    """Convert a cell to a float, using the midpoint for 'a-b' ranges.

    Parameters
    ----------
    value : object
        A number, a numeric string ("12.5"), or a range string ("10-20").

    Returns
    -------
    float or object
        * the mean of the two bounds when ``value`` is a string of the form
          "a-b" with both parts parseable as floats,
        * ``float(value)`` when the value as a whole is parseable as a float,
        * ``value`` unchanged when it cannot be converted (including None).
    """
    if isinstance(value, str) and '-' in value:
        try:
            low, high = map(float, value.split('-'))
        except ValueError:
            # Not a clean two-part range (e.g. "-5", "1e-3", "1-2-3").
            # Fall through so that a plain negative or scientific-notation
            # number is still parsed instead of being returned as a string.
            pass
        else:
            return (low + high) / 2
    try:
        return float(value)
    except (TypeError, ValueError):
        # TypeError covers None and other non-numeric objects, which would
        # otherwise abort a whole column conversion.
        return value


def midpoint_column(df, features=['length', 'wspan', 'weight','eggs']):
    """Apply ``midpoint`` element-wise to the selected columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    features : list of str
        Columns to convert. Every listed column must exist in ``df``
        (a missing column raises KeyError).

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which each listed column has been passed through
        ``midpoint``.
    """
    converted = df.copy()
    for column_name in features:
        original_values = converted[column_name]
        converted[column_name] = original_values.apply(midpoint)
    return converted

def add_bmi(df):
    """
    Return a copy of ``df`` with a 'bmi' column equal to weight / length^2.

    The input frame must provide the columns 'weight' and 'length'; it is
    left unmodified.
    """
    squared_length = df["length"] ** 2
    return df.assign(bmi=df["weight"] / squared_length)


def add_wsi(df):
    """
    Return a copy of ``df`` with a 'wsi' column equal to wspan / length.

    The input frame must provide the columns 'wspan' and 'length'; it is
    left unmodified.
    """
    ratio = df["wspan"] / df["length"]
    return df.assign(wsi=ratio)

def binary_features(df, features=['diver', 'long-billed', 'webbed-feet', 'long-legs', 'wading-bird', 'plunge-dives', 'sim']):
    """Turn yes/no columns into item labels suitable for a transaction file.

    For every listed column that exists in ``df`` the values are stripped and
    lower-cased, then:

    * column 'sim': 'yes' -> 'similar', 'no' -> 'nonsimilar'
    * any other column: 'yes' -> the column name, 'no' -> None

    Values other than 'yes'/'no' are kept (stripped and lower-cased).

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    features : list of str
        Candidate columns; names that are absent from ``df`` are skipped.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the listed columns recoded.
    """
    df = df.copy()
    for feature in features:
        if feature not in df.columns:
            continue

        # Pick exactly one mapping per column. Previously the 'sim' column
        # fell through and was run through the generic mapping a second time.
        if feature == 'sim':
            mapping = {'yes': 'similar', 'no': 'nonsimilar'}
        else:
            mapping = {'yes': feature, 'no': None}

        df[feature] = df[feature].str.strip().str.lower().replace(mapping)
    return df

def numerical_features_std(df, features=['bmi', 'wsi', 'AR', 'wload', 'eggs']):
    """Discretise numeric columns into 'high-<name>' / 'low-<name>' labels.

    For every listed column that exists in ``df`` the column mean and
    standard deviation (``Series.mean()`` / ``Series.std()``) are computed.
    Each value is then replaced by:

    * 'high-<name>' when value >= mean + std,
    * 'low-<name>'  when value <= mean - std,
    * None otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    features : list of str
        Candidate columns; names that are absent from ``df`` are skipped.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the listed columns replaced by labels.
    """
    result = df.copy()
    present = [name for name in features if name in result.columns]

    for name in present:
        column = result[name]
        center = column.mean()
        spread = column.std()
        upper = center + spread
        lower = center - spread

        def label(value, name=name, upper=upper, lower=lower):
            # The upper band is checked first, exactly as in the threshold
            # definition above.
            if value >= upper:
                return f'high-{name}'
            if value <= lower:
                return f'low-{name}'
            return None

        result[name] = column.apply(label)

    return result

def multivalue_features(df, features=[]):
    """Suffix categorical values with their column name ('brown' -> 'brown-belly').

    For every listed column that exists in ``df``:

    * spaces are removed from each string value,
    * the cell is split on ',' and every non-empty value gets the suffix
      '-<column name>'; the tagged values are re-joined with ',',
    * missing cells (None / NaN), empty strings and the literal text 'nan'
      become None.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    features : list of str
        Candidate columns; names that are absent from ``df`` are skipped.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the listed columns re-labelled.
    """
    df = df.copy()

    def tag_values(cell, feature):
        # NaN is a truthy float, so a plain truthiness test is not enough;
        # only real strings can be tagged.
        if not isinstance(cell, str):
            return None
        values = [v for v in cell.split(',') if v and v.lower() != 'nan']
        if not values:
            return None
        # Tag every value, not just the last one, so that a later split on
        # ',' yields fully qualified items.
        return ",".join(f"{v}-{feature}" for v in values)

    for feature in features:
        if feature in df.columns:
            compact = df[feature].str.replace(" ", "", regex=False)
            df[feature] = compact.apply(tag_values, args=(feature,))
    return df

def feature_extraction(df):
    """Run the complete feature-extraction pipeline on a raw frame.

    Steps, in order:

    1. ``midpoint_column`` - convert range/number strings to floats,
    2. ``add_bmi`` and ``add_wsi`` - add the derived ratio columns,
    3. drop the columns 'weight', 'length', 'wspan' and 'species',
    4. ``binary_features`` - recode the yes/no columns,
    5. ``numerical_features_std`` - discretise the numeric columns,
    6. ``multivalue_features`` - suffix the categorical columns,
    7. ``migration_times`` - label the arrival/leaving months.

    Returns the transformed frame produced by the last step.
    """
    dropped_columns = ['weight', 'length', 'wspan','species']
    categorical_columns = ['belly','back', 'ftype', 'billcol', 'legcol', 'incub', 'ccare']

    return (
        df.pipe(midpoint_column)
        .pipe(add_bmi)
        .pipe(add_wsi)
        .drop(columns=dropped_columns)
        .pipe(binary_features)
        .pipe(numerical_features_std)
        .pipe(multivalue_features, features=categorical_columns)
        .pipe(migration_times)
    )

In [27]:
# Load the semicolon-separated input file and run the feature-extraction pipeline.
df = pd.read_csv("birds2025ext.csv", sep=';')
# `df` is rebound here: from this point on it holds the extracted features, not the raw data.
df = feature_extraction(df)

def create_transaction_file(df, filename="features.txt"):
    """Write ``df`` as a transaction file: one space-separated line per row.

    For each row, every non-missing cell is converted to text, spaces are
    removed, and the text is split on ',' into individual items. Rows that
    yield no items are not written.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose cells are item labels (or missing).
    filename : str
        Path of the output file; an existing file is overwritten.
    """
    # Explicit encoding so the output does not depend on the platform default.
    with open(filename, "w", encoding="utf-8") as f:
        for _, row in df.iterrows():
            transaction = []
            for value in row:
                # Skip None *and* NaN/NA: `value != None` is True for NaN,
                # which used to put the literal item "nan" into the output.
                if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
                    continue
                text = str(value).replace(" ", "")
                parts = [v.strip() for v in text.split(',') if v.strip()]
                transaction.extend(parts)
            if transaction:
                f.write(" ".join(transaction) + "\n")
                
# Write the extracted features to 'features.txt' (one line per row that has at least one item).
create_transaction_file(df, 'features.txt')                
