# In [3]:
import pandas as pd 

# In [4]:
def assign_positions(df: pd.DataFrame, target_col: str) -> None:
    """Appends an occurrence position to every item of the target column, in place.

    Within one dialog, each item is renamed to '<item>_<position>'. The first run
    of an item gets position 1. If the item is repeated consecutively, the
    position stays the same. If the item shows up again after a different item,
    its position is increased by 1. Positions restart from 1 whenever the value
    of 'dialog_id' changes from one row to the next.

    NB: the input dataframe is modified; the target column is overwritten with
    the positioned names.

    Args:
        df: Pandas dataframe with dialogs, dialog ids (must be 'dialog_id') and target column (goal/speech function/dialog act).
            Rows are processed in their current order; any index is accepted.
        target_col: The name of the target column to which positions are assigned.
    """

    position_counter = {}
    new_names = []
    prev_id = None
    prev_value = None
    # Walk the two columns directly and remember the previous value ourselves.
    # Looking the previous row up by index label (df.iloc[label - 1]) is only
    # correct for a default 0..n-1 index and breaks on filtered/re-indexed frames.
    for dialog_id, value in zip(df['dialog_id'], df[target_col]):
        if prev_id != dialog_id:
            # A new dialog starts: positions are counted from scratch.
            position_counter = {}
            prev_id = dialog_id

        if value not in position_counter:
            position_counter[value] = 1
        elif value != prev_value:
            # Seen before in this dialog, but not in the directly preceding row:
            # this is a new run of the same item.
            position_counter[value] += 1

        prev_value = value
        new_names.append(f"{value}_{position_counter[value]}")

    df[target_col] = new_names
    

def cut_open_goal(goal: str) -> str:
    """Preprocesses open goals by leaving the first word and renaming some of them.

    Used for preprocessing goals from open goal annotation datasets.
    Simplified for now, more preprocessing tbd later.

    Args:
        goal: A goal. Non-string values (e.g. NaN for an empty cell) are accepted.

    Returns:
        The first whitespace-separated word of the goal, or 'agrees' if that word
        contains 'agreement'. An empty string is returned for non-string input
        and for empty or whitespace-only goals.
    """

    if not isinstance(goal, str):
        return ''

    words = goal.split()
    if not words:
        # Empty or whitespace-only goal: there is no first word to keep.
        return ''

    first_word = words[0]
    # NOTE(review): this is a substring match, so a first word such as
    # 'disagreement' is also renamed to 'agrees' — confirm this is intended.
    if 'agreement' in first_word:
        return 'agrees'
    return first_word


def large_table_to_goals_df(file_name: str, anno_type: str) -> pd.DataFrame:
    """Builds a per-utterance goal table with a per-dialog goal summary.

    Reads a tab-separated annotation file, keeps the dialog/utterance columns and
    the goal column of the chosen annotation type, assigns positions to the goals
    with assign_positions, and attaches to every utterance a summary string of the
    form '<dialog_id>: [1] <goal> [2] <goal> ...' that lists the distinct
    positioned goals of its dialog in order of first appearance.

    For 'open_goal' the raw goals are first shortened with cut_open_goal into a new
    'open_goal_cut' column and positions are assigned to that column; the raw
    'open_goal' column is kept as read. For 'durec' positions are written into
    'durec_goal_1' itself.

    Args:
        file_name: Path to the file to be preprocessed. The file must be of tsv format
            and contain the columns 'dialog_id', 'utt_id', 'speaker', 'utt' and the
            goal column ('durec_goal_1' or 'open_goal').
        anno_type: The type of annotation. Must be 'durec' or 'open_goal'.

    Returns:
        A preprocessed pandas DataFrame with columns 'id_and_goal' (dialog id, dialog summary in terms of goals),
        'utt_id', goal column(s), 'speaker', 'utt'. For 'open_goal' the goal columns are
        'open_goal_cut' followed by 'open_goal'; for 'durec' it is 'durec_goal_1'.

    Raises:
        TypeError: If anno_type is neither 'durec' nor 'open_goal'.
    """

    if anno_type == 'durec':
        target_col = 'durec_goal_1'
    elif anno_type == 'open_goal':
        target_col = 'open_goal'
    else:
        # The exception type is kept as TypeError so existing callers that catch it keep working.
        raise TypeError("You specified a non-existent anno_type. anno_type must be 'durec' or 'open_goal'")

    data = pd.read_csv(file_name, sep='\t')
    # Select the needed columns first, then copy only those (not the whole table).
    data_smaller = data[['dialog_id', 'utt_id', 'speaker', 'utt', target_col]].copy()

    if anno_type == 'open_goal':
        # Positions go to the shortened goals; the raw goal column stays untouched.
        goal_col = 'open_goal_cut'
        data_smaller[goal_col] = data_smaller[target_col].apply(cut_open_goal)
        output_cols = ['id_and_goal', 'utt_id', goal_col, target_col, 'speaker', 'utt']
    else:
        goal_col = target_col
        output_cols = ['id_and_goal', 'utt_id', target_col, 'speaker', 'utt']
    assign_positions(data_smaller, goal_col)

    df_grouped = data_smaller.groupby('dialog_id')[goal_col].apply(list).reset_index(name='goal_list')
    # dict.fromkeys drops repeated goals while keeping the order of first appearance.
    df_grouped['goal_list'] = df_grouped['goal_list'].apply(lambda items: list(dict.fromkeys(items)))
    goal_summaries = df_grouped['goal_list'].apply(
        lambda goals: ' '.join(f'[{number}] {goal}' for number, goal in enumerate(goals, start=1))
    )
    df_grouped['id_and_goal'] = df_grouped['dialog_id'].map(str) + ': ' + goal_summaries

    # The two frames share only the 'dialog_id' column, so that is the merge key.
    # how="outer" is kept as in the original so the resulting rows are unchanged.
    return data_smaller.merge(df_grouped, how="outer")[output_cols]

# In [7]:
# Build the goal tables for both annotation types ('durec' and 'open_goal')
# from the same annotation file.
durec_goals_df = large_table_to_goals_df('dialog_data/dd_annotation_results.tsv', 'durec')
open_goals_df = large_table_to_goals_df('dialog_data/dd_annotation_results.tsv', 'open_goal')

# In [9]:
# Save both goal tables as tab-separated files.
# NOTE(review): no index= argument is passed, so pandas' default applies and the
# row index is presumably written as an extra first column — confirm this is wanted.
# NOTE(review): assumes the 'dialog_data/processed/' directory already exists — verify.
durec_goals_df.to_csv('dialog_data/processed/durec_fin.tsv', sep='\t')
open_goals_df.to_csv('dialog_data/processed/open_fin.tsv', sep='\t')