In [1]:
import pandas as pd
import numpy as np

from itertools import chain, combinations

from datetime import datetime
from datetime import timedelta

def powerset(iterable):
    "powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)"
    s = list(iterable)
    return chain.from_iterable(combinations(s, r) for r in range(len(s)+1))

df = pd.read_csv('./transcriptions/dataframes/De_Dienst_afl_1.csv', sep=';')
df['start'] = pd.to_timedelta(df['start'])
df['end'] = pd.to_timedelta(df['end'])
df['DI'] = np.nan
df['label'] = np.nan

# print(df.head())

label_dict = {0:'None', 1:'recall', 2:'analysis', 3:'selection', 4:'request'}
label_list = [None]*df.index.size



def label_df(df, start_index, end_index):
    label_list = [None]*df.index.size
    # print('help')

    
    l_powerset = set(powerset(label_dict.keys()))
    print(l_powerset)
    
    di_range = df.loc[start_index : end_index]
    # print(di_range)
    for i, r in di_range.iterrows():

        if i > 0:
            print(f"{df.loc[i-1]['speaker']}: {df.loc[i-1]['transcription']}")    
        
        print(f" --> {r['speaker']}: {r['transcription']}")

        if i < df.index.size-1:
            print(f"{df.loc[i+1]['speaker']}: {df.loc[i+1]['transcription']}")  
        label = list(input(f'select one or more labels: {label_dict}').split(','))
        print(label)
        label = tuple([int(i) for i in label])
        while label not in l_powerset:
            label = list(input(f'Selected index {label} is outside the scope. Please select one or more from this list: {label_dict}.').split(','))
            label = tuple([int(i) for i in label])

        label_list[i] = str(label)
    print(di_range.index.values)
    
    print(label_list)

    label_df = pd.DataFrame(index=df.index, data={'label':label_list})
    print(label_df.index.values)
    return label_df

DIs = {
    # 'DI01': {'episode': 1, 'start':timedelta(minutes=15, seconds=31), 'end': timedelta(minutes=17, seconds=52)},
    # 'DI02': {'episode': 1, 'start':timedelta(minutes=18, seconds=13), 'end': timedelta(minutes=27, seconds=11)},
    'DI03': {'episode': 2, 'start':timedelta(minutes=3, seconds=45), 'end': timedelta(minutes=8, seconds=2)},
    'DI04': {'episode': 2, 'start':timedelta(minutes=8, seconds=8), 'end': timedelta(minutes=16, seconds=19)},
    'DI05': {'episode': 3, 'start':timedelta(minutes=2, seconds=26), 'end': timedelta(minutes=14, seconds=37)},
    'DI06': {'episode': 4, 'start':timedelta(minutes=0, seconds=36), 'end': timedelta(minutes=16, seconds=12)},
    'DI07': {'episode': 4, 'start':timedelta(minutes=18, seconds=5), 'end': timedelta(minutes=34, seconds=7)},
    'DI08': {'episode': 5, 'start':timedelta(minutes=1, seconds=7), 'end': timedelta(minutes=7, seconds=31)},
    'DI09': {'episode': 5, 'start':timedelta(minutes=15, seconds=5), 'end': timedelta(minutes=17, seconds=11)},
    'DI10': {'episode': 5, 'start':timedelta(minutes=27, seconds=2), 'end': timedelta(minutes=28, seconds=22)},
    }

episode_files = {
    1: './transcriptions/dataframes/De_Dienst_afl_1.csv',
    2: './transcriptions/dataframes/De_Dienst_afl_2.csv',
    3: './transcriptions/dataframes/De_Dienst_afl_3.csv',
    4: './transcriptions/dataframes/De_Dienst_afl_4.csv',
    5: './transcriptions/dataframes/De_Dienst_afl_5.csv',
    6: './transcriptions/dataframes/De_Dienst_afl_6.csv', 
}


def write_focus_group_text(df, start_index, end_index, di):
    relevant_part = df.loc[df['DI'] == di]
    # print(relevant_part['label'].values)
    file = di
    full_path = "./transcriptions/focus_group_text/" + file + '.txt'
    with open(full_path, "w") as text_file:
        text_file.write(f'Decision instance: {di}\n')
        text_file.write('\n---------- RECALL ----------\n')
        sharing_part = df.loc[df['label'].str.contains('1', na=False) & (df['DI'] == di)]
        # print(relevant_part)
        # print(sharing_part)
        for i, row in sharing_part.iterrows():
            print(row)
            line = f"{row['speaker']}({row['label']}): {row['transcription']}\n"
            text_file.write(line)
        text_file.write('\n---------- ANALYSIS ----------\n')
        analysis_part = df.loc[df['label'].str.contains('2', na=False) & (df['DI'] == di)]
        for i, row in analysis_part.iterrows():
            line = f"{row['speaker']}({row['label']}): {row['transcription']}\n"
            text_file.write(line)

current_episode = 0
# df = pd.read_csv(episode_files[1], sep=';')
# df['start'] = pd.to_timedelta(df['start'])
# df['end'] = pd.to_timedelta(df['end'])

for di, di_info in DIs.items():
    if current_episode != di_info['episode']:
        if current_episode != 0:
            df.to_csv(f'./transcriptions/dataframes_labeled/{episode_files[current_episode].split("/")[-1]}', sep=';')
            write_focus_group_text(df, 10, 100, di)
        current_episode = di_info['episode']
        df = pd.read_csv(episode_files[current_episode], sep=';')
        df['start'] = pd.to_timedelta(df['start'])
        df['end'] = pd.to_timedelta(df['end'])
        df['DI'] = ""
        df['label'] = ""
    start_index = df[df['start'] >= di_info['start']].index[0]
    end_index = df[df['end'] <= di_info['end']].index[-1]
    di_labels = label_df(df, start_index, end_index)
    df.loc[start_index:end_index, 'label'] = di_labels
    df.loc[start_index:end_index, 'DI'] = di
    # print(df['label'].values)
    write_focus_group_text(df, start_index, end_index, di)


df.to_csv(f'./transcriptions/dataframes_labeled/{episode_files[current_episode].split("/")[-1]}', sep=';')


{(0, 1, 3, 4), (2,), (3, 4), (0, 1, 3), (1, 2, 3, 4), (0, 2), (0, 1, 2, 3, 4), (1, 3), (0, 2, 4), (0, 1, 2, 4), (4,), (1,), (0, 2, 3, 4), (0, 1, 2), (1, 2, 4), (0, 1), (2, 4), (1, 2), (0, 4), (0, 2, 3), (0, 1, 2, 3), (0,), (3,), (0, 1, 4), (0, 3, 4), (0, 3), (1, 2, 3), (1, 4), (1, 3, 4), (2, 3), (2, 3, 4), ()}
Liesbeth:  Waar ik de komende afleveringen als bewerker keuzes in moet gaan maken. Dat begint dus allemaal bij een telefoontje van een bezorgde burger.
 --> Bart:  Er is een melding binnengekomen. Bij ons frontoffice. Daar hebben we het volgens mij net al even aan gestipt. Ons frontoffice is het punt waar alle inlichtingen bij ons binnenkomen. En waar ook een eerste oordeel gemaakt wordt. Welk team zou dit kunnen? Is dit belangrijk genoeg? En uiteindelijk is dit een casus. Waarvan het frontoffice in ieder geval geoordeeld heeft. Dit moet bij dit CT-team terechtkomen.
Liesbeth:  En het CT-team, dat is een contraterrorisme-team. Het is een melding vanuit het land.
['']


ValueError: invalid literal for int() with base 10: ''

In [40]:
write_focus_group_text(df, 10, 100, 'di01')

./transcriptions/dataframes/De_Dienst_afl_1.csv


In [48]:
df

Unnamed: 0,seg_id,start,end,speaker,transcription,translation,DI
0,De_Dienst_afl_1_seg_0,0 days 00:00:00,0 days 00:00:21,Bart,"De eerste keer dat ik een telefoon tap, dat i...","The first time I heard a phone call, that I c...",
1,De_Dienst_afl_1_seg_1,0 days 00:00:30,0 days 00:01:11,Liesbeth,Zoals vrijwel iedereen in Nederland ben ik on...,"Like almost everyone in the Netherlands, I am...",
2,De_Dienst_afl_1_seg_2,0 days 00:01:11,0 days 00:01:25,Bart,Er is een melding binnengekomen. Tijdens een ...,"A message has arrived. During a practice, it ...",
3,De_Dienst_afl_1_seg_3,0 days 00:01:27,0 days 00:01:43,Liesbeth,Je hoort de stem van Bart. Hij is teamhoofd b...,You can hear Bart's voice. He is head of the ...,
4,De_Dienst_afl_1_seg_4,0 days 00:01:43,0 days 00:01:44,Bart,Kan ik je vertellen?,I find it all quite exciting.,
...,...,...,...,...,...,...,...
99,De_Dienst_afl_1_seg_99,0 days 00:26:27,0 days 00:26:58,Ton,Want voor hetzelfde geld hebben andere teams ...,... because for the same money ... ... you kn...,
100,De_Dienst_afl_1_seg_100,0 days 00:26:58,0 days 00:27:11,Liesbeth,"Oké, dus ik moet steeds een gepast middel zoe...","... then it is not allowed ... Okay, so I alw...",
101,De_Dienst_afl_1_seg_101,0 days 00:27:12,0 days 00:28:09,Ton,"Ik zou eigenlijk zeggen, blijf je vooral afvr...","... does Ton have one last tip for me? Well, ...",
102,De_Dienst_afl_1_seg_102,0 days 00:28:10,0 days 00:28:19,Music,MUZIEK,"... and what you want to get out of it. Okay,...",


In [44]:
di_range[['DI']]


Unnamed: 0,DI
62,DI02
63,DI02
64,DI02
65,DI02
66,DI02
67,DI02
68,DI02
69,DI02
70,DI02
71,DI02


In [7]:
print(label_list)

[(0,), (0,), (1,), (1,), (1,), (0,), (0,), (0,), (1,), (0,), (1, 2), (2,), (1,), (0,), (1,), (1, 2), (1,), (0,), (1,), (2,), (1,), (1, 2), (1,), (0,), (1,), (0,), (1,), (0,), (1,), (0,), (0,), (0,), (1,), (0,), (1,), (0,), (1,), (1, 2), (0,), (1,), (0,), (1,), (0,), (1,), (1, 2), (1,), None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None]
