## Extracting test data from gdrive

In [1]:
import os 

In [2]:
root_dir = '/content/drive/MyDrive/automin-2021-confindential-data-main/task-A-elitr-minuting-corpus-en'
test_file_name = 'test.json'
test_data = os.path.join(root_dir, test_file_name)

In [3]:
import json

filehandle = open(test_data, 'r')

test_file = json.load(filehandle)

In [4]:
test_file.keys()

dict_keys(['meeting_en_test_009', 'meeting_en_test_028', 'meeting_en_test_027', 'meeting_en_test_026', 'meeting_en_test_025', 'meeting_en_test_024', 'meeting_en_test_023', 'meeting_en_test_022', 'meeting_en_test_021', 'meeting_en_test_020', 'meeting_en_test_019', 'meeting_en_test_017', 'meeting_en_test_004', 'meeting_en_test_016', 'meeting_en_test_012', 'meeting_en_test_005', 'meeting_en_test_013', 'meeting_en_test_015', 'meeting_en_test_008', 'meeting_en_test_006', 'meeting_en_test_002', 'meeting_en_test_010', 'meeting_en_test_018', 'meeting_en_test_011', 'meeting_en_test_003', 'meeting_en_test_014', 'meeting_en_test_001', 'meeting_en_test_007'])

In [5]:
def zip_documents(roles, utterances):
  sentences = zip(roles, utterances)
  extraction = [f'{role}: {utterance}' for role, utterance in sentences]
  return extraction

In [6]:
skipped_files = []
processed_files = dict()
length_dict = dict()
for document in test_file.keys():
  roles = test_file[document]['roles']
  utterances = test_file[document]['utterances']
  if len(roles) != len(utterances):
    skipped_files.append(document)
    continue
  processed_files[document] = dict()
  processed_files[document]['transcript'] = zip_documents(roles, utterances)
  length_dict[document]= len(processed_files[document]['transcript'])

In [7]:
def doc_partitioning(document, max_characters=1700):
  
  processed_dict = dict()
  processed_dict['part_0'] = ''
  identity_generator = 'part_'
  temp = ''
  count = 0
  for sentence in document:
    key = f'{identity_generator}{count}'
    temp = temp + sentence
    if len(temp) > max_characters:
      temp = ''
      count = count + 1
      key = f'{identity_generator}{count}'
      processed_dict[key] = ''
    processed_dict[key] = processed_dict[key] +'\n'+  sentence

  return processed_dict

In [8]:
!pip install transformers



In [9]:
from transformers import pipeline

summarizer = pipeline("summarization", model="lidiya/bart-large-xsum-samsum")

In [10]:
def apply_summarizer(processed_dict):
  Output = []
  for key in processed_dict.keys():
    result = summarizer(processed_dict[key])
    Output.append(result[0]['summary_text'])

  return Output

def split_Sentences(summarizer_output):
  
  Result = []

  for text in summarizer_output:
    sentences = text.split('.')
    for sentence in sentences:
      sentence = sentence.strip()
      if sentence != '':
        Result.append(sentence)

  return Result

In [11]:
import datetime 

def return_date():

  Date = f'Date: {datetime.datetime.now().strftime("%Y-%m-%d")}'

  return Date

In [12]:
def main_body(output):

  body = []

  body_header = 'SUMMARY- \n'
  for sentence in output:
    
    if len(sentence.split(' ')) > 3:
      body.append(f'-{sentence}')
    
  body_content = '\n'.join(body)  
  body_Block = f'{body_header}{body_content}'
  
  return body_Block

In [13]:
import re

def generate_person_list(Output):

  reg_ex = [r'PERSON[0-9]{1,2}']
  person_list = []

  for sentence in Output:
    result = re.search(reg_ex[0], sentence)
    try:
      person_list.append(result.group())
    except:
      pass

  return list(set(person_list))

def generate_attendes(person_list):

  attendee_header = 'ATTENDEES: '
  attendee_content = attendee_header + ', '.join(person_list)

  return attendee_content

In [14]:
def prepare_document(attendee_str, body, annotator = 'Nidhir'):

  Date_ = return_date()
  Document = f'{Date_}\n{attendee_str}\n\n\n{body}\n\nMinuted by: {annotator}'

  return Document

In [15]:
cd /content/drive/MyDrive/task_A_test_results

/content/drive/MyDrive/task_A_test_results


In [16]:
# main pipeline

# Allocate transcript
key = 'meeting_en_test_025'
transcript = processed_files[key]['transcript']

# document partitioning
processed_dict = doc_partitioning(transcript, max_characters=2000)

# apply summarizer to processed transcript
Output = apply_summarizer(processed_dict)
Output = split_Sentences(Output)

# Minute parts
person_List = generate_person_list(Output)
Attendees = generate_attendes(person_List) 
Main_body = main_body(Output)

# Assemle parts and preparing final minute:
DOCUMENT = prepare_document(Attendees, Main_body)
print(DOCUMENT)

To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /pytorch/aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)


Date: 2021-07-24
ATTENDEES: PERSON7, PERSON4, PERSON5, PERSON2, PERSON12, PERSON1, PERSON3, PERSON6


SUMMARY- 
-PERSON1 was in a call until the very last minute and his machine crashed, so he had to restart it
-Now he can hear people again
-His mother is writing the question for the German subti subti subtitle user study and she has some progress
-PERSON4 started to write the deliverables and asked PERSON5 to do the transcripts of by German ASR f for the German transcriber
-PERSON5 has just added the transcripts for the first 30 parts and for the next ten parts, but for some folders the audio format is wrong
-PERSON1 and PERSON5 want to implement natural shortening in their models
-PROJECT4 is not ready for speech translation
-PERSON1, PERSON5, PERSON2, PERSON6 and PERSON3 are looking for students for the PROJECT4 project
-PERSON12 works halfway in Prague and halfway in Brno, but he is working on other things
-PERSON1, PERSON6 and PERSON3 will meet tomorrow in the afternoon to have a 