In [None]:
from google.colab import drive
import os
from zipfile import ZipFile
import json
import spacy
from spacy.matcher import Matcher

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Folder on the mounted Drive that holds the datasets read below.
dataset_folder_path = '/content/drive/MyDrive/Temporal Chatbot BTP/BTP Datasets'

In [None]:
# Unpack every .zip archive in the dataset folder into a sibling directory
# named after the archive (e.g. "swda.zip" -> "swda/").
for f in os.listdir(dataset_folder_path):
  if f.endswith('.zip'):
    # Equivalent to f[:-4] for a name ending in ".zip".
    archive_stem = os.path.splitext(f)[0]
    print(f'Extracting {archive_stem}')
    with ZipFile(os.path.join(dataset_folder_path, f), 'r') as zObject:
      zObject.extractall(path=os.path.join(dataset_folder_path, archive_stem))

Extracting MSRSocialMediaConversationCorpus
Extracting SBCorpus
Extracting switchboard
Extracting swda
Extracting personachat
Extracting nalue


In [None]:
# List the dataset folder to confirm the archives were extracted next to the .zip files.
for f in os.listdir(dataset_folder_path):
  print(f)

BIGBENCH_temporal_ordering.json
BIGBENCH_Tense.json
BIGBENCH_logical_sequence.json
BIGBENCH_cause1.json
BIGBENCH_cause2.json
BIGBENCH_cause3.json
human_chat.txt
MSRSocialMediaConversationCorpus.zip
amazon-qa
friends
swb1_dialogact_annot.tar.gz
SBCorpus.zip
switchboard.zip
swda.zip
personachat.zip
dstc8
conai_3
dstc2
topical_chat
nalue.zip
multiWOZ2.2
CoQA
MSRSocialMediaConversationCorpus
SBCorpus
switchboard
swda
personachat
nalue


In [None]:
# Read the raw chat log, one entry per line (trailing newlines kept).
# The encoding is pinned so the result does not depend on the platform default.
with open(os.path.join(dataset_folder_path, 'human_chat.txt'), 'r', encoding='utf-8') as f:
  lines = f.readlines()

In [None]:
# Split the chat log into one utterance list per speaker.
# The text is sliced at the exact prefix length and then stripped, so the
# first character of the utterance is kept whatever the spacing after the colon.
human1, human2 = [], []
for dialog in lines:
  for prefix, bucket in (('Human 1:', human1), ('Human 2:', human2)):
    if dialog.startswith(prefix):
      bucket.append(dialog[len(prefix):].strip())
      break

In [None]:
# Number of utterances collected for each speaker.
len(human1), len(human2)

(764, 730)

In [None]:
# Load the BIG-bench temporal-ordering task file.
# The encoding is pinned so parsing does not depend on the platform default.
with open(os.path.join(dataset_folder_path, 'BIGBENCH_temporal_ordering.json'), 'r', encoding='utf-8') as f:
  temporal_ordering_data = json.load(f)

In [None]:
# Inspect the top-level keys of the loaded task file.
temporal_ordering_data.keys()

dict_keys(['canary', 'name', 'description', 'keywords', 'preferred_score', 'metrics', 'example_input_prefix', 'example_output_prefix', 'append_choices_to_input', 'examples'])

In [None]:
# One record per task example. 'input' is the example text with its first two
# lines and its last line removed; 'output' is the example's target.
events = [
    {
        'input': "\n".join(example['input'].split('\n')[2:-1]),
        'output': example['target'],
    }
    for example in temporal_ordering_data['examples']
]

In [None]:
# Load the spaCy pipeline used for tagging, chunking and entity recognition below.
nlp = spacy.load('en_core_web_sm')

In [None]:
# Parse the first example; the exploratory cells below inspect this `doc`.
doc = nlp(events[0]['input'])

In [None]:
# Show the raw text of the first example.
print(events[0]['input'])

Susan woke up at 7am.
Linda saw Susan driving to the water park from 7am to 11am.
John saw Susan buying clothes at the mall from 11am to 12pm.
Jessica saw Susan taking photos near the Eiffel Tower from 12pm to 1pm.
Steven saw Susan buying lunch at the deli from 1pm to 2pm.
Thomas saw Susan reading at the library from 2pm to 6pm.
The coffee shop was closed after 9pm.


In [None]:
# Print every noun chunk spaCy found, each followed by a blank line.
for noun_chunk in doc.noun_chunks:
    print(noun_chunk.text, end="\n\n")

Susan

am

Linda

Susan

the water park

7am

am

John

Susan

clothes

the mall

11am

12pm

Jessica

Susan

photos

the Eiffel Tower

12pm

pm

Steven

Susan

lunch

the deli

1pm to 2pm

Thomas

Susan

the library

2pm to 6pm

The coffee shop

9pm

what times

Susan

the coffee shop



In [None]:
# Print every token tagged as a main verb or an auxiliary, with its POS tag.
verbs = ["VERB", "AUX"]
for token in doc:
    if token.pos_ not in verbs:
        continue
    print(token.text, token.pos_)

woke VERB
saw VERB
driving VERB
saw VERB
buying VERB
saw VERB
taking VERB
saw VERB
buying VERB
saw VERB
reading VERB
was AUX
closed VERB
could AUX
have AUX
gone VERB


In [None]:
# Print every named entity together with its label.
for entity in doc.ents:
    print(entity.text, entity.label_)

Susan PERSON
7am TIME
Linda PERSON
Susan PERSON
7am to 11am TIME
John saw Susan PERSON
11am to 12pm TIME
Jessica PRODUCT
Susan PERSON
the Eiffel Tower FAC
12pm to 1pm TIME
Steven PERSON
Susan PERSON
1pm to 2pm TIME
Thomas PERSON
Susan PERSON
2pm to 6pm TIME
9pm TIME
Susan PERSON


In [None]:
# Load a previously saved transcript dictionary.
# NOTE(review): 'friends_total_transcript.json' does not appear in the dataset
# folder listing printed earlier, and `total_transcript` is rebuilt from the
# per-season files further down. Confirm where this file comes from and which
# cell is meant to run first on a fresh kernel.
with open(os.path.join(dataset_folder_path,'friends_total_transcript.json'), 'r', encoding='utf-8') as f:
  total_transcript = json.load(f)

In [None]:
# Load the pipeline once and expose it under both names used below.
# The matchers are built from `nlp_matcher.vocab` but applied to docs produced
# by `nlp`; sharing one pipeline guarantees both use the same Vocab object.
nlp = spacy.load('en_core_web_sm')
nlp_matcher = nlp

In [None]:
verb_matcher = Matcher(nlp_matcher.vocab)

# Token specs shared by the verb-phrase patterns below.
_OPT_AUX = {"POS": "AUX", "OP":"?"}                         # optional auxiliary
_VERB = {"POS": "VERB"}                                     # exactly one verb
_ADP = {"POS": "ADP"}                                       # one adposition
_OPT_DET = {"POS": "DET", "OP": "?"}                        # optional determiner
_NOUNS = {"POS": {"IN": ["NOUN", "PROPN"]}, "OP":"+"}       # one or more (proper) nouns

action_verb_pattern = [
    # [aux] verb+
    [_OPT_AUX, {"POS": "VERB", "OP": "+"}],
    # [aux] verb [det] noun+
    [_OPT_AUX, _VERB, _OPT_DET, _NOUNS],
    # [aux] verb adposition
    [_OPT_AUX, _VERB, _ADP],
    # [aux] verb adposition [det] noun+
    [_OPT_AUX, _VERB, _ADP, _OPT_DET, _NOUNS],
]
verb_matcher.add("verb-phrases", action_verb_pattern)

In [None]:
# Matches one token at a time whose entity type is TIME; adjacent hits are
# joined later by connect_sequences.
time_matcher = Matcher(nlp_matcher.vocab)
time_pattern = [[{"ENT_TYPE": "TIME"}]]
time_matcher.add("time-phrases", time_pattern)

In [None]:
def find_longest_supersequence(values):
  """Collapse overlapping spans, preferring the longer one.

  Args:
    values: sequence of ``(item, (start, end))`` pairs, processed in order.

  Returns:
    A new list built by comparing each pair with the most recently kept one:
      * same start and a larger end: it replaces the kept pair;
      * larger start and the same end: it is dropped;
      * anything else: it is appended.
    An empty input yields an empty list.
  """
  if not values:
    return []

  longest_values = [values[0]]
  for value in values[1:]:
    last_value = longest_values[-1]
    same_start = last_value[1][0] == value[1][0]
    same_end = last_value[1][1] == value[1][1]
    if same_start and last_value[1][1] < value[1][1]:
      # Replace by position: list.remove() would delete the first element that
      # compares equal, which may be an earlier entry.
      longest_values[-1] = value
    elif same_end and last_value[1][0] < value[1][0]:
      # Suffix of the span already kept; skip it.
      continue
    else:
      longest_values.append(value)
  return longest_values

In [None]:
def connect_sequences(values):
  """Join spans that touch end-to-start into a single entry.

  Args:
    values: sequence of ``(item, (start, end))`` pairs, processed in order.

  Returns:
    A new list. When a pair starts exactly where the most recently kept pair
    ends, the two are merged into ``("<last> <current>", (last_start, end))``,
    with both items converted via ``str`` and joined by a single space.
    Pairs that are never merged are returned unchanged, so their item keeps
    its original type. An empty input yields an empty list.
  """
  if not values:
    return []

  range_values = [values[0]]
  for value in values[1:]:
    last_value = range_values[-1]

    if value[1][0] == last_value[1][1]:
      merged_text = " ".join([str(last_value[0]), str(value[0])])
      # Replace by position: list.remove() would delete the first element that
      # compares equal, which may be an earlier entry.
      range_values[-1] = (merged_text, (last_value[1][0], value[1][1]))
    else:
      range_values.append(value)
  return range_values

In [None]:
def event_annotate(input_line):
  """Pair the verb phrases found in a text with the time phrases found in it.

  The text is parsed with the module-level ``nlp`` pipeline. Verb phrases come
  from ``verb_matcher`` (overlaps collapsed, adjacent spans joined) and time
  phrases from ``time_matcher`` (adjacent spans joined). Prints "INPUT"
  followed by the text as a side effect.

  Args:
    input_line: text to annotate.

  Returns:
    A list of ``{'event': str, 'time': str}`` dicts. Pairing is purely
    positional: the i-th verb phrase goes with the i-th time phrase and any
    surplus on either side is dropped. The list is empty when the text has no
    verb phrase or no time phrase.
  """
  doc = nlp(input_line)

  verbs = [(doc[start:end], (start, end)) for _, start, end in verb_matcher(doc)]
  # Skip the helpers on an empty match list so texts without a verb or time
  # match (e.g. an empty utterance) produce no pairs instead of raising.
  if verbs:
    unique_verbs = find_longest_supersequence(find_longest_supersequence(verbs))
    range_uv = connect_sequences(unique_verbs)
  else:
    range_uv = []

  times = [(doc[start:end], (start, end)) for _, start, end in time_matcher(doc)]
  range_time = connect_sequences(times) if times else []

  print("INPUT")
  print(input_line)

  # An unmerged entry still holds a spaCy Span; str() makes both fields plain
  # strings so the result can be serialized as JSON.
  return [
      {'event': str(event[0]), 'time': str(time[0])}
      for event, time in zip(range_uv, range_time)
  ]

In [None]:
# Attach event/time annotations to every utterance of every scene.
# The results are written back into `total_transcript` itself, under the
# 'time_values' key of each utterance dict.
total_transcript_annot = dict()
for season_num, season in total_transcript.items():
    for episode_num, episode in season.items():
        for scene_num, scene in episode.items():
            for utterance_num, utterance in enumerate(scene):
                transcript = utterance['transcript']
                speaker = utterance['speaker']

                time_values = event_annotate(transcript)
                utterance['time_values'] = time_values

IndexError: list index out of range

In [None]:
# Annotate every temporal-ordering example with (event phrase, time phrase)
# pairs, keyed by the example's position in `events`.
total_events = dict()
for event_index, example in enumerate(events):
  # event_annotate runs the verb/time matcher pipeline and prints the
  # "INPUT" header followed by the text, so neither is repeated here.
  event_time_connect = event_annotate(example['input'])
  total_events[event_index] = event_time_connect

  print("OUTPUT")
  for pair in event_time_connect:
    print(pair['event'], ": ", pair['time'])
  print()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Hannah saw Richard buying a bike at the bike shop from 5pm to 6pm.
The soccer field was closed after 6pm.
OUTPUT
woke up :  5 am
saw Richard buying lunch :  5 am to 11 am
saw Richard walking in the garden :  1 pm to 4 pm
saw Richard watching a movie :  4 pm to 5 pm
saw Richard buying a bike :  5 pm to 6 pm
was closed after :  6 pm

INPUT
Andrew woke up at 7am.
William saw Andrew attending class at the school from 7am to 9am.
Richard saw Andrew waiting at the train station from 10am to 4pm.
Thomas saw Andrew buying a phone at the electronics store from 4pm to 8pm.
The gas station was closed after 8pm.
OUTPUT
woke up :  7 am
saw Andrew attending class :  7 am to 9 am
saw Andrew waiting at the train station :  10 am to 4 pm
saw Andrew buying a phone :  4 pm to 8 pm
was closed after :  8 pm

INPUT
Mark woke up at 5am.
Ashley saw Mark stretching at a yoga studio from 5am to 10am.
Anthony saw Mark buying a bike at the bike shop

In [None]:
# Inspect the event/time pairs extracted for the first example.
total_events[0]

[{'event': 'woke up', 'time': '7 am'},
 {'event': 'saw Susan driving to the water park', 'time': '7 am to 11 am'},
 {'event': 'saw Susan buying clothes', 'time': '11 am to 12 pm'},
 {'event': 'saw Susan taking photos', 'time': '12 pm to 1 pm'},
 {'event': 'saw Susan buying lunch', 'time': '1 pm to 2 pm'},
 {'event': 'saw Susan reading at the library', 'time': '2 pm to 6 pm'},
 {'event': 'was closed after', 'time': '9 pm'}]

In [None]:
# Save the event/time pairs for every example.
# TODO: pass each event_time_connect entry through HeidelTime to turn the
# free-text time phrases into normalized time values.
# The file is written relative to the current working directory, not to
# dataset_folder_path.
with open('temporal_ordering_split.json', 'w') as f:
  json.dump(total_events, f)

In [None]:
# Hard-coded list of time expressions with a normalized 'value' and a 'span'.
# Presumably pasted from a temporal tagger run on the first example (TODO confirm).
# NOTE(review): both '12pm' entries carry 'XXXX-XX-XXT24:00', whereas 12pm
# (noon) would normally be T12:00. Confirm whether this is the tagger's raw
# output before relying on these values.
time_instances = [{'text': '7am.', 'value': 'XXXX-XX-XXT07:00', 'span': (4, 5)},
 {'text': '7am', 'value': 'XXXX-XX-XXT07:00', 'span': (14, 15)},
 {'text': '11am.', 'value': 'XXXX-XX-XXT11:00', 'span': (16, 17)},
 {'text': '11am', 'value': 'XXXX-XX-XXT11:00', 'span': (26, 27)},
 {'text': '12pm.', 'value': 'XXXX-XX-XXT24:00', 'span': (28, 29)},
 {'text': '12pm', 'value': 'XXXX-XX-XXT24:00', 'span': (39, 40)},
 {'text': '1pm.', 'value': 'XXXX-XX-XXT13:00', 'span': (41, 42)},
 {'text': '1pm', 'value': 'XXXX-XX-XXT13:00', 'span': (51, 52)},
 {'text': '2pm.', 'value': 'XXXX-XX-XXT14:00', 'span': (53, 54)},
 {'text': '2pm', 'value': 'XXXX-XX-XXT14:00', 'span': (62, 63)},
 {'text': '6pm.', 'value': 'XXXX-XX-XXT18:00', 'span': (64, 65)},
 {'text': '9pm.', 'value': 'XXXX-XX-XXT21:00', 'span': (71, 72)}]

In [None]:
# Directory holding the per-season Friends transcript JSON files.
friends_dataset = os.path.join(dataset_folder_path, 'friends')

In [None]:
# File names in the Friends directory, in whatever order os.listdir returns them.
all_data = os.listdir(friends_dataset)

In [None]:
# Parsed season files, filled in by the loading loop below.
total_friends_data = dict()

In [None]:
# Load every season file under a 1-based key.
# os.listdir returns names in arbitrary order, so the names are sorted first;
# the zero-padded file names (friends_season_01.json, ...) then line up with
# the season number.
for season_key, season_file in enumerate(sorted(all_data), start=1):
  with open(os.path.join(friends_dataset, season_file), 'r', encoding='utf-8') as f:
    total_friends_data[season_key] = json.load(f)

In [None]:
# Confirm which season keys were loaded.
total_friends_data.keys()

dict_keys([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

In [None]:
# List the season files present in the Friends directory.
for f in os.listdir(friends_dataset):
  print(f)

friends_season_01.json
friends_season_02.json
friends_season_03.json
friends_season_04.json
friends_season_05.json
friends_season_06.json
friends_season_07.json
friends_season_08.json
friends_season_09.json
friends_season_10.json


In [None]:
# Load season 1 on its own to explore the file structure in the cells below.
# The encoding is pinned so parsing does not depend on the platform default.
with open(os.path.join(friends_dataset, 'friends_season_01.json'), 'r', encoding='utf-8') as f:
  friends_season1 = json.load(f)

In [None]:
# Top-level keys of a season file.
friends_season1.keys()

dict_keys(['season_id', 'episodes'])

In [None]:
# Identifier of the first episode in the season.
(friends_season1['episodes'][0]['episode_id'])

's01_e01'

In [None]:
# Keys of an episode record.
(friends_season1['episodes'][0]).keys()

dict_keys(['episode_id', 'scenes'])

In [None]:
# Full record of the first scene of the first episode.
(friends_season1['episodes'][0]['scenes'][0])

In [None]:
# Keys of a scene record.
(friends_season1['episodes'][0]['scenes'][0]).keys()

dict_keys(['scene_id', 'utterances', 'plots', 'rc_entities', 'span_qa'])

In [None]:
# First utterance record of the first scene; shows the fields of an utterance.
(friends_season1['episodes'][0]['scenes'][0]['utterances'][0])

{'utterance_id': 's01_e01_c01_u001',
 'speakers': ['Monica Geller'],
 'transcript': "There's nothing to tell! He's just some guy I work with!",
 'tokens': [['There', "'s", 'nothing', 'to', 'tell', '!'],
  ['He', "'s", 'just', 'some', 'guy', 'I', 'work', 'with', '!']],
 'transcript_with_note': None,
 'tokens_with_note': None,
 'character_entities': [[],
  [[0, 1, 'Paul the Wine Guy'],
   [4, 5, 'Paul the Wine Guy'],
   [5, 6, 'Monica Geller']]]}

In [None]:
# Reduce the per-season data to
#   total_transcript[season][episode][scene] -> list of {"speaker", "transcript"}
# with episodes and scenes numbered from 1 in the order they appear.
total_transcript = dict()
for season_num, season in total_friends_data.items():
  season_out = total_transcript[season_num] = dict()

  for episode_key, episode in enumerate(season['episodes'], start=1):
    episode_out = season_out[episode_key] = dict()

    for scene_key, scene in enumerate(episode['scenes'], start=1):
      # "speaker" holds the utterance's 'speakers' value as-is (a list).
      episode_out[scene_key] = [
          {
              "speaker": utterance['speakers'],
              "transcript": utterance['transcript']
          }
          for utterance in scene['utterances']
      ]

In [None]:
# Display the whole nested transcript dictionary.
# NOTE(review): this renders every utterance of every season; consider
# inspecting a single scene instead to keep the output readable.
total_transcript

In [None]:
# Utterances of season 1, episode 1, scene 1.
total_transcript[1][1][1]

[{'speaker': ['Monica Geller'],
  'transcript': "There's nothing to tell! He's just some guy I work with!"},
 {'speaker': ['Joey Tribbiani'],
  'transcript': "C'mon, you're going out with the guy! There's gotta be something wrong with him!"},
 {'speaker': ['Chandler Bing'],
  'transcript': 'All right Joey, be nice. So does he have a hump? A hump and a hairpiece?'},
 {'speaker': ['Phoebe Buffay'], 'transcript': 'Wait, does he eat chalk?'},
 {'speaker': [], 'transcript': ''},
 {'speaker': ['Phoebe Buffay'],
  'transcript': "Just, 'cause, I don't want her to go through what I went through with Carl- oh!"},
 {'speaker': ['Monica Geller'],
  'transcript': "Okay, everybody relax. This is not even a date. It's just two people going out to dinner and- not having sex."},
 {'speaker': ['Chandler Bing'], 'transcript': 'Sounds like a date to me.'},
 {'speaker': [], 'transcript': ''},
 {'speaker': ['Chandler Bing'],
  'transcript': "Alright, so I'm back in high school, I'm standing in the middle of