In [1]:
import numpy as np
import pickle
from stanza.server import CoreNLPClient
from empath import Empath
import requests
import json
from nltk.stem import WordNetLemmatizer
import nltk
from nltk.corpus import wordnet
lexicon = Empath()
lemmatizer = WordNetLemmatizer()

In [2]:
def create_category(self,name,seeds,model="fiction",size=100,write=True):
#   This function creates new lemmatized lexical categories
    resp = requests.post(self.backend_url + "/create_category", json={"terms":seeds,"size":size,"model":model})
    results = json.loads(resp.text)
    lemma_words = list()
    lemmatizer = WordNetLemmatizer() 
#     Lemmatize the words in response
    for word in results:
        lemma_words.append(lemmatizer.lemmatize(word))
    self.cats[name] = list(set(lemma_words))
#     Permanently store these categories
    if write:
        with open(self.base_dir+"/data/user/"+name+".empath","w") as f:
            f.write("\t".join([name]+results))

Empath.create_lemma_category = create_category

In [3]:
def create_lexicons(rb,lv,fp,ct):
#     This function creates the specified lexical categories with the specified dictionary sizes
    lexicon.create_lemma_category("religious_buildings", ["church","mosque", "temple"], model="fiction", size = rb)
    lexicon.create_lemma_category("loc_verbs", ["arrive", "visit", "travel", "return"], model = "fiction", size= lv)
    lexicon.create_lemma_category("fictional_places", ["place","buildings"], model ="fiction", size =fp)
    lexicon.create_lemma_category("custom_times", ["once_upon_a_time", "next_day","that_evening"], size = ct)

In [4]:
create_lexicons(30,14,300,300)

In [5]:
def nltk_tag_to_wordnet_tag(nltk_tag):
#     This converts nltk tag to wordent tag
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:          
        return None

In [6]:
def open_and_annotate(story):
#     Opens the story file from the specified path
    file = open('./Panchatantra/'+story+'.txt', errors='ignore')
    text = file.read()
    file.close()
#     CoreNLP Client performs processing on the input text and annotates it
    with CoreNLPClient(annotators = ['tokenize','ssplit'],
        memory='5G', be_quiet=True, outputFormat = 'json', max_char_length=500000, timeout=36000000) as client:
        ann = client.annotate(text)
#     Opens the manually annotated sentence to character map files
    file = open("./Panchatantra/"+story+'_sc.gpickle', 'rb')
    sen_char = pickle.load(file)
    file.close()
    return text, ann, sen_char

In [7]:
def lemmatize_sentence(sentence):
    #tokenize the sentence and find the POS tag for each token
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))  
    #tuple of (token, wordnet_tag)
    wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_wordnet_tag(x[1])), nltk_tagged)
    lemmatized_sentence = []
    for word, tag in wordnet_tagged:
        if tag is None:
            #if there is no available tag, append the token as is
            lemmatized_sentence.append(word)
        else:        
            #else use the tag to lemmatize the token
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
    return " ".join(lemmatized_sentence)

In [8]:
def events_by_location_and_time(text,ann):
    """
    Non-hierarchy model
    """
    #This function finds sum of dictionary returned by lexicon.analyze i.e., it finds the presence of location_time words.
    def sum_of_locs_times_dict(dictionary):
        sum_ = 0
        for key in dictionary.keys():
            sum_ = sum_ + dictionary[key]
        return sum_
    
    
    lexicon = Empath()   #Part of code used to bring Empath in
    location_time = None    #The variable place will hold latest location_time word.
    location_time_words = set()
    location_time_by_sentence = dict() #Dictionary to store the location_time-time words in a given sentence
    loc_num = 0 # Will be used to put location_time words as numbers in the location_time_to_number dict
    total_sentences = 0
    
    #Take each sentence of the story one by one (ann.sentence returns individual sentences of the story as objects)
    for i, sentence in enumerate(ann.sentence):
        # Remove comma and fullstop beacuse lexicon.analyze cannot identify words if they are followd by a fullstop or comma.
        # text[characterOffsetBegin:characterOffserEnd] is the actual sentence (as a string) of the sentence object returned
        sentence_for_empath = text[sentence.characterOffsetBegin:sentence.characterOffsetEnd].replace(", "," ").replace(".","").replace("-"," ").replace("?","").replace("!","").replace(":"," ")
        #Lemmatize the words you encounter for better identification when being analysed by lexicon.analyze
        #May be commented out because lexicon.create_category does not give good words when singular words are used
        sentence_for_empath = lemmatize_sentence(sentence_for_empath) # Sentences are all lemmatized now
        # Analyze the things
        lexicon_location_times_dict = lexicon.analyze(sentence_for_empath,
                                                categories=["religious_buildings", "loc_verbs", "fictional_places", "custom_times"])
        location_time_by_sentence[i] = list()
        s = sum_of_locs_times_dict(lexicon_location_times_dict)
        if s>0:
            words = sentence_for_empath.split(" ")
            # Find if place is same as previous
            for word in words:
                # If the word is a location_time word
                if sum_of_locs_times_dict(lexicon.analyze(word,
                                                   categories=["religious_buildings", "loc_verbs", "fictional_places", "custom_times"]))>0:
                    location_time = word
                    if i==0:
                        location_time_words.add(location_time) #stores which words have occured previously
                    else:
#                         check if such a word has occured way too much previously and now refers to a new location or time
                        if (location_time not in location_time_by_sentence[i-1] and location_time in location_time_words):
                            location_time+="1" #slightly modify the word
                            location_time_words.add(location_time) #stores which words have occured previously
                    location_time_by_sentence[i].append(location_time)
        else: 
            if location_time is None:
                location_time_by_sentence[i] = ["UNKNOWN"] #Unknown is the default value
            else:
                location_time_by_sentence[i]=location_time_by_sentence[i-1]
        total_sentences = i
    return location_time_by_sentence, total_sentences

In [9]:
def distance(cluster1, cluster2, sen_char, loc_time):
    char_set1 = set()
    loc_time_set1 = set()
#     Find the first cluster's characters, location and time words and store them as sets
    for i in cluster1:
        if i in sen_char:
            char_set1 = char_set1.union(set(sen_char[i]))
        if i in loc_time:
            loc_time_set1 = loc_time_set1.union(set(loc_time[i]))
    char_set2 = set()
    loc_time_set2 = set()
#     Find the second cluster's characters, location and time words and store them as sets
    for i in cluster2:
        if i in sen_char:
            char_set2 = char_set2.union(set(sen_char[i])) 
        if i in loc_time:
            loc_time_set2 = loc_time_set2.union(set(loc_time[i]))
#     Find the distance between character sets
    char_dist = (len(char_set1.union(char_set2))-len(char_set1.intersection(char_set2)))/(len(char_set1.intersection(char_set2))+1)
#     Find the length of both the clusters combined. It acts as a penalty to not enlarge large clusters
    event_dist = (len(cluster1 + cluster2))
#     Find the distance between location and time sets
    loc_dist = (len(loc_time_set1.union(loc_time_set2))-len(loc_time_set1.intersection(loc_time_set2)))/(len(loc_time_set1.intersection(loc_time_set2))+1)
#     Calculate the overall distance
    dist = (1 + char_dist)*(1 + loc_dist)*(event_dist) 
    return dist

In [10]:
def HAC(text, annotations, sen_char_map, no_split_points):
#     initiate clusters
    location_by_sentence, sentences = events_by_location_and_time(text, annotations)
    clusters = [[i] for i in range(sentences) if i not in no_split_points]
    iterations = [clusters.copy()]
    event_end_points = []
    while len(clusters)>1:
#         calculate the distance between clusters
        dist = np.array([distance(clusters[i], clusters[i+1], sen_char_map, location_by_sentence) for i in range(len(clusters)-1)])
#         find the clusters with minimum distance
        sort = np.argsort(dist) 
#         merge two clusters with minimum distance below a certain threshold
        if dist[sort[0]]<=10:
            index = sort[0]
            clusters[index].extend(clusters[index+1])
            clusters.remove(clusters[index+1])
        else:
            event_end_points = [max(0, cluster[0]-1) for cluster in clusters]
            break
#         variable to form the dendogram
        iterations.append(clusters.copy())
    return event_end_points

In [11]:
# Opens the file containing the story names
story_names = []
file = open("./Panchatantra/Storynames.txt")
file_story_names = file.readlines()
for name in file_story_names:
    story_names.append(name.strip('\n'))
file.close()
# Total sentences present in the respective stories
total_sentences = [66,49,61,12,34,33,38,23,57,38,39]
print(story_names)

['the_story_of_the_merchant_son', 'the_thief_and_the_brahmins', 'the_monkey_and_the_crocodile', 'the_monkey_the_wedge', 'a_daring_plan', 'Buddha_remains_cool', 'moocha_raja', 'raman_horse_trainer', 'talkative_turtle', 'tenali_outwits_guards', 'tenali_the_detective']


In [12]:
scenes = []
# no splitting_points stores the sentences which have dialogues in them and therefore scene splitting cannot be performed in those sentences
no_splitting_points = dict()
for sn in range(len(story_names)):
    text , annotated_story, sen_char_map = open_and_annotate(story_names[sn])
    no_splitting_here = []
    flag=0  #stores the start and end of a dialogue
    for i, sentence in enumerate(annotated_story.sentence):
        sentence = text[sentence.characterOffsetBegin:sentence.characterOffsetEnd].strip("\n").replace(", "," ").replace(".","").replace("-"," ").replace("?","").replace("!","").replace(":"," ").replace("\n\n","").replace("\n \n", '')
        no_of_quotes = sentence.count('"') #the number of double quotes in the sentence
        if no_of_quotes%2 != 0:        
            flag = (flag + 1)%2
        if flag != 0 or (no_of_quotes!=0):
            no_splitting_here.append(i)
    print(story_names[sn])
#     append the scenes after Hierarchical Agglomerative Clustering
    scenes.append(HAC(text, annotated_story, sen_char_map, no_splitting_here))
    print("\n\n\n")


2021-04-25 22:25:48 INFO: Writing properties to tmp file: corenlp_server-5925894f34df47e5.props
2021-04-25 22:25:48 INFO: Starting server with command: java -Xmx5G -cp C:\Users\Sourav\stanza_corenlp\* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 36000000 -threads 5 -maxCharLength 500000 -quiet True -serverProperties corenlp_server-5925894f34df47e5.props -annotators tokenize,ssplit -preload -outputFormat serialized


the_story_of_the_merchant_son


2021-04-25 22:25:50 INFO: Writing properties to tmp file: corenlp_server-a1b91bcde00b4d5e.props
2021-04-25 22:25:50 INFO: Starting server with command: java -Xmx5G -cp C:\Users\Sourav\stanza_corenlp\* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 36000000 -threads 5 -maxCharLength 500000 -quiet True -serverProperties corenlp_server-a1b91bcde00b4d5e.props -annotators tokenize,ssplit -preload -outputFormat serialized








2021-04-25 22:25:51 INFO: Writing properties to tmp file: corenlp_server-8b1fe2b48b624a4a.props
2021-04-25 22:25:51 INFO: Starting server with command: java -Xmx5G -cp C:\Users\Sourav\stanza_corenlp\* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 36000000 -threads 5 -maxCharLength 500000 -quiet True -serverProperties corenlp_server-8b1fe2b48b624a4a.props -annotators tokenize,ssplit -preload -outputFormat serialized


the_thief_and_the_brahmins






2021-04-25 22:25:53 INFO: Writing properties to tmp file: corenlp_server-720fe11f9032443b.props
2021-04-25 22:25:53 INFO: Starting server with command: java -Xmx5G -cp C:\Users\Sourav\stanza_corenlp\* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 36000000 -threads 5 -maxCharLength 500000 -quiet True -serverProperties corenlp_server-720fe11f9032443b.props -annotators tokenize,ssplit -preload -outputFormat serialized


the_monkey_and_the_crocodile






2021-04-25 22:25:54 INFO: Writing properties to tmp file: corenlp_server-c7aaa448505f48be.props
2021-04-25 22:25:54 INFO: Starting server with command: java -Xmx5G -cp C:\Users\Sourav\stanza_corenlp\* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 36000000 -threads 5 -maxCharLength 500000 -quiet True -serverProperties corenlp_server-c7aaa448505f48be.props -annotators tokenize,ssplit -preload -outputFormat serialized


the_monkey_the_wedge






2021-04-25 22:25:55 INFO: Writing properties to tmp file: corenlp_server-261307cc6aa24fc7.props
2021-04-25 22:25:55 INFO: Starting server with command: java -Xmx5G -cp C:\Users\Sourav\stanza_corenlp\* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 36000000 -threads 5 -maxCharLength 500000 -quiet True -serverProperties corenlp_server-261307cc6aa24fc7.props -annotators tokenize,ssplit -preload -outputFormat serialized


a_daring_plan






2021-04-25 22:25:57 INFO: Writing properties to tmp file: corenlp_server-e99cdd2a26504f75.props
2021-04-25 22:25:57 INFO: Starting server with command: java -Xmx5G -cp C:\Users\Sourav\stanza_corenlp\* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 36000000 -threads 5 -maxCharLength 500000 -quiet True -serverProperties corenlp_server-e99cdd2a26504f75.props -annotators tokenize,ssplit -preload -outputFormat serialized


Buddha_remains_cool






2021-04-25 22:25:58 INFO: Writing properties to tmp file: corenlp_server-a26e82b48b3d4817.props
2021-04-25 22:25:58 INFO: Starting server with command: java -Xmx5G -cp C:\Users\Sourav\stanza_corenlp\* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 36000000 -threads 5 -maxCharLength 500000 -quiet True -serverProperties corenlp_server-a26e82b48b3d4817.props -annotators tokenize,ssplit -preload -outputFormat serialized


moocha_raja






2021-04-25 22:25:59 INFO: Writing properties to tmp file: corenlp_server-da594a9cf95e4f53.props
2021-04-25 22:25:59 INFO: Starting server with command: java -Xmx5G -cp C:\Users\Sourav\stanza_corenlp\* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 36000000 -threads 5 -maxCharLength 500000 -quiet True -serverProperties corenlp_server-da594a9cf95e4f53.props -annotators tokenize,ssplit -preload -outputFormat serialized


raman_horse_trainer






2021-04-25 22:26:01 INFO: Writing properties to tmp file: corenlp_server-1cec778b36794c78.props
2021-04-25 22:26:01 INFO: Starting server with command: java -Xmx5G -cp C:\Users\Sourav\stanza_corenlp\* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 36000000 -threads 5 -maxCharLength 500000 -quiet True -serverProperties corenlp_server-1cec778b36794c78.props -annotators tokenize,ssplit -preload -outputFormat serialized


talkative_turtle






2021-04-25 22:26:02 INFO: Writing properties to tmp file: corenlp_server-2b26028693ef485a.props
2021-04-25 22:26:02 INFO: Starting server with command: java -Xmx5G -cp C:\Users\Sourav\stanza_corenlp\* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 36000000 -threads 5 -maxCharLength 500000 -quiet True -serverProperties corenlp_server-2b26028693ef485a.props -annotators tokenize,ssplit -preload -outputFormat serialized


tenali_outwits_guards




tenali_the_detective






In [13]:
# Sorts the event end points in ascending order
scenes = [sorted(list(set(eve))) for eve in scenes]
for j, eve in enumerate(scenes):
    if 0 in eve:
        eve.remove(0)
    last_index = total_sentences[j]-1
    if last_index not in eve:
        eve.append(last_index)
for sn in range(len(story_names)):
    print(story_names[sn],"\t",scenes[sn],"\n")

the_story_of_the_merchant_son 	 [6, 7, 10, 19, 27, 33, 39, 44, 50, 51, 62, 64, 65] 

the_thief_and_the_brahmins 	 [1, 7, 12, 22, 28, 43, 48] 

the_monkey_and_the_crocodile 	 [9, 15, 25, 50, 60] 

the_monkey_the_wedge 	 [1, 3, 6, 11] 

a_daring_plan 	 [8, 11, 18, 23, 33] 

Buddha_remains_cool 	 [2, 4, 15, 25, 32] 

moocha_raja 	 [1, 2, 8, 16, 22, 28, 30, 37] 

raman_horse_trainer 	 [1, 3, 7, 10, 11, 16, 18, 22] 

talkative_turtle 	 [4, 6, 26, 28, 31, 32, 37, 46, 53, 56] 

tenali_outwits_guards 	 [5, 9, 21, 37] 

tenali_the_detective 	 [38] 



In [None]:
import metrics

In [None]:
# True scene end points based on manual annotation
true_scenes = [[7,9,17,26,31,38,41,45,49,64,65], [1,6,17,22,27,48], [2,9,11,13,24,33,48,58,60], [1,5,6,11], 
               [8,12,18,22,30,33], [2,6,15,28,32], [1,10,13,15,17,22,36,37], [3,5,10,18,22], [3,5,6,25,32,44,56], [1,3,7,12,35,37], 
              [34,38]]
scenes_generated = scenes
sm1 = 0
sm2 = 0
for i in range(len(true_scenes)):
    m1,m2 = metrics.IoU(true_scenes[i], scenes_generated[i], total_sentences[i])
    sm1+=m1
    sm2+=m2
# Average value of metrics
avg_Jaccard = sm1/11
avg_Penalty = sm2/11
avg_F1_score = 2/(1/avg_Jaccard+1/(1-avg_Penalty))
print(avg_Jaccard, avg_Penalty, avg_F1_score)