This jupyter notebook contains the ability to select a book from the library and generate key characters and locations within the novel. Additionally, after selecting a chapter and filling in the narrator name The program will generate two types of questions from that chapter. It will additionally show the sentence the question so the user can under what the question is based off of and whether or not they want to use the question in their learning materials. 

In [40]:
#Import libraries 
import pandas as pd
import spacy
from spacy.matcher import PhraseMatcher
import collections as c

#PLease note that if you do not have spacy large download you will need to change to the small pipeline
nlp = spacy.load("en_core_web_lg")

all_stopwords = nlp.Defaults.stop_words

In [41]:
#Read in dataframe csv from folder
df = pd.read_csv('book_3.csv')

df.head()

Unnamed: 0,chapter_number,chapter_title,tokens,pos,lemma,tag,dep,sentence,title,ent_word,ent_label
0,Chapter 1,t[CE2]he Paper[KL3],best,ADJ,good,JJS,amod,"\n My best friend, Eddie Sullivan, had a ne...",,,
1,Chapter 1,t[CE2]he Paper[KL3],friend,NOUN,friend,NN,appos,"\n My best friend, Eddie Sullivan, had a ne...",,,
2,Chapter 1,t[CE2]he Paper[KL3],Eddie,PROPN,eddie,NNP,compound,"\n My best friend, Eddie Sullivan, had a ne...",,Eddie Sullivan,PERSON
3,Chapter 1,t[CE2]he Paper[KL3],Sullivan,PROPN,sullivan,NNP,appos,"\n My best friend, Eddie Sullivan, had a ne...",Mr. Sullivan,Eddie Sullivan,PERSON
4,Chapter 1,t[CE2]he Paper[KL3],newspaper,NOUN,newspaper,NN,nsubj,"\n My best friend, Eddie Sullivan, had a ne...",,,


# Character Detection

In [42]:
#Single out persons in dataframe
persons_df = df[df['ent_label'] == 'PERSON']
persons_count_df = persons_df.groupby(by = ['ent_word']).count().reset_index()
persons_count_df = persons_count_df.drop(['chapter_number','chapter_title','tokens','lemma','tag','dep','pos','title','sentence'], axis = 1)
persons_count_df.sort_values(by = 'ent_label', ascending = False).head()

Unnamed: 0,ent_word,ent_label
74,Mary Lou,532
62,Little Skinny,274
127,Tommy,262
18,Dad,205
79,McKenzie,193


In [43]:
 # convert list to string 
def listToString(s): 
    str1 = ""  
    for item in s: 
        str1 += item
        str1 += ' '
    return str1 

In [44]:
#This section of code identifies where some one is speaking and counts how many times they do that within the book. 
persons_sentence = persons_df['sentence'].unique().tolist()
persons_string = listToString(persons_sentence)

#creat a list of the characters with the word said before and after to use in the phrase search
persons_word = persons_df['ent_word'].unique().tolist()
terms = []
for persons in persons_word:
    terms.append("{} said".format(persons))
    terms.append("said {}".format(persons))
    
    
matcher = PhraseMatcher(nlp.vocab)

#pattern created based on term lists above
patterns = [nlp.make_doc(text) for text in terms]
matcher.add("TerminologyList", patterns)

doc = nlp(persons_string)

#use the matcher function to find where people speak within the text. 
matches = matcher(doc)
spoken_persons = []

#removes said from the phrase to get a list of names
for match_id, start, end in matches:
    span = str(doc[start:end])
    span2 = span.replace(' said', '')
    span3 = span2.replace('said ', '')
    spoken_persons.append(span3)

#Count the list of names that spoke
spoken_dict = c.Counter(spoken_persons)
spoken_df = pd.DataFrame.from_dict(spoken_dict, orient='index', columns = ['Count']).reset_index()
spoken_df.sort_values(by = 'Count', ascending = False).head(10)

Unnamed: 0,index,Count
1,Dad,40
9,Mary Lou,18
8,McKenzie,17
0,Eddie,16
2,Sam,15
15,Little Skinny,15
5,Sister Ann,11
3,Sullivan,8
11,Glazov,8
10,Mom,7


In [45]:
#Having a list of characters that spoke and mentioned allows to have two different thresholds. 
spoken_hurdle = spoken_df[spoken_df['Count'] > 3]
spoken_hurdle = spoken_hurdle['index'].tolist()
mention_hurdle = persons_count_df[persons_count_df['ent_label'] > 10]
mention_hurdle = mention_hurdle['ent_word'].tolist()

In [46]:
def check_person (person_list):
    book_characters = []
    for person in person_list:
        if person in spoken_hurdle:
            book_characters.append(person)
        elif person in mention_hurdle:
            book_characters.append(person)
            
    return book_characters

In [47]:
persons_list1 = persons_df['ent_word'].unique().tolist()

In [48]:
# Final Character list
characters = check_person (persons_list1)
characters

['Eddie',
 'Little Skinny',
 'McKenzie',
 'Luke',
 'Peter',
 'Dad',
 'Glazov',
 'Sam',
 'Sister Ann',
 'Tommy',
 'Sullivan',
 'Skinny',
 'Mary Lou',
 'Lizzie',
 'Pinky',
 'McCarthy',
 'Scully',
 'Karl Marx',
 'Mary Lou’s',
 'Susie',
 'Mom',
 'Stanton',
 'Ma',
 'Gary Cooper',
 'Lizzie Johnson',
 'Edward R. Murrow',
 'Sister',
 'Little Skinny’s']

## Locations

In [49]:
#Pulling Locations based on LOC and GPE using Spacy
oth_entity_df = df.loc[(df['ent_label'] == 'LOC' ) | (df['ent_label'] == 'GPE' )]
oth_entity_count_df = oth_entity_df.groupby(by = ['ent_word']).count().reset_index()
oth_entity_count_df = oth_entity_count_df.drop(['chapter_number','chapter_title','tokens','lemma','tag','dep','pos','title','sentence'], axis = 1)
oth_entity_count_df.sort_values(by = 'ent_label', ascending = False).head(10)

Unnamed: 0,ent_word,ent_label
3,Busia,16
34,Tivoli,15
33,St. Joe’s,14
5,Chicago,12
46,the Soviet Union,8
47,the United States,8
23,Pa.,7
48,the United States Air Force,4
7,Eastern Europe,4
39,Vienna,4


In [50]:
#create list of noun chunks where sentences have nouns within the dataframe
df_noun = df[df['pos'] == 'NOUN']

sentences = df_noun['sentence'].unique().tolist()

sentence_string = listToString(sentences)

doc = nlp(sentence_string )

noun_chunks = []
for chunk in doc.noun_chunks:
    noun_chunks.append(chunk.text)

In [51]:
#list of location nouns to look for within the noun chunks
place_nouns = ['house', 'office', 'church', 'castle', 'river', 
               'creek', 'beach', 'ocean','palace',
               'forest', 'woods','mountains', 'stream', 'fields',
               'store', 'school', 'hill', 'home', 'zoo', 'island']

In [52]:
#create list of locations where the place nouns are in the noun chunks
common_places = []
for chunk in noun_chunks:
    for place in place_nouns:
        if place in chunk:
            common_places.append(chunk)
#Count frequency of mentions
place_dict = c.Counter(common_places)
place_df = pd.DataFrame.from_dict(place_dict , orient='index', columns = ['Count']).reset_index()
place_df = place_df.rename(columns = {'index' : 'ent_word', 'Count' : 'ent_label'})

#Combined spacy identified locations and common place locations
combined_places = oth_entity_count_df.append(place_df)
#save the top 20 results to be used in questions
top_places = combined_places.sort_values(by = 'ent_label', ascending = False).head(20)

#save key places as a list to be used in question generation
key_places = top_places['ent_word'].unique().tolist()
top_places.head()

Unnamed: 0,ent_word,ent_label
2,school,32
3,Busia,16
34,Tivoli,15
33,St. Joe’s,14
5,Chicago,12


# Dates

In [53]:
#Lists to compare to when writing questions about Dates 
date_exclude = ['Weeks', 'day', 'Tomorrow', 'today','week', 'month', 'years','year']

week_names = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']

months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October',  'November', 'December']

cycles = ['daily', 'weekly', 'monthly', 'Annual', 'yearly', 'annually']

seasons = ['winter', 'summer','spring','fall', 'autumn']

time_phrase = ['end of the week', 'end of the month', 'end of the year','middle of the night','end of the day']

time_day = ['morning','noon','afternoon','midnight','dusk','evening']

holidays = ["New Year", "Christmas", "Halloween","4th of July","Fourth of July","Ramadan","Eid al-Fitr",
            "Easter", "Thanksgiving","Passover","Diwali","Lunar New Year","Okoberfest", "Dia de Muertos", "Hanukkah",
           'Bodhi Day','Festa Junina','Maslenitsa',"Tu B'shevat", "Qingming Festival",'Bastille Day',"St. Patrick's Day",
           "Valentine's Day","Kwanzaa","Carnaval","Mardi Gras","Boxing Day","Rosh Hashanah",'Yom Kippur','Sukkot',"Purim",
           'Labor Day','Memorial Day','Christmas Eve']

dates = week_names + months + cycles + seasons + time_phrase + time_day + holidays


# Questions

In [54]:
#select chapter number, make sure it is a string
c_num = '7'

#If there is narrator please denote the name here
narratorname = "Tommy"

if narratorname == '':
    narrator = False
else:
    narrator = True

In [55]:
character_pattern = '|'.join(characters)
places_pattern = '|'.join(key_places)
dates_pattern = '|'.join(dates)

#Creates boolean within the dataframe on whether a sentence contains a character, date, or location. 
df['Character'] = df.sentence.str.contains(character_pattern)
df['Location'] = df.sentence.str.contains(places_pattern)
df['Dates'] = df.sentence.str.contains(dates_pattern)

chapter_df = df[df['chapter_number'] ==  'Chapter '+ c_num]

#use booleans to filter dataframes to sentences that have the necessary criteria
who_questions = chapter_df[(chapter_df['Character'] == True) & (chapter_df['Location'] == True) & (chapter_df['Dates'] == True)]
where_questions = chapter_df[(chapter_df['Character'] == True) & (chapter_df['Location'] == True)]

### Question 1 Who went to location on date?

In [60]:
for sentence in who_questions['sentence'].unique().tolist():
    place = ''
    time = ''
    prep = ''
    for word in key_places:
        if word.lower() in sentence.lower():
            place = word
        else:
            pass
    for date in dates:
        if date.lower() in sentence.lower():
            time = date
            
            #adjusts for different prepositions for different dates
            if time in week_names or time in holidays:
                prep = 'on'
            elif time in time_day or time in seasons:
                prep = 'in the'
            elif time in time_phrase:
                prep = 'at the'
            elif time in months:
                prep = 'in'
            elif time in cycles:
                prep = ''
        else:
            pass
    if place != '' and date != '':
        
        
        print('Question')
        print('Who went to {} {} {}?'.format(place,prep, time))
        print()
        print('Basis of Question: '+ sentence)
        print()
        print()

Question
Who went to McKenzie’s store in the afternoon?

Basis of Question: “I stopped by Mr. McKenzie’s store this afternoon,” she said.




### Question 2 What did character do in location?

In [59]:
for sentence in where_questions['sentence'].unique().tolist():
    who1 = ''
    who2 = ''
    who3 = ''
    place = ''
    prep = ''
    verb = ''
    
    for word in key_places:
        if word.lower() in sentence.lower():
            place = word
        else:
            pass
     
    if narrator == True:
        if ' I ' in sentence:
            who1 = narratorname
        if ' me ' in sentence:
            who1 = narratorname
    

    #handles multiple characters within a sentence
    for who in characters:
        if who.lower() in sentence.lower():
            if who1 == '' :
                who1 = who
            elif who2 == '':
                who2 = who
            elif who3 == '':
                who3 = who
            else:
                pass
            
    #handles phrasing change for when someone is speaking
    if '“' in sentence:
        verb = 'talk about regarding'
    else:
        verb = 'do in'
    

    
    if place != '' and who1 != '':
        
        
        print('Question')
        
        if who2 != '':
            print('What did {} and {} {} {}?'.format(who1, who2,verb, place))
        elif who3 != '': 
            print('What did {}, {} ,and {} {} {}?'.format(who1, who2,who3,verb, place))
        else:
            print('What did {} {} {}?'.format(who1,verb,place))
        print()
        print('Basis of Question: '+ sentence)
        print()
        print()

Question
What did Tommy and Eddie do in school?

Basis of Question: 
    Eddie waited until we were halfway back to school before he elbowed me in the ribs.


Question
What did Tommy and Dad do in home?

Basis of Question: 
    Dad came home early, as Pinky and I were picking up the pieces of the broken vase.


Question
What did Mary Lou and Ma do in Busia?

Basis of Question: With Busia dying and the new baby and now Mary Lou .


Question
What did McKenzie do in McKenzie’s store?

Basis of Question: There was a hardware store, a drugstore, a bakery, Toon Funeral Home and Mr. McKenzie’s store.


Question
What did McKenzie talk about regarding McKenzie’s store?

Basis of Question: “I stopped by Mr. McKenzie’s store this afternoon,” she said.


Question
What did McKenzie do in home?

Basis of Question: Mr. McKenzie was quite sure it was one of the public high school boys who’d taken them, until his son came home from St. Joe’s.”


Question
What did Sam talk about regarding school?

Basis