### Mystery Short Story Idea Generator
By Kenny Drewry & Piyush Makkapati

In [None]:
#import all necessary packages for text processing and random generation

import nltk
from nltk.tokenize import word_tokenize
from textblob import TextBlob
import pandas as pd
import re
import random
from collections import Counter
#download the NLTK data packages used by this notebook
#NOTE(review): presumably these back TextBlob's tokenizing and tagging in the bestseller cells - confirm which are required
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
#left as the cell's last expression, so its return value is what the cell displays
nltk.download('brown')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [None]:
#Build the data structures for title, first sentence, and last sentence from our data.

#read every line of the data file; the with-block closes the file on exit
with open("short stories.txt", "r") as fh:
    fh_raw = fh.readlines()

#lists to hold the titles and the first and last sentences from the different mystery stories
titles = []
first = []
last = []
#each story occupies 3 lines (title, first sentence, last sentence) followed by 1 blank separator line
#stopping at len - 2 keeps the final story even when the file does not end with a blank line,
#while still guaranteeing that lines i, i+1 and i+2 all exist
for i in range(0, len(fh_raw) - 2, 4):
  titles.append(fh_raw[i].replace("\n", ''))
  first.append(fh_raw[i+1].replace("\n", ''))
  last.append(fh_raw[i+2].replace("\n", ''))

#the data file has been processed, so each list above must now be cut up into words
#every line is split on whitespace and each word is lowercased; words from consecutive
#stories are kept in one flat list per category
titleWords = [word.lower() for title in titles for word in title.split()]
firstWords = [word.lower() for line in first for word in line.split()]
lastWords = [word.lower() for line in last for word in line.split()]

In [None]:
#maps every word to the list of words that directly follow it
def nextWord(arr):
  """Build a successor table for a sequence of words.

  arr -- ordered list of words
  Returns a dictionary of word -> list of the words found immediately after
  it in arr, one entry per occurrence (repeats are kept). The final element
  of arr has no successor, so it only becomes a key if it also occurs earlier.
  """
  followers = {}
  #pair each word with the one after it; the last word pairs with nothing
  for current, following in zip(arr, arr[1:]):
    followers.setdefault(current, []).append(following)
  return followers

#successor tables for the first sentences, the last sentences, and the titles,
#built (in that order) from the word lists created above
nextFirst, nextLast, nextTitle = (
  nextWord(tokens) for tokens in (firstWords, lastWords, titleWords)
)

In [None]:
#function to get a random next word from the dictionary of words to following words
def mystery_nextword(term, words):
  """Pick a random successor of term.

  term  -- the word whose successor is wanted
  words -- dictionary mapping a word to the list of words that may follow it
  Returns one element of words[term] chosen uniformly at random, or -1 when
  term is not a key or has no following words. Any other problem (for example
  words not being a dictionary) is raised instead of being hidden behind -1.
  """
  #.get() yields None for an unknown term, so a missing key and an empty list share one check
  curWords = words.get(term)
  if not curWords:
    return -1
  return random.choice(curWords)

In [None]:
#generation for titles
def titleGenerator(words):
  """Generate a random title as a list of words.

  words -- dictionary mapping a word to the list of words that may follow it
  Returns a list of between 3 and 8 words. The first word is "the" about 80%
  of the time (when "the" is a key of words); otherwise it is a key chosen
  uniformly at random. Each later word is a random successor of the previous
  one; when the previous word has no successors the chain restarts from a
  random key, so the loop always reaches the chosen length.
  Raises ValueError when words is empty.
  """
  #get all the words in the dictionary once, so a random one can be selected when needed
  keys = list(words.keys())
  if not keys:
    raise ValueError("cannot generate a title from an empty word dictionary")
  #randomly generate a title size
  size = random.randint(3, 8)
  #hold the new title in a list of strings
  title = []
  curWord = -1
  #generate more words for the title, only while the title is less than the randomly assigned length
  while len(title) < size:
    if not title:
      #about 80% of the time, the titles begin with the word "the", so the first word is weighted towards it
      #the weighting is decided separately from the fallback pick so every key can start a title
      if "the" in words and random.random() < 0.8:
        curWord = "the"
      else:
        curWord = random.choice(keys)
    else:
      #generate next word based on the curWord
      curWord = mystery_nextword(curWord, words)
      if curWord == -1:
        #dead end: no word follows the previous one, so start again from a random word
        curWord = random.choice(keys)
    #exactly one word is added per pass, which guarantees the loop terminates
    title.append(curWord)
  return title

#generation for (first and last) sentences
def sentenceGenerator(words):
  """Generate a random sentence as a list of words.

  words -- dictionary mapping a word to the list of words that may follow it
  Returns a list of between 2 and 15 words. The first word is a key chosen
  uniformly at random; each later word is a random successor of the previous
  one. When the previous word has no successors, a new random key is chosen
  to build from. The result never exceeds the randomly chosen length.
  Raises ValueError when words is empty.
  """
  #get all words in the dictionary into a list once, to select from randomly
  keys = list(words.keys())
  if not keys:
    raise ValueError("cannot generate a sentence from an empty word dictionary")
  #randomly generate a sentence size
  size = random.randint(2,15)
  #make list of strings to hold output
  sentence = []
  #-1 means there is no previous word to build from (start of sentence or dead end)
  curWord = -1
  #generate more words for the sentence, only while its length is less than the randomly assigned length
  while len(sentence) < size:
    if curWord != -1:
      #get new word based on the previous word
      curWord = mystery_nextword(curWord, words)
    if curWord == -1:
      #no previous word, or nothing follows it: generate a completely new word to build from
      curWord = random.choice(keys)
    #one word per pass, so the length check above is honoured exactly
    sentence.append(curWord)
  #return the list of words for the new sentence
  return sentence

In [None]:
#assemble one story idea (title, first sentence, last sentence) as formatted text
def generate(nextTitle, nextFirst, nextLast):
  """Produce one formatted story idea.

  nextTitle, nextFirst, nextLast -- successor dictionaries for titles, first
  sentences and last sentences respectively
  Returns a three-line string ("Title: ...", "First Sentence: ...",
  "Last Sentence: ...") ending in a newline. Every title word is capitalized;
  in the sentences only the opening word is. Each word is followed by a space.
  """
  #draw the three pieces in a fixed order: title, first sentence, last sentence
  titleSet = titleGenerator(nextTitle)
  firstSet = sentenceGenerator(nextFirst)
  lastSet = sentenceGenerator(nextLast)

  def as_sentence(tokens):
    #capitalize only the word in position 0; every word gets a trailing space
    return "".join(
      (tok.capitalize() if pos == 0 else tok) + " "
      for pos, tok in enumerate(tokens)
    )

  pieces = [
    'Title: ',
    "".join(tok.capitalize() + " " for tok in titleSet),
    '\nFirst Sentence: ',
    as_sentence(firstSet),
    '\nLast Sentence: ',
    as_sentence(lastSet),
    #closing newline so the idea ends its own line in the file
    "\n",
  ]
  return "".join(pieces)

#generate a large amount of titles and sentences to place in output file
#each new idea is 3 lines (title, first, last) with a blank line following
#the file is opened once for the whole batch; append mode keeps ideas written by earlier runs
with open("output.txt", "a") as output:
  for _ in range(50):
    output.write(generate(nextTitle, nextFirst, nextLast))
    output.write("\n")

The following cells compute summary statistics for each bestseller text: the character count, the number of adjectives, and the five most common words.

In [None]:
#text statistics for "Abandoned in death": character count, adjective count, five most common tokens
with open('/content/drive/MyDrive/491 Final Project/Abandoned in death/Abandoned in death.txt') as b1:
    #read the whole text up front so the file is only held open for the read itself
    blob = TextBlob(b1.read())
#the with-block has already closed the file, so no explicit close() is needed

print("Character count: " + str(len(blob))) # grab the character count of the entire txt file
#count the tokens tagged exactly 'JJ'; other adjective tags such as 'JJR'/'JJS' are not included
adj = sum(1 for (word, tag) in blob.tags if tag == 'JJ')
print("Number of Adjectives: " + str(adj))
#five most frequent whitespace-separated tokens (case and punctuation are left as-is)
temp1 = blob.split()
temp = Counter(temp1).most_common(5)
print(temp)

FileNotFoundError: ignored

In [None]:
# 
#text statistics for "Death on the Nile": character count, adjective count, five most common tokens
with open('/content/drive/MyDrive/491 Final Project/Death on the Nile/(Hercule Poirot 15) Agatha Christie - Death on the Nile (2001).txt') as b2:
    #read the whole text up front so the file is only held open for the read itself
    blob = TextBlob(b2.read())
#the with-block has already closed the file, so no explicit close() is needed

print("Character count: " + str(len(blob))) # grab the character count of the entire txt file
#count the tokens tagged exactly 'JJ'; other adjective tags such as 'JJR'/'JJS' are not included
adj = sum(1 for (word, tag) in blob.tags if tag == 'JJ')
print("Number of Adjectives: " + str(adj))
#five most frequent whitespace-separated tokens (case and punctuation are left as-is)
temp1 = blob.split()
temp = Counter(temp1).most_common(5)
print(temp)


Character count: 170399
Number of Adjectives: 2117
[('the', 985), ('to', 626), ('a', 625), ('of', 556), ('and', 513)]


In [None]:
#text statistics for "The Camel Club": character count, adjective count, five most common tokens
with open('/content/drive/MyDrive/491 Final Project/The Camel Club/David Baldacci - The Camel Club-Warner Vision Books (2006) (1).txt') as b3:
    #read the whole text up front so the file is only held open for the read itself
    blob = TextBlob(b3.read())
#the with-block has already closed the file, so no explicit close() is needed

print("Character count: " + str(len(blob))) # grab the character count of the entire txt file
#count the tokens tagged exactly 'JJ'; other adjective tags such as 'JJR'/'JJS' are not included
adj = sum(1 for (word, tag) in blob.tags if tag == 'JJ')
print("Number of Adjectives: " + str(adj))
#five most frequent whitespace-separated tokens (case and punctuation are left as-is)
temp1 = blob.split()
temp = Counter(temp1).most_common(5)
print(temp)

Character count: 151376
Number of Adjectives: 2157
[('the', 1446), ('and', 664), ('a', 639), ('to', 589), ('of', 554)]


In [None]:
#text statistics for "The Maid": character count, adjective count, five most common tokens
with open('/content/drive/MyDrive/491 Final Project/The Maid/Nita Prose - The Maid full book (2021) (1).txt') as b4:
    #read the whole text up front so the file is only held open for the read itself
    blob = TextBlob(b4.read())
#the with-block has already closed the file, so no explicit close() is needed

print("Character count: " + str(len(blob))) # grab the character count of the entire txt file
#count the tokens tagged exactly 'JJ'; other adjective tags such as 'JJR'/'JJS' are not included
adj = sum(1 for (word, tag) in blob.tags if tag == 'JJ')
print("Number of Adjectives: " + str(adj))
#five most frequent whitespace-separated tokens (case and punctuation are left as-is)
temp1 = blob.split()
temp = Counter(temp1).most_common(5)
print(temp)

Character count: 134969
Number of Adjectives: 2046
[('the', 1081), ('I', 992), ('to', 641), ('a', 536), ('and', 527)]


In [None]:
#text statistics for "The Thursday Murder Club": character count, adjective count, five most common tokens
with open('/content/drive/MyDrive/491 Final Project/The Thursday Murder club/(1) Richard Osman - Thursday Murder Club (2020) (1).txt') as b5:
    #read the whole text up front so the file is only held open for the read itself
    blob = TextBlob(b5.read())
#the with-block has already closed the file, so no explicit close() is needed

print("Character count: " + str(len(blob))) # grab the character count of the entire txt file
#count the tokens tagged exactly 'JJ'; other adjective tags such as 'JJR'/'JJS' are not included
adj = sum(1 for (word, tag) in blob.tags if tag == 'JJ')
print("Number of Adjectives: " + str(adj))
#five most frequent whitespace-separated tokens (case and punctuation are left as-is)
temp1 = blob.split()
temp = Counter(temp1).most_common(5)
print(temp)

Character count: 122084
Number of Adjectives: 1575
[('the', 912), ('to', 593), ('a', 582), ('and', 526), ('of', 354)]
