### Loading Libraries

In [54]:
import os
import json

In [90]:
import random
import string

In [4]:
import pandas as pd

In [5]:
import requests

In [6]:
from bs4 import BeautifulSoup as bs

In [228]:
from urllib.parse import quote

In [275]:
from nltk.tag import StanfordPOSTagger, StanfordNERTagger

In [262]:
from nltk import word_tokenize

### Stanford POS tagger

In [265]:
## Setting the path to java.exe for nltk
java_path = "F:/java/java/bin/java.exe"
    
os.environ['JAVAHOME'] = java_path

In [286]:
## creating a StanfordPOStagger object
pos_base_path = "E:\Stanford Taggers\stanford-postagger-full-2015-04-20\stanford-postagger-full-2015-04-20"

st_pos = StanfordPOSTagger(model_filename = pos_base_path+"\models\english-bidirectional-distsim.tagger", path_to_jar = pos_base_path+"\stanford-postagger.jar")

In [282]:
## creating a StanfordPOStagger object
ner_base_path = "E:\Stanford Taggers\stanford-ner-4.2.0\stanford-ner-2020-11-17"
    
st_ner = StanfordNERTagger(model_filename = ner_base_path+"\classifiers\english.all.3class.distsim.crf.ser.gz", path_to_jar = ner_base_path+"\stanford-ner-4.2.0.jar")

In [287]:
# st_ner.tag(word_tokenize("Hyderabad is my favorate place"))

In [289]:
# st_pos.tag(word_tokenize("Hyderabad is my favorate place"))

## Functions

In [10]:
# function to return a random plot form data frame with Title and Genre
def sel_plot(df):
    # choosing a random value 
    num = random.choice([100, 150, 200, 250])
    val = random.randrange(1,num)

    print(f"Selected value : {val}")
    
    ## re-framing the data
    plots = {}
    for index,row in df.iterrows():
        plots[row["Title"]] = {"Genre":row["Genre"], "Plot": row["Plot"]}
        if index == val: break
            
    # choosing a random title
    titles = list(plots.keys())
    sel_movie = random.choice(titles)
    
    return plots[sel_movie]

In [375]:
## function to tokenize the plot into Parts of Speech
# func to get abb of pos
def get_abb(tag_obj):
    pos_dct = {}
    ner_dct = {}

    if type(tag_obj) == str: 
        pos_tags = st_pos.tag(tag_obj.split())
        ner_tags = st_ner.tag(tag_obj.split())
    elif type(tag_obj) == list: 
        pos_tags = st_pos.tag(tag_obj)
        ner_tags = st_ner.tag(tag_obj)
        
    for word,tag in pos_tags:
        try:
            pos_dct[word] = tag, abb[tag] 
        except:
            pos_dct[word] = tag 
    for word,tag in ner_tags:
        ner_dct[word] = tag

        
    return {"Parts of Speech": pos_dct, "Named Entities": ner_dct}

# func to get tokens
def tokenizer(plot_obj):
    plot = plot_obj["Plot"]
    get_dct = get_abb(plot)
      
    return plot_obj["Genre"],get_dct

In [377]:
# tokenizer(sel_plot(req_data))

### Fetching Data

In [193]:
## Fetching the movie data
movies_data = pd.read_csv("wiki_movie_plots_deduped.csv", encoding="utf-8")

req_data = movies_data[["Title", "Genre", "Plot"]]
req_data = req_data[req_data["Genre"] != "unknown"]

In [233]:
## Cleaning the data
# cleaning the plot encoding
def clean_str(st):
    for ch in st :
        if ch in '!#$%&\()*+-/:;<=>@[\\]^_`{|}~123456789':
            st = st.replace(ch,"")
    st = st.encode("ascii", "ignore").decode("utf-8")
    return st

req_data["Plot"] = req_data["Plot"].apply(lambda x:clean_str(x))

In [234]:
#saving the required data
req_data.to_csv("data_subset.csv")

### POS tags Abrrevations

In [14]:
url = "https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html"
res = requests.get(url)
soup = bs(res.text)

In [15]:
table = soup.select("table tr")

abb = {}
for row in table[1:]:
    cols = row.find_all("td")
    col_lst = [col.get_text() for col in cols][1:]
    abb[col_lst[0].strip()] = col_lst[1].strip()

## nltk Stopwords

#### creating a word list of stopwords

In [16]:
from nltk.corpus import stopwords

In [113]:
eng_wrds = stopwords.words("english")

In [114]:
eng_abb = get_abb(eng_wrds[:150])

In [115]:
eng_dct = {}

for key,value in eng_abb.items():
    if value[1] not in eng_dct:
        eng_dct[value[1]] = list()
    eng_dct[value[1]].append(key)      

In [468]:
# eng_dct

## StoryGenerator

In [397]:
res = pos_tokenizer(sel_plot(req_data))

Selected value : 65


In [442]:
genre = res[0]
pos_res = res[1]["Parts of Speech"]
ner_res = res[1]["Named Entities"]
words = list(pos_res.keys())

In [480]:
num = random.randrange(5, len(words)//5)
pos_choosen = []
for i in range(num):
    pos_choosen.append(random.choice(words))   
    
ner_choosen = [wrd for wrd,ner in list(ner_res.items()) if ner != "O"]

In [507]:
pos_choosen

['husband.', 'gets', 'rivals,', 'as', 'ball', 'ensues.', 'and']

In [508]:
## Asking user for some words based on their
new_words = []
occurred = [] 
for wrd,tup in pos_res.items():    
    if tup[1] not in occurred :
        if (wrd not in ner_choosen) & (wrd in pos_choosen):  
            occurred.append(tup[1])
            print (f"More on {tup[1]} at: https://www.google.com/search?q={quote(tup[1])}")
            new_word = input(f"Enter a {tup[1]}:")    
            if len(new_word) < 1: new_words.append(wrd)
            new_words.append(new_word)
            print("")
            
        elif wrd in ner_choosen:
            print (f"More on {ner_res[wrd]} at: https://www.google.com/search?q={quote(ner_res[wrd])}")
            new_word = input(f"Enter a {ner_res[wrd]}:")    
            if len(new_word) < 1: new_words.append(wrd)
            new_words.append(new_word)
            print("")
    elif tup[1] in occurred:        
        try:
            new_word = random.choice(eng_dct[tup[1]])
            new_words.append(new_word)
        except:
            new_words.append(wrd) 
    else:
        new_words.append(wrd)

More on LOCATION at: https://www.google.com/search?q=LOCATION
Enter a LOCATION:

More on LOCATION at: https://www.google.com/search?q=LOCATION
Enter a LOCATION:

More on Preposition or subordinating conjunction at: https://www.google.com/search?q=Preposition%20or%20subordinating%20conjunction
Enter a Preposition or subordinating conjunction:

More on Coordinating conjunction at: https://www.google.com/search?q=Coordinating%20conjunction
Enter a Coordinating conjunction:

More on Noun, singular or mass at: https://www.google.com/search?q=Noun%2C%20singular%20or%20mass
Enter a Noun, singular or mass:

More on Verb, 3rd person singular present at: https://www.google.com/search?q=Verb%2C%203rd%20person%20singular%20present
Enter a Verb, 3rd person singular present:



In [509]:
" ".join(new_words)

"New  York  as  and  until above ball  don't s t or s o s don't don't gets  it's with t o is don't is o"