# Libraries

In [1]:
import re
import string
import urllib
import urllib.request

import nltk
import pandas as pd
import spacy
from bs4 import BeautifulSoup as bs
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import WordNetLemmatizer, wordnet
from nltk.tokenize import word_tokenize

# Web Scraping

In [2]:
# Download the raw script once. The context manager closes the HTTP
# connection as soon as the body has been read, and the timeout keeps the
# cell from hanging indefinitely if the host does not answer.
# `webpage` holds the response body as bytes, which BeautifulSoup accepts
# as markup just like a file-like response object.
with urllib.request.urlopen(
    "http://www.scifiscripts.com/cartoon/beautyan.txt", timeout=30
) as response:
    webpage = response.read()

In [3]:
# Parse the downloaded page with Python's built-in HTML parser backend.
soup = bs(webpage, features="html.parser")

In [4]:
# Keep only the text content of the parsed page; this string is the raw
# transcript that the cleaning step below consumes.
plot=soup.text

In [5]:
# Sanity check: show the first 100 characters of the scraped text.
plot[:100]

'\nBeauty and the Beast\nThe Complete Script\n\nCompiled by Ben Scripps <34rqnpq@cmuvm.csv.cmich.edu>\n\nNA'

# Data Cleaning

In [6]:
def nltk(text):
    """Turn raw text into a list of cleaned, lemmatised content words.

    NOTE(review): this function's name shadows the ``nltk`` package imported
    at the top of the notebook. Once this cell has run, ``nltk.<attr>`` no
    longer reaches the library. The name is kept because later cells call
    ``nltk(plot)``; consider renaming it together with its callers.

    Processing applied to every token produced by ``word_tokenize``:

    1. rewrite the contraction fragment ``n't`` as ``not``;
    2. keep ASCII letters only (digits, punctuation and symbols are dropped);
    3. lowercase;
    4. lemmatise with WordNet (default part of speech);
    5. discard results of two characters or fewer and English stop words.

    Parameters
    ----------
    text : str
        Raw text to process.

    Returns
    -------
    list of str
        Cleaned tokens in their original order (duplicates preserved).
    """
    # Build these once per call. The previous version created a new
    # lemmatiser and re-read the stop-word list for every single token.
    lemmatizer = WordNetLemmatizer()
    stop_words = frozenset(stopwords.words("english"))

    cleaned = []
    for raw_token in word_tokenize(text):
        word = re.sub("n't", "not", raw_token)
        word = "".join(re.findall("[a-zA-Z]+", word)).lower()
        lemma = lemmatizer.lemmatize(word)
        # After the letters-only regex a token is either empty or alphabetic,
        # so a separate punctuation filter is redundant: empty strings are
        # removed by the length check below.
        if len(lemma) > 2 and lemma not in stop_words:
            cleaned.append(lemma)
    return cleaned

In [7]:
# Run the cleaning pipeline defined above over the whole transcript.
processed_tokens = nltk(text=plot)

In [8]:
# The 15 most frequent cleaned tokens in the transcript, as (token, count)
# pairs sorted from most to least common.
FreqDist(processed_tokens).most_common(15)

[('belle', 332),
 ('beast', 240),
 ('gaston', 173),
 ('lumiere', 133),
 ('cogsworth', 126),
 ('maurice', 121),
 ('look', 78),
 ('lefou', 72),
 ('potts', 67),
 ('see', 63),
 ('back', 62),
 ('chip', 59),
 ('one', 58),
 ('door', 55),
 ('come', 52)]

In [9]:
import gensim

In [10]:
# Load spaCy's small English pipeline. The "en_core_web_sm" model package
# must already be installed in the environment for this call to succeed.
nlp=spacy.load("en_core_web_sm")

In [11]:
# Tokenization: re-join the cleaned tokens into one space-separated string
# and run it through the spaCy pipeline to obtain a Doc object.
doc=nlp(" ".join(processed_tokens))

In [12]:
# Preview the first 50 spaCy tokens of the processed transcript.
doc[:50]

beauty beast complete script compiled ben scripps rqnpq cmuvmcsvcmichedu narrator upon time faraway land young prince lived shining castle although everything heart desired prince spoiled selfish unkind one winter night old beggar woman came castle offered single rose return shelter bitter cold repulsed haggard appearance prince sneered gift turned old

# Topics Extraction

In [13]:
# Map every distinct cleaned token to an integer id. The whole transcript is
# supplied as a single document.
dic = gensim.corpora.Dictionary(documents=[processed_tokens])

In [14]:
# Bag-of-words corpus: one document (the full transcript) expressed as a list
# of (token_id, count) pairs.
corpus = [dic.doc2bow(processed_tokens)]

In [15]:
# Fit a 10-topic LDA model on the bag-of-words corpus.
# random_state pins the stochastic initialisation so the topics shown in the
# next cell are the same on every Restart & Run All.
# NOTE(review): the corpus contains a single document, and the topics printed
# below are dominated by the same character names. Consider splitting the
# script into scenes or lines so that LDA has several documents to contrast.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    num_topics=10,
    id2word=dic,
    random_state=42,
)

In [16]:
# Display the fitted topics as (topic_id, weighted-term string) pairs.
lda_model.show_topics()

[(0,
  '0.029*"belle" + 0.027*"beast" + 0.019*"gaston" + 0.015*"lumiere" + 0.014*"maurice" + 0.010*"cogsworth" + 0.008*"lefou" + 0.008*"potts" + 0.007*"see" + 0.007*"look"'),
 (1,
  '0.023*"belle" + 0.018*"beast" + 0.010*"lumiere" + 0.010*"gaston" + 0.010*"maurice" + 0.009*"cogsworth" + 0.006*"see" + 0.006*"look" + 0.006*"back" + 0.006*"lefou"'),
 (2,
  '0.035*"belle" + 0.026*"beast" + 0.019*"gaston" + 0.018*"lumiere" + 0.013*"cogsworth" + 0.012*"maurice" + 0.009*"lefou" + 0.008*"back" + 0.008*"chip" + 0.007*"one"'),
 (3,
  '0.030*"belle" + 0.015*"beast" + 0.014*"maurice" + 0.012*"gaston" + 0.011*"cogsworth" + 0.011*"lumiere" + 0.009*"look" + 0.008*"lefou" + 0.007*"chip" + 0.006*"potts"'),
 (4,
  '0.037*"belle" + 0.032*"beast" + 0.019*"gaston" + 0.015*"cogsworth" + 0.012*"lumiere" + 0.010*"maurice" + 0.009*"look" + 0.008*"potts" + 0.007*"lefou" + 0.007*"chip"'),
 (5,
  '0.031*"belle" + 0.018*"beast" + 0.016*"gaston" + 0.013*"cogsworth" + 0.012*"maurice" + 0.012*"lumiere" + 0.007*"see" 

# Phrase Modeling

In [17]:
# Learn candidate bigrams from the transcript. min_count=1 and threshold=2
# are permissive settings, so many adjacent word pairs are eligible.
phrase_model = gensim.models.phrases.Phrases(
    sentences=[processed_tokens],
    min_count=1,
    threshold=2,
)

In [18]:
# First 20 entries of the phrase model's vocabulary (unigrams and the
# underscore-joined bigram candidates).
list(phrase_model.vocab)[:20]

['beauty',
 'beast',
 'beauty_beast',
 'complete',
 'beast_complete',
 'script',
 'complete_script',
 'compiled',
 'script_compiled',
 'ben',
 'compiled_ben',
 'scripps',
 'ben_scripps',
 'rqnpq',
 'scripps_rqnpq',
 'cmuvmcsvcmichedu',
 'rqnpq_cmuvmcsvcmichedu',
 'narrator',
 'cmuvmcsvcmichedu_narrator',
 'upon']

In [19]:
# Apply the trained phrase model to the transcript (wrapped in a list because
# the model is indexed here with a collection of token lists). Iterating over
# `a` yields the token list with detected bigrams joined by "_".
a=phrase_model[[processed_tokens]]

In [20]:
# First 50 tokens of the transcript after bigram merging.
list(a)[0][:50]

['beauty_beast',
 'complete',
 'script',
 'compiled',
 'ben',
 'scripps',
 'rqnpq',
 'cmuvmcsvcmichedu',
 'narrator',
 'upon',
 'time',
 'faraway',
 'land',
 'young_prince',
 'lived',
 'shining',
 'castle',
 'although',
 'everything',
 'heart',
 'desired',
 'prince',
 'spoiled',
 'selfish',
 'unkind',
 'one',
 'winter',
 'night',
 'old',
 'beggar',
 'woman',
 'came',
 'castle',
 'offered',
 'single',
 'rose',
 'return',
 'shelter',
 'bitter',
 'cold',
 'repulsed',
 'haggard',
 'appearance',
 'prince',
 'sneered',
 'gift',
 'turned',
 'old_woman',
 'away',
 'warned']

# Topics Exploration

In [21]:
import sklearn

In [22]:
from sklearn.feature_extraction.text import CountVectorizer

In [23]:
# Count vectorizer configuration:
#   lowercase=False    - tokens were already lowercased by the cleaning step
#   max_df=0.8         - ignore terms present in more than 80% of documents
#   ngram_range=(1,2)  - request both unigrams and bigrams
CV=CountVectorizer(lowercase=False,max_df=0.8,ngram_range=(1,2))

In [24]:
# Fit the vectorizer and build the sparse document-term matrix.
# `processed_tokens` is a flat list of single-word strings, so every token is
# treated as its own document: the matrix has one row per token (the shape
# cell below reports 7664 rows), and because each "document" holds one word
# the (1, 2) n-gram range cannot yield any bigrams.
# NOTE(review): confirm whether a per-line or per-scene document split was
# intended here instead of one document per token.
count_model=CV.fit_transform(processed_tokens)

In [25]:
# Compare the (rows, columns) shape of the dense matrix with the number of
# learned feature names; the column count should equal the vocabulary size.
count_model.toarray().shape,CV.get_feature_names_out().shape

((7664, 2084), (2084,))

In [26]:
# Mapping from each learned term to its column index in the matrix.
CV.vocabulary_

{'beauty': 144,
 'beast': 140,
 'complete': 372,
 'script': 1556,
 'compiled': 370,
 'ben': 165,
 'scripps': 1555,
 'rqnpq': 1517,
 'cmuvmcsvcmichedu': 353,
 'narrator': 1203,
 'upon': 1954,
 'time': 1874,
 'faraway': 647,
 'land': 1028,
 'young': 2081,
 'prince': 1394,
 'lived': 1076,
 'shining': 1603,
 'castle': 283,
 'although': 46,
 'everything': 609,
 'heart': 879,
 'desired': 471,
 'spoiled': 1735,
 'selfish': 1570,
 'unkind': 1943,
 'one': 1243,
 'winter': 2039,
 'night': 1218,
 'old': 1241,
 'beggar': 154,
 'woman': 2048,
 'came': 265,
 'offered': 1239,
 'single': 1643,
 'rose': 1512,
 'return': 1489,
 'shelter': 1598,
 'bitter': 182,
 'cold': 358,
 'repulsed': 1483,
 'haggard': 846,
 'appearance': 71,
 'sneered': 1682,
 'gift': 785,
 'turned': 1924,
 'away': 103,
 'warned': 1994,
 'deceived': 454,
 'found': 743,
 'within': 2045,
 'dismissed': 504,
 'ugliness': 1932,
 'melted': 1144,
 'reveal': 1490,
 'beautiful': 143,
 'enchantress': 580,
 'tried': 1910,
 'apologize': 69,
 'la

In [27]:
# First 50 feature names, in matrix column order.
CV.get_feature_names_out()[:50]

array(['aaarrrgghh', 'aah', 'aahchooo', 'aahh', 'aahs', 'ablaze', 'able',
       'absolutely', 'accomplished', 'accurate', 'across', 'act',
       'action', 'actually', 'adding', 'adjourn', 'adjourned', 'admired',
       'admonishing', 'admonishingly', 'advance', 'advancing',
       'adventure', 'affection', 'afraid', 'agree', 'agreement', 'ahead',
       'ahem', 'ahh', 'aim', 'air', 'ajar', 'alarming', 'alive', 'allo',
       'allow', 'allstsarry', 'almost', 'alone', 'along', 'alors',
       'already', 'alschip', 'also', 'alst', 'although', 'always',
       'amazed', 'amazement'], dtype=object)

In [28]:
# Document-term matrix as a labelled DataFrame: one column per learned term,
# one row per input document.
pd.DataFrame(
    data=count_model.toarray(),
    columns=CV.get_feature_names_out(),
)

Unnamed: 0,aaarrrgghh,aah,aahchooo,aahh,aahs,ablaze,able,absolutely,accomplished,accurate,...,yell,yelling,yep,yes,yesterday,yet,youll,young,zoom,zut
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7659,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7660,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7661,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7662,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Word2Vec Model
## Word Similarity

In [29]:
# Alias for the Word2Vec class itself (not an instance); it is instantiated
# in the next cell.
w2v_model=gensim.models.Word2Vec

In [30]:
# Initialize an untrained skip-gram (sg=1) Word2Vec model with 300-dimensional
# vectors and 10 training epochs. A fixed seed and a single worker thread are
# set so the similarity results further down can be regenerated; gensim's
# documentation notes that fully deterministic runs also need a fixed
# PYTHONHASHSEED.
W2V = w2v_model(vector_size=300, sg=1, epochs=10, seed=42, workers=1)

In [31]:
# Build the vocabulary from the transcript, passed as a single sentence
# containing all cleaned tokens.
W2V.build_vocab([processed_tokens])

In [32]:
# Train the model on the same single-sentence corpus, using the example count
# and epoch count already stored on the model.
# NOTE(review): `vec_word` captures the return value of train(), which
# presumably is training statistics rather than word vectors - confirm. The
# vectors themselves are read from `W2V.wv` in the cells below.
vec_word=W2V.train([processed_tokens],total_examples=W2V.corpus_count,epochs=W2V.epochs)

In [33]:
# Nearest neighbours of "gaston" in the learned embedding space.
# NOTE(review): every similarity in the output is above 0.99, which suggests
# the vectors are barely differentiated on this small corpus - interpret the
# ranking with care.
W2V.wv.most_similar("gaston")

[('lefou', 0.9980762600898743),
 ('crazy', 0.9943729639053345),
 ('man', 0.9943139553070068),
 ('like', 0.9942840933799744),
 ('crony', 0.9935235381126404),
 ('one', 0.9928992390632629),
 ('swing', 0.9924961924552917),
 ('guy', 0.9923769235610962),
 ('old', 0.9922946691513062),
 ('town', 0.9921086430549622)]

In [34]:
# Nearest neighbours of "beast" in the learned embedding space.
W2V.wv.most_similar("beast")

[('lair', 0.9981046319007874),
 ('object', 0.9978859424591064),
 ('int', 0.9978041648864746),
 ('master', 0.9977242946624756),
 ('must', 0.9976661205291748),
 ('light', 0.9974851608276367),
 ('look', 0.997470498085022),
 ('enters', 0.9974050521850586),
 ('still', 0.9973901510238647),
 ('stay', 0.9973811507225037)]

In [35]:
# Nearest neighbours of "belle" in the learned embedding space.
W2V.wv.most_similar("belle")

[('dark', 0.9986955523490906),
 ('away', 0.9986339211463928),
 ('better', 0.9985668659210205),
 ('see', 0.9985582232475281),
 ('ext', 0.9985227584838867),
 ('attack', 0.9985226988792419),
 ('balcony', 0.9985196590423584),
 ('lead', 0.9985153675079346),
 ('follow', 0.9985054135322571),
 ('light', 0.9985002279281616)]