In [None]:
# Data Analysis and Preprocessing
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# For File Navigation
import os

# Our NLP Library which includes Word2Vec algorithm
import gensim

# For tokenizing corpus into sentences
import nltk
from nltk.corpus import stopwords

# For tokenizing the sentences into words, lowercase them and remove punctuation marks
from gensim.utils import simple_preprocess
# For removing stopwords
from gensim.parsing.preprocessing import remove_stopwords

# Generate WordCloud
from wordcloud import WordCloud

# PCA
from sklearn.decomposition import PCA

# Advanced Visualization for word vectors
import plotly.express as px
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)

import string

In [None]:
# Load the movie dialogue table; the CSV is decoded as ISO-8859-1.
df = pd.read_csv('Harry_Potter_Movies/Dialogue.csv', encoding='ISO-8859-1')
# Bare last expression: renders the frame as the cell output.
df

In [None]:
def remove_punctuations(text):
    """Return `text` with every ASCII punctuation character replaced by a space.

    The characters replaced are exactly those in ``string.punctuation``.
    Each one becomes a single space, so the length of the text is unchanged
    and words separated only by punctuation stay separated.
    """
    # One translation pass instead of one str.replace per punctuation mark.
    to_space = str.maketrans({mark: " " for mark in string.punctuation})
    return text.translate(to_space)

# Lower-case every line of dialogue, blank out punctuation, then split into word tokens.
df["Tokens"] = df["Dialogue"].str.lower().apply(remove_punctuations).apply(nltk.word_tokenize)

# Build the stop-word lookup once, as a set. Calling stopwords.words('english')
# for every token rebuilds the word list each time and tests membership linearly.
english_stopwords = set(stopwords.words('english'))

dialoge = df["Tokens"].tolist()

# One filtered token list per line of dialogue. Lines are kept even when they end
# up empty, so the result stays aligned with the rows of df.
# Tokens are already lower-case at this point, so they are compared directly.
filtered_tokens = [
    [
        token
        for token in line
        if token not in english_stopwords and token not in string.punctuation
    ]
    for line in dialoge
]

df["Tokens"] = filtered_tokens
df

In [None]:
corpus = df['Dialogue'].tolist()

# Join the lines with a space. With an empty separator the last word of one
# line is glued to the first word of the next ("...Harry" + "I'm..." -> "HarryI'm"),
# which corrupts both the sentence splitting and the resulting tokens.
raw = ' '.join(corpus)
raw_sent = nltk.sent_tokenize(raw)

# For each sentence: drop stop words (remove_stopwords), then let
# simple_preprocess lower-case it, strip punctuation and split it into words.
# Sentences that end up with no tokens are discarded.
story = [
    words
    for words in (simple_preprocess(remove_stopwords(sent)) for sent in raw_sent)
    if words
]

In [None]:
# Inspect the tokenised sentences (a list of word lists).
story

In [None]:
# Collapse the tokenised sentences into one long space-separated string and
# keep it as the single element of story_concat.
token_join = [' '.join(sentence_tokens) for sentence_tokens in story]
storyjoin = ' '.join(token_join)
story_concat = [storyjoin]

story_concat

In [None]:
# Skip-gram (sg=1) Word2Vec with 1000-dimensional vectors and a context window
# of 5; words seen fewer than 3 times are dropped from the vocabulary.
model = gensim.models.Word2Vec(vector_size=1000, window=5, min_count=3, workers=4, sg=1)

# Vocabulary is built from the stop-word-filtered token lists, one list per line of dialogue.
model.build_vocab(corpus_iterable=filtered_tokens, progress_per=1000)

# 25 passes over the same corpus; the call is left as the last expression so
# its return value is shown as the cell output.
model.train(
    corpus_iterable=filtered_tokens,
    total_examples=model.corpus_count,
    epochs=25,
)

In [None]:
# Words whose vectors are closest to 'snape' (raises KeyError if 'snape' is not in the vocabulary).
model.wv.most_similar('snape')

In [None]:
# Odd-one-out check: which of these names fits least well with the rest of the list?
model.wv.doesnt_match(['dumbledore', 'ron', 'arthur', 'fred', 'george', 'ginny'])

In [None]:
# Odd-one-out check on four character names.
model.wv.doesnt_match(['harry', 'ron', 'hermione', 'malfoy'])

In [None]:
# Odd-one-out check: four house names plus 'voldemort'.
model.wv.doesnt_match(['gryffindor', 'slytherin', 'hufflepuff', 'ravenclaw', 'voldemort'])

In [None]:
# Cosine similarity between the vectors of 'ginny' and 'ron'.
model.wv.similarity('ginny', 'ron')

In [None]:
# Cosine similarity between the vectors of 'hermione' and 'ron'.
model.wv.similarity('hermione', 'ron')

In [None]:
# Cosine similarity between the vectors of 'voldemort' and 'ron'.
model.wv.similarity('voldemort', 'ron')

In [None]:
# Shape of the unit-normalised embedding matrix: (number of vocabulary words, vector_size).
model.wv.get_normed_vectors().shape

In [None]:
# Number of unique words kept in the model's vocabulary
# (words rarer than the model's min_count were dropped when the vocabulary was built).
len(model.wv.index_to_key)

In [None]:
import gensim
import os
import re
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim.models.doc2vec import TaggedDocument

In [None]:
# Attach character metadata to every line of dialogue (left join on 'Character ID').
df_characters = pd.read_csv("Harry_Potter_Movies/Characters.csv", encoding='ISO-8859-1')
df_chardialogues = pd.merge(df, df_characters, on='Character ID', how="left")


def _lines_spoken_by(character_id):
    """Return the rows of df_chardialogues whose 'Character ID' equals `character_id`."""
    return df_chardialogues[df_chardialogues['Character ID'] == character_id]


# One frame per selected character, in the order of the IDs below.
# NOTE(review): the ID -> character pairing is taken from the variable names
# only; confirm it against Characters.csv.
df_characters_list = [_lines_spoken_by(character_id) for character_id in (1, 2, 3, 4, 5, 6, 9)]
(
    df_dialHarry,
    df_dialRon,
    df_dialHerm,
    df_dialDumbl,
    df_dialHagr,
    df_dialSnape,
    df_dialVold,
) = df_characters_list

# Token lists for the lines with Character ID 1.
dialHarry = df_dialHarry['Tokens'].tolist()

In [None]:
# Inspect the token lists taken from the 'Tokens' column of df_dialHarry.
dialHarry

In [None]:
# Turn each token list in dialHarry into one space-separated string.
# No variable here is named `list`: rebinding that name would shadow the
# built-in for every later cell in the kernel and would make any
# `x.__class__ == list` comparison silently false.
doc = [' '.join(line) for line in dialHarry]

In [None]:
# Collect the per-line strings from `doc` and wrap them as a single document.
# Built in one pass without printing: echoing the accumulated list on every
# iteration produces output that grows quadratically with the number of lines.
# ''.join on a string returns an equal string, so `no` holds the same text as `doc`.
no = [''.join(line) for line in doc]
# `yes` holds the last processed line, or an empty list when `doc` is empty.
yes = no[-1] if no else []
document = [no]

In [None]:
# Merge every string in `doc` into one long space-separated text and keep it
# as the single element of story_concat.
token_join = [''.join(text) for text in doc]
storyjoin = ' '.join(token_join)
story_concat = [storyjoin]

story_concat

In [None]:
# The merged text itself (first and only element of story_concat).
story_concat[0]

In [None]:
# TaggedDocument expects `words` to be a list of tokens. story_concat[0] is one
# long string, and iterating a string yields single characters, so it is split
# on whitespace here; otherwise the model would learn vectors for letters
# instead of words.
documents = [TaggedDocument(words=story_concat[0].split(), tags=[0])]

In [None]:
tokenizer = RegexpTokenizer(r'\w+')
en_stop = get_stop_words('en')
p_stemmer = PorterStemmer()

# Set copy of the stop words for fast membership tests inside the loop.
en_stop_lookup = set(en_stop)

# taggeddoc: one TaggedDocument per character frame, tagged with its position in df_characters_list.
# texts:     the cleaned tokens of each character, with single-character tokens removed.
taggeddoc = []
texts = []

for index, character_df in enumerate(df_characters_list):
    # Each entry is a DataFrame, which has no .lower(); the text is assembled
    # from its 'Dialogue' column instead. Missing values are skipped so they
    # do not show up as the literal word "nan".
    raw = ' '.join(character_df['Dialogue'].dropna().astype(str)).lower()
    tokens = tokenizer.tokenize(raw)

    # remove stop words from tokens
    stopped_tokens = [word for word in tokens if word not in en_stop_lookup]

    # remove digits; a token such as "a1b" becomes the two tokens "a" and "b"
    number_tokens = ' '.join(re.sub(r'[\d]', ' ', word) for word in stopped_tokens).split()

    # stem tokens
    stemmed_tokens = [p_stemmer.stem(word) for word in number_tokens]

    # drop single-character leftovers before storing the cleaned text
    length_tokens = [word for word in stemmed_tokens if len(word) > 1]
    texts.append(length_tokens)

    # `tags` must be a list: gensim iterates over it, so a bare string such as
    # "10" would be read as the two tags "1" and "0".
    td = TaggedDocument(stemmed_tokens, [str(index)])
    taggeddoc.append(td)

In [None]:
# Create an untrained Doc2Vec model with default hyperparameters. The corpus is
# deliberately not passed here: handing it to the constructor already builds
# the vocabulary and trains, which would duplicate the explicit build_vocab()
# and train() calls that follow.
# Note: this rebinds `model`, replacing the Word2Vec model trained earlier.
model = gensim.models.Doc2Vec()

In [None]:
# Scan `documents` and build the model's vocabulary from it.
model.build_vocab(documents)

In [None]:
# Train for 25 passes over `documents`; total_examples reuses the document count recorded on the model.
model.train(documents, total_examples=model.corpus_count, epochs=25)

In [None]:
# Look up the vector stored in the model for the key 'aunt'.
# NOTE(review): item lookup directly on the model depends on the installed gensim version;
# model.wv['aunt'] is the explicit word-vector lookup — confirm which one is intended.
model['aunt']