In [122]:
import os
import sys
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
import gpt_2_simple as gpt2
import random
import pandas as pd
import numpy as np
import math
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.ERROR)
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
import logging
logging.getLogger('tensorflow').setLevel(logging.FATAL)
import contextlib
import re

from IPython.display import display_html

sys.path.append("../lib/InferSent")
from models import InferSent
import nltk
from nltk.corpus import stopwords
nltk.download('punkt')
import spacy
import torch
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

[nltk_data] Downloading package punkt to /home/ryan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# InferSent encoder configuration (version-2 weights paired with fastText vectors below).
params_model = {
    'bsize': 64,
    'word_emb_dim': 300,
    'enc_lstm_dim': 2048,
    'pool_type': 'max',
    'dpout_model': 0.0,
    'version': 2,
}
infersent = InferSent(params_model)
# NOTE(review): torch.load unpickles the file; only load encoder weights from a trusted source.
infersent.load_state_dict(torch.load('../models/encoder/infersent2.pkl'))
infersent.set_w2v_path("../models/fastText/crawl-300d-2M.vec")

nlp = spacy.load("en")
squad_df = pd.read_csv("../corpora/squad-dev-v2.0.csv", index_col=0)

# Collect every sentence of every distinct SQuAD context; the encoder
# vocabulary is built from these sentences.
sentences = []
contexts = list(squad_df["contexts"].drop_duplicates())
for context in contexts:
    doc = nlp(context)
    for sentence in doc.sents:
        sentences.append(sentence.string.strip())

infersent.build_vocab(sentences, tokenize=True)

Found 18481(/19809) words with w2v vectors
Vocab size : 18481


In [3]:
def cosine_similarity(text1, text2):
    """Return the cosine of the angle between two 1-D vectors.

    Args:
        text1: First vector (anything accepted by ``np.dot`` / ``np.linalg.norm``).
        text2: Second vector of the same length.

    Returns:
        ``dot(text1, text2) / (|text1| * |text2|)``, or ``0.0`` when either
        vector has zero norm (the ratio is undefined there and would otherwise
        come back as NaN together with a RuntimeWarning).
    """
    denominator = np.linalg.norm(text1) * np.linalg.norm(text2)
    if denominator == 0:
        # A zero vector has no direction; treat it as "not similar".
        return 0.0
    return np.dot(text1, text2) / denominator

In [4]:
def split_sentences(paragraph):
    """Split ``paragraph`` into sentences with the module-level spaCy pipeline.

    Returns a list with the whitespace-stripped text of each sentence span,
    in document order.
    """
    stripped_sentences = []
    for span in nlp(paragraph).sents:
        stripped_sentences.append(span.string.strip())
    return stripped_sentences

In [5]:
def cosine_predict(statements, question):
    """Find the sentence in ``statements`` most similar to ``question``.

    ``statements`` is split into sentences with ``split_sentences``; each
    sentence and the question are embedded with the module-level ``infersent``
    encoder and compared by cosine similarity.

    Args:
        statements: Text block to search.
        question: Text to compare every sentence against.

    Returns:
        ``(best_sentence, best_similarity)``. If no sentence scores above 0
        (or there are no sentences) the result is ``("", 0)``.
    """
    context_sentences = split_sentences(statements)
    if not context_sentences:
        return "", 0

    # The question embedding does not change between iterations, so encode it once.
    question_vector = infersent.encode([question])[0]

    most_similar, highest_sim = "", 0
    for sentence in context_sentences:
        similarity = cosine_similarity(question_vector, infersent.encode([sentence])[0])
        if similarity > highest_sim:
            most_similar = sentence
            highest_sim = similarity
    return most_similar, highest_sim

In [30]:
# Emotional profiles keyed by character name, read from one CSV per character.
# In each CSV the first column header is the character's name and every
# remaining column is an emotion whose first-row value is stored as its weight.
# "default" receives a uniform weight for every emotion seen.
profiles = {"default": {}}
profile_dir = "../corpora/profiles"
for profile in os.listdir(profile_dir):
    df = pd.read_csv(f"{profile_dir}/{profile}")
    column_names = df.columns.tolist()
    character = column_names[0]
    emotion_names = column_names[1:]
    profiles[character] = {}
    for emotion in emotion_names:
        profiles["default"][emotion] = 1 / len(emotion_names)
        profiles[character][emotion] = df[emotion].tolist()[0]

In [7]:
def get_emotional_composite(emotional_profile, response_length):
    """Split a response length across one to three randomly drawn emotions.

    Args:
        emotional_profile: Mapping of emotion name -> probability; the values
            are passed to ``np.random.choice`` as ``p``.
        response_length: Total length to distribute.

    Returns:
        A list of ``(emotion, length)`` tuples. Lengths are as even as
        possible, with the remainder handed out one unit at a time to the
        earliest entries.
    """
    composite_amount = random.randint(1, 3)
    emotions, probabilities = zip(*emotional_profile.items())
    drawn_emotions = np.random.choice(emotions, composite_amount, p=probabilities)

    response_breakdown = []
    for position, emotion in enumerate(drawn_emotions):
        share = math.floor(response_length / composite_amount)
        if position < response_length % composite_amount:
            share += 1
        response_breakdown.append((str(emotion), share))
    return response_breakdown

In [21]:
def generate_composite_response(sess, emotional_profile, conversation, character, response_length=30, scene=[]):
    """Generate one line for ``character`` by chaining per-emotion GPT-2 models.

    The prompt is the conversation rendered as ``speaker: statement`` lines,
    followed by a ``scene:`` line and ``{character}:``. The response length is
    split by ``get_emotional_composite``; for each ``(emotion, length)`` pair
    the ``{emotion}_run1`` checkpoint is loaded and asked to continue the text
    produced so far.

    Args:
        sess: Current TensorFlow session; it is reset before each model load.
        emotional_profile: Emotion -> probability mapping for the character.
        conversation: Sequence of ``(speaker, statement)`` pairs.
        character: Name of the speaker to generate for.
        response_length: Total ``length`` budget passed to ``gpt2.generate``.
        scene: Names currently in the scene (read only, never mutated here).

    Returns:
        The generated continuation, cut at the first ``name:`` marker and at
        the first newline.
    """
    response = "\n".join([f"{sentence[0]}: {sentence[1]}" for sentence in conversation]) + f"\nscene:{', '.join(scene)}\n{character}:"
    start_offset = len(response)
    response_breakdown = get_emotional_composite(emotional_profile, response_length)
    for emotion, length in response_breakdown:
        run_name = f"{emotion}_run1"
        # reset_session closes the old session and returns a fresh one; using
        # its return value avoids starting (and leaking) a second session.
        sess = gpt2.reset_session(sess)
        gpt2.load_gpt2(sess, run_name=run_name)
        response = gpt2.generate(
            sess,
            length=length,
            temperature=0.7,
            prefix=response,
            nsamples=1,
            batch_size=1,
            run_name=run_name,
            return_as_list=True
        )[0]
    # Keep only the newly generated text, up to the next speaker tag / newline.
    return re.split(r"[a-z A-Z0-9]+:", response[start_offset:])[0].strip().split("\n")[0]

In [22]:
def generate_character_response(sess, profile, conversation, character, response_length=30, scene=[]):
    """Generate one line for ``character`` with that character's own GPT-2 model.

    The prompt is the conversation rendered as ``speaker: statement`` lines,
    followed by a ``scene:`` line and ``{character}:``; the checkpoint loaded
    is ``{character}_run1``.

    Args:
        sess: Current TensorFlow session; it is reset before the model load.
        profile: Unused here; accepted so all response generators share a signature.
        conversation: Sequence of ``(speaker, statement)`` pairs.
        character: Name of the speaker (also selects the checkpoint).
        response_length: ``length`` passed to ``gpt2.generate``.
        scene: Names currently in the scene (read only, never mutated here).

    Returns:
        The generated continuation, cut at the first ``name:`` marker and at
        the first newline.
    """
    seed = "\n".join([f"{sentence[0]}: {sentence[1]}" for sentence in conversation]) + f"\nscene:{', '.join(scene)}\n{character}:"
    run_name = f"{character}_run1"
    # reset_session closes the old session and returns a fresh one; using its
    # return value avoids starting (and leaking) a second session.
    sess = gpt2.reset_session(sess)
    gpt2.load_gpt2(sess, run_name=run_name)
    response = gpt2.generate(
        sess,
        length=response_length,
        temperature=0.7,
        prefix=seed,
        nsamples=1,
        batch_size=1,
        run_name=run_name,
        return_as_list=True
    )[0][len(seed):]
    return re.split(r"[a-z A-Z0-9]+:", response)[0].strip().split("\n")[0]

In [24]:
def generate_holistic_model_response(sess, profile, conversation, character, response_length=30, scene=[]):
    """Generate one line for ``character`` with the single ``full_model_run1`` checkpoint.

    The prompt is the conversation rendered as ``speaker: statement`` lines,
    followed by a ``scene:`` line and ``{character}:``.

    Args:
        sess: Current TensorFlow session; it is reset before the model load.
        profile: Unused here; accepted so all response generators share a signature.
        conversation: Sequence of ``(speaker, statement)`` pairs.
        character: Name of the speaker to generate for.
        response_length: ``length`` passed to ``gpt2.generate``.
        scene: Names currently in the scene (read only, never mutated here).

    Returns:
        The generated continuation, cut at the first ``name:`` marker and at
        the first newline.
    """
    seed = "\n".join([f"{sentence[0]}: {sentence[1]}" for sentence in conversation]) + f"\nscene:{', '.join(scene)}\n{character}:"
    run_name = "full_model_run1"
    # reset_session closes the old session and returns a fresh one; using its
    # return value avoids starting (and leaking) a second session.
    sess = gpt2.reset_session(sess)
    gpt2.load_gpt2(sess, run_name=run_name)
    response = gpt2.generate(
        sess,
        length=response_length,
        temperature=0.7,
        prefix=seed,
        nsamples=1,
        batch_size=1,
        run_name=run_name,
        return_as_list=True
    )[0][len(seed):]
    return re.split(r"[a-z A-Z0-9]+:", response)[0].strip().split("\n")[0]

In [89]:
def start_conversation(
    conversation=None,
    scene=None,
    characters=["harry", "ron", "hermione", "snape", "albus dumbledore", "tom riddle", "hagrid", "user", "environment"],
    character_addition_prob=0.2,
    character_removal_prob=0.25,
    env_model=generate_character_response,
    char_model=generate_composite_response,
    length=10,
    print_scene=False,
    profiles=profiles,
    end_on_empty=True,
    write_to_file=False,
    file_name="conversation.txt"
):
    """Run a generated multi-character conversation, printing each turn.

    Each turn may randomly remove a character from / add a character to the
    scene, then picks a speaker (avoiding the previous speaker when more than
    one character is present) and obtains a line from ``input`` (for "user"),
    ``env_model`` (for "environment") or ``char_model`` (everyone else).

    Args:
        conversation: List of ``(speaker, statement)`` tuples used as the
            seed. It is appended to IN PLACE so the caller can inspect the
            full transcript afterwards. ``None`` starts from an empty list.
        scene: List of names currently present; mutated in place. ``None``
            means ``["harry", "user", "environment"]``.
        characters: Pool of names that may enter the scene (not mutated).
        character_addition_prob: Per-turn chance (after the first turn) of
            adding an absent character.
        character_removal_prob: Per-turn chance (after the first turn) of
            removing a random character.
        env_model: Generator called for the "environment" speaker.
        char_model: Generator called for every other non-user speaker.
        length: Maximum number of turns.
        print_scene: Append the current scene list to each printed line.
        profiles: Mapping of name -> emotional profile; ``profiles["default"]``
            is used for names that are missing.
        end_on_empty: Stop when the scene becomes empty; otherwise reseed the
            scene with one random character.
        write_to_file: Write the seed and each turn to ``output/{file_name}``.
            When false, no file is opened.
        file_name: File name inside ``output/``.
    """
    # Fresh lists per call: list defaults in the signature would be shared
    # (and mutated) across calls.
    if conversation is None:
        conversation = []
    if scene is None:
        scene = ["harry", "user", "environment"]

    print("Scene:")
    for speaker, statement in conversation:
        print(f"\t{speaker}: {statement}")
    print("")

    sess = gpt2.start_tf_sess()
    with contextlib.ExitStack() as stack:
        output_writer = None
        if write_to_file:
            output_writer = stack.enter_context(open(f"output/{file_name}", "w", buffering=1))
            for speaker, statement in conversation:
                # One seed statement per line so the transcript stays parseable.
                output_writer.write(f"\t{speaker}: {statement}\n")

        for i in range(length):
            # Scene churn is skipped on the very first turn (i == 0).
            if i and random.random() < character_removal_prob and len(scene):
                scene.remove(random.choice(scene))
            if i and random.random() < character_addition_prob and len(scene) != len(characters):
                absent = [candidate for candidate in characters if candidate not in scene]
                # The scene may hold names outside `characters`, in which case
                # nobody may be left to add even though the lengths differ.
                if absent:
                    scene.append(random.choice(absent))
            if not scene:
                if end_on_empty:
                    break
                scene = [random.choice(characters)]

            # Avoid the same speaker twice in a row when there is a choice.
            character = random.choice([
                candidate for candidate in scene
                if len(scene) < 2 or not conversation or candidate != conversation[-1][0]
            ])

            if character == "user":
                # Prompt outside the stdout redirect so it stays visible.
                response = input("user: ")
            else:
                model = env_model if character == "environment" else char_model
                profile = profiles[character] if character in profiles else profiles["default"]
                # Silence anything the model loading/generation prints to stdout.
                with open(os.devnull, "w") as devnull, contextlib.redirect_stdout(devnull):
                    response = model(sess, profile, conversation[-30:], character, scene=scene)
                print(f"{character.capitalize()}: {response}{' - ' + str(scene) if print_scene else ''}")

            conversation.append((character, response))
            if output_writer is not None:
                output_writer.write(f"{character.capitalize()}: {response}\n")

            if len(conversation) > 5:
                # Once the transcript is long enough, also append the earlier
                # sentence most similar to the new response (per cosine_predict)
                # as an extra entry for the same speaker. It is not printed or
                # written to the output file.
                predicted_response, _ = cosine_predict("\n".join([statement for _, statement in conversation[:-1]]), response)
                conversation.append((character, predicted_response))

In [90]:
# Seed dialogue as (speaker, statement) tuples. Despite the name, this list is
# passed to start_conversation as `conversation=`, not as its `scene=` argument.
scene = [
    ("environment", "Diagon Alley was completely empty because of the coronavirus."),
    ("hagrid", "Where is everybody?"),
    ("albus dumbledore", "Hopefully at home."),
]

In [91]:
# 100-turn run in which both the environment and the characters are generated
# by the holistic model. start_conversation appends every turn to the list
# passed as `conversation`, so `scene` holds the full transcript afterwards.
# With write_to_file=True the turns are written to output/conversation_100.txt.
start_conversation(
    conversation=scene, 
    scene=["harry", "ron", "hermione", "harry_as_voldemort", "environment"], 
    characters=["harry", "ron", "hermione", "snape", "albus dumbledore", "tom riddle", "hagrid", "harry_as_voldemort", "environment"],
    character_addition_prob=0.2,
    character_removal_prob=0.15,
    env_model=generate_holistic_model_response, 
    char_model=generate_holistic_model_response, 
    length=100,
    print_scene=True,
    profiles=profiles,
    end_on_empty=False,
    write_to_file=True,
    file_name="conversation_100.txt"
)

Scene:
	environment: Diagon Alley was completely empty because of the coronavirus.
	hagrid: Where is everybody?
	albus dumbledore: Hopefully at home.

Ron: They're not home. They're going to school. - ['harry', 'ron', 'hermione', 'harry_as_voldemort', 'environment']
Hermione: (rising from her desk) I've got a few things I've got to take care of. - ['harry', 'ron', 'hermione', 'harry_as_voldemort', 'environment']
Environment: They're walking down an empty street in the night. They pass a statue of a hunched over a sickly grey form. it reads MAJ - ['harry', 'ron', 'hermione', 'harry_as_voldemort', 'environment']
Harry: , iv. voldemort. - ['harry', 'hermione', 'harry_as_voldemort', 'environment']
Harry_as_voldemort: Diagon Alley. - ['harry', 'hermione', 'harry_as_voldemort', 'environment']
Hermione: (rising from her desk) I've got a few things I've got to take care of. - ['harry', 'hermione', 'harry_as_voldemort', 'environment']
Environment: _Across the street_, inside a shop. the boy, TO

Ron: yes. - ['harry', 'snape', 'ron', 'harry_as_voldemort', 'hermione', 'tom riddle']
Tom riddle: i don't know how this house came to be, but i'm going to expect it to act as i see fit. - ['harry', 'snape', 'ron', 'harry_as_voldemort', 'hermione', 'tom riddle']
Snape: you may recall, prior to the start of term, i did express my concerns when you appointed professor -- - ['harry', 'snape', 'ron', 'hermione', 'tom riddle', 'albus dumbledore']
Tom riddle: you go to bed. harry paces. looks around. hears laughter. breaks off a cookie, hands it to snape. - ['harry', 'snape', 'ron', 'hermione', 'tom riddle', 'albus dumbledore']
Ron: good to see you're having fun. - ['harry', 'snape', 'ron', 'hermione', 'tom riddle', 'albus dumbledore']
Snape: mr. potter. we have information regarding you that requires your immediate attention. as a matter of policy, the castle's grounds are not to - ['harry', 'snape', 'ron', 'hermione', 'albus dumbledore', 'harry_as_voldemort']
Harry: sirius. sirius. - ['harr

In [92]:
# Dump every (speaker, statement) entry currently in `scene`, one per line.
with open("output/conversation_100_holistic_full_debug.txt", "w") as debug_writer:
    debug_writer.writelines(
        f"{speaker}: {statement}\n" for speaker, statement in scene
    )

In [85]:
# Same seed dialogue as the holistic run, redefined so this run starts from
# the three seed statements only.
scene = [
    ("environment", "Diagon Alley was completely empty because of the coronavirus."),
    ("hagrid", "Where is everybody?"),
    ("albus dumbledore", "Hopefully at home."),
]

# Environment lines come from the holistic model; every other speaker uses
# generate_character_response, which loads a "{character}_run1" checkpoint.
# NOTE(review): the recorded output of this cell ends with
# FileNotFoundError for 'checkpoint/hagrid_run1/hparams.json', so a per-character
# checkpoint must exist for every name that can enter the scene.
start_conversation(
    conversation=scene, 
    scene=["harry", "ron", "hermione", "hagrid", "albus dumbledore", "environment"], 
    characters=["harry", "ron", "hermione", "snape", "albus dumbledore", "tom riddle", "hagrid", "harry_as_voldemort", "environment"],
    character_addition_prob=0.21,
    character_removal_prob=0.15,
    env_model=generate_holistic_model_response, 
    char_model=generate_character_response, 
    length=100,
    print_scene=True,
    profiles=profiles,
    end_on_empty=False,
    write_to_file=True,
    file_name="conversation_100_single_char.txt"
)

# Dump every entry start_conversation appended to `scene`, one per line.
with open("output/conversation_100_single_char_full_debug.txt", "w") as debug_writer:
    for speaker, statement in scene:
        debug_writer.write(f"{speaker}: {statement}\n")

Scene:
	environment: Diagon Alley was completely empty because of the coronavirus.
	hagrid: Where is everybody?
	albus dumbledore: Hopefully at home.

Ron: Everyone's safe. - ['ron']
Tom riddle: I'm afraid I can't do that, Harry. You see, as poor Ginny grows weaker... I grow stronger. - ['tom riddle']
Harry: But if she grows stronger, then... I mean, what was the point of sending her in there? - ['harry']
Harry_as_voldemort: The train leaves at exactly eleven o'clock. We've missed it. - ['harry_as_voldemort']
Snape: That's what the dog's guarding. That's what Snape wants. - ['snape']
Environment: Harry, Hermione and Ron are walking through crowded halls. They approach a door, which opens and closes. - ['environment']
Ron: Then it'll just be you and the dog fighting? - ['ron']
Tom riddle: Yes, Harry, it was Tom Riddle who opened the Chamber of Secrets. - ['tom riddle']
Tom riddle: She'll never be gone! Not as long as those who remain are loyal to me! Suddenly, MUSIC... EERIE and UNEARTH

FileNotFoundError: [Errno 2] No such file or directory: 'checkpoint/hagrid_run1/hparams.json'

In [123]:
def clean_text(content):
    """Normalise raw text for the bag-of-words emotion classifier.

    Lower-cases the input, replaces every non-letter character with a space,
    and removes NLTK English stop words.

    Args:
        content: Text to clean. Non-string values are coerced with ``str``
            before any string method is called.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Coerce first: calling .lower() on a non-string value would raise.
    content = str(content).lower().strip()
    content = re.sub(r"[^a-zA-Z]", " ", content)
    # Load the stop-word list once per call instead of once per token;
    # a set makes each membership test constant time.
    stop_words = set(stopwords.words("english"))
    # str.split() already collapses whitespace runs and never yields "".
    tokens = [word for word in content.split() if word not in stop_words]
    return " ".join(tokens)

# Load the ISEAR corpus (no header row), name its three columns, and drop the
# unnamed third one.
isear_df = pd.read_csv("../corpora/isear.csv", header=None)
isear_df.columns = ["emotion", "text", ""]
isear_df = isear_df.drop(columns=[""])
cleaned_text = list(map(clean_text, isear_df["text"].tolist()))

# 70/30 train/test split with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    cleaned_text,
    isear_df["emotion"].tolist(),
    test_size=0.3,
    random_state=1,
)

In [125]:
# Bag-of-words and TF-IDF steps, first fitted on the full cleaned corpus.
# NOTE(review): Pipeline.fit below calls fit_transform on each step again with
# X_train, so these standalone fits are presumably superseded — confirm whether
# training_counts / bag_of_words are still needed elsewhere.
count_vectorizer = CountVectorizer()
training_counts = count_vectorizer.fit_transform(cleaned_text)
bag_of_words = count_vectorizer.transform(cleaned_text)
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(bag_of_words)

# Logistic-regression classifier trained by SGD with an L1 penalty.
sgd_classifier = SGDClassifier(loss="log", penalty='l1', random_state=1)
pipeline_steps = [
    ('vect', count_vectorizer),
    ('tfidf', tfidf_transformer),
    ('clf', sgd_classifier),
]
log_regression = Pipeline(pipeline_steps)

log_regression.fit(X_train, Y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                ('clf',
                 SGDClassifier(alpha=0.0001, average=False, class_weight=None,
                               early_stopping=False, epsilon=0.1, eta0=0.0,
                               fit_intercept=True, l1_ratio=0.15,
                               learning_rate='optimal', loss='log',
                       

In [126]:
def get_class_probs(classifier, data):
    """Return a one-row frame of class probabilities for a single sample.

    Args:
        classifier: Fitted estimator exposing ``classes_`` and ``predict``;
            ``predict_proba`` is used when available.
        data: One raw sample; it is wrapped in a list before being passed on.

    Returns:
        A single-row ``DataFrame`` with one column per class, ordered by
        descending probability, plus a ``predicted`` column holding the
        label returned by ``classifier.predict``.
    """
    prob_spread = classifier.predict_proba([data]).tolist()[0] if hasattr(classifier, "predict_proba") else None
    probabilities = (
        pd.DataFrame({"class": classifier.classes_, "probability": prob_spread})
        .sort_values(by="probability", ascending=False)
        .set_index("class")
        .T
        .reset_index(drop=True)
    )
    # Use the classifier that was passed in, not a module-level model.
    probabilities["predicted"] = classifier.predict([data])
    return probabilities

In [127]:
def display_dataframes_inline(*dataframes):
    """Render several DataFrames side by side in the notebook output.

    Each frame is converted with ``to_html`` and every opening ``<table`` tag
    gets an inline-display style before the combined markup is handed to
    ``display_html``.

    Args:
        *dataframes: Frames to show, left to right.
    """
    combined_html = "".join(df.to_html() for df in dataframes)
    # Target only opening tags: replacing the bare word "table" would also
    # rewrite closing tags and any cell text that contains "table".
    inline_html = combined_html.replace("<table", "<table style='display: inline'")
    display_html(inline_html, raw=True)

In [170]:
# Parse a generated transcript ("Speaker: statement" per line). A
# multi-character separator is only supported by the python parser engine;
# naming it explicitly avoids the engine-fallback warning.
scene_df = pd.read_csv(
    "output/conversation_200.txt",
    sep=": ",
    names=["character", "statement"],
    engine="python",
)
generated_character_profiles = {}
character_list = ["harry", "ron", "hermione", "snape", "albus dumbledore", "tom riddle", "hagrid", "harry_as_voldemort"]
for character in character_list:
    # Transcript speaker names are written with str.capitalize().
    character_lines = scene_df[scene_df.character == character.capitalize()]
    emotion_breakdown = pd.concat(
        [get_class_probs(log_regression, row.statement) for i, row in character_lines.iterrows()],
        sort=False,
    )
    # "predicted" holds string labels; drop it explicitly rather than relying
    # on mean() silently skipping non-numeric columns.
    mean_profile = emotion_breakdown.drop(columns=["predicted"]).mean()
    profile_row = pd.DataFrame(mean_profile).T
    profile_row.index.name = character
    generated_character_profiles[character] = (
        profile_row.T.rename(columns={0: "percent"}).sort_values(by="percent", ascending=False)
    )
display_dataframes_inline(*[df for _, df in generated_character_profiles.items()])

  """Entry point for launching an IPython kernel.


harry,percent
joy,0.185362
fear,0.167128
shame,0.157771
guilt,0.134952
disgust,0.129723
anger,0.125723
sadness,0.099341

ron,percent
disgust,0.182333
anger,0.161311
guilt,0.157838
joy,0.146487
shame,0.131529
fear,0.118992
sadness,0.101512

hermione,percent
guilt,0.202032
joy,0.160415
anger,0.152382
disgust,0.137001
shame,0.136766
sadness,0.12301
fear,0.088394

snape,percent
shame,0.240175
fear,0.181152
anger,0.144979
disgust,0.124236
guilt,0.121135
sadness,0.099022
joy,0.089301

albus dumbledore,percent
shame,0.169533
anger,0.160235
fear,0.157863
joy,0.138714
disgust,0.13556
guilt,0.134512
sadness,0.103582

tom riddle,percent
shame,0.179887
fear,0.176536
anger,0.166075
joy,0.15005
guilt,0.133627
disgust,0.113813
sadness,0.080012

hagrid,percent
fear,0.20233
shame,0.166199
anger,0.149175
joy,0.132464
disgust,0.129813
guilt,0.123038
sadness,0.09698

harry_as_voldemort,percent
anger,0.167776
fear,0.166763
disgust,0.164662
shame,0.148685
guilt,0.133939
joy,0.124216
sadness,0.09396


In [147]:
# Reference emotional profiles, one transposed frame per profile CSV, keyed by
# the file name minus its last four characters (the extension).
expected_profiles = {}
profile_dir = "../corpora/profiles"
for profile in os.listdir(profile_dir):
    profile_key = profile[:-4]
    transposed = pd.read_csv(f"{profile_dir}/{profile}").T
    # After transposing, the first row carries the column label(s); the
    # remaining rows are the per-emotion values.
    header = transposed.iloc[0]
    emotion_rows = transposed[1:]
    emotion_rows.columns = header
    expected_profiles[profile_key] = emotion_rows
display_dataframes_inline(*[df for _, df in expected_profiles.items()])

tom riddle,percent
fear,0.219053
anger,0.158987
shame,0.140221
disgust,0.133587
guilt,0.133363
joy,0.114099
sadness,0.10069

harry,percent
disgust,0.1648
anger,0.156473
fear,0.155551
guilt,0.14739
shame,0.147204
joy,0.120979
sadness,0.107603

environment,percent
fear,0.186211
disgust,0.182388
anger,0.155723
shame,0.14614
guilt,0.13493
joy,0.104587
sadness,0.0900213

hagrid,percent
shame,0.157567
disgust,0.156008
anger,0.150552
guilt,0.148776
fear,0.142127
joy,0.139677
sadness,0.105294

albus dumbledore,percent
fear,0.172791
anger,0.164309
disgust,0.157167
shame,0.14895
guilt,0.145949
joy,0.116367
sadness,0.0944675

ron,percent
disgust,0.169034
anger,0.157182
fear,0.155339
shame,0.148929
guilt,0.147232
joy,0.119162
sadness,0.103121

hermione,percent
disgust,0.161953
fear,0.160636
shame,0.157294
anger,0.149277
guilt,0.138922
joy,0.133628
sadness,0.0982899

snape,percent
disgust,0.162403
anger,0.158784
fear,0.155455
shame,0.154314
guilt,0.153915
joy,0.11965
sadness,0.0954785


In [168]:
def character_similarity(character_profile_1, character_profile_2):
    """Cosine similarity between two single-column emotion profiles.

    Both frames are sorted by index before their first (transposed) row is
    extracted, so values are compared in index order.
    # assumes both profiles share the same set of index labels — TODO confirm
    """
    first_vector = character_profile_1.sort_index().T.values[0].tolist()
    second_vector = character_profile_2.sort_index().T.values[0].tolist()
    return cosine_similarity(first_vector, second_vector)

In [183]:
# Compare each generated profile with its reference profile.
similarity_list = []
for character in character_list:
    # Names containing an underscore past position 0 are reduced to the part
    # after the last underscore ("harry_as_voldemort" -> "voldemort"), and
    # "voldemort" is looked up under the "tom riddle" reference profile.
    suffix_start = character.rfind("_") + 1 if character.find("_") > 0 else 0
    expected_character = character[suffix_start:].replace("voldemort", "tom riddle")
    generated_profile = generated_character_profiles[character]
    reference_profile = expected_profiles[expected_character]
    similarity_list.append(character_similarity(generated_profile, reference_profile))

pd.DataFrame({"character": character_list, "book_similarity": similarity_list})

Unnamed: 0,character,book_similarity
0,harry,0.976888
1,ron,0.99088
2,hermione,0.960975
3,snape,0.962648
4,albus dumbledore,0.993733
5,tom riddle,0.981684
6,hagrid,0.982585
7,harry_as_voldemort,0.98674
