In [1]:
import os
import sys
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
import gpt_2_simple as gpt2
import random
import pandas as pd
import numpy as np
import math
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.ERROR)
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
import logging
logging.getLogger('tensorflow').setLevel(logging.FATAL)
import contextlib
import re

sys.path.append("../lib/InferSent")
from models import InferSent
import nltk
nltk.download('punkt')
import spacy
import torch

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



[nltk_data] Downloading package punkt to /home/ryan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Build the InferSent sentence encoder (version 2 weights, paired here with the
# fastText crawl vectors) and restore its trained parameters from disk.
params_model = {
    'bsize': 64,
    'word_emb_dim': 300,
    'enc_lstm_dim': 2048,
    'pool_type': 'max',
    'dpout_model': 0.0,
    'version': 2,
}
infersent = InferSent(params_model)
infersent.load_state_dict(torch.load('../models/encoder/infersent2.pkl'))
infersent.set_w2v_path("../models/fastText/crawl-300d-2M.vec")

# spaCy pipeline used for sentence segmentation, and the SQuAD dev contexts
# whose sentences seed the encoder vocabulary.
nlp = spacy.load("en")
squad_df = pd.read_csv("../corpora/squad-dev-v2.0.csv", index_col=0)

contexts = list(squad_df["contexts"].drop_duplicates())

# Collect every sentence of every unique context paragraph.
sentences = []
for context in contexts:
    doc = nlp(context)
    sentences.extend(sentence.string.strip() for sentence in doc.sents)

infersent.build_vocab(sentences, tokenize=True)

Found 18481(/19809) words with w2v vectors
Vocab size : 18481


In [3]:
def cosine_similarity(text1, text2):
    """Return the cosine similarity between two 1-D embedding vectors.

    Args:
        text1: First vector (anything ``np.dot`` / ``np.linalg.norm`` accept).
        text2: Second vector, same length as ``text1``.

    Returns:
        ``dot(text1, text2) / (|text1| * |text2|)``. When either vector has
        zero norm the similarity is undefined; ``0.0`` is returned instead of
        the NaN (plus RuntimeWarning) that a division by zero would produce.
    """
    norm_product = np.linalg.norm(text1) * np.linalg.norm(text2)
    if norm_product == 0:
        # A zero vector has no direction: treat it as "not similar".
        return 0.0
    return np.dot(text1, text2) / norm_product

In [4]:
def split_sentences(paragraph):
    """Segment ``paragraph`` with the module-level spaCy pipeline ``nlp``.

    Returns:
        A list with one whitespace-stripped string per detected sentence,
        in document order.
    """
    stripped_sentences = []
    for span in nlp(paragraph).sents:
        stripped_sentences.append(span.string.strip())
    return stripped_sentences

In [5]:
def cosine_predict(statements, question):
    """Find the sentence in ``statements`` most similar to ``question``.

    ``statements`` is split into sentences with ``split_sentences``; each
    sentence and the question are embedded with the module-level ``infersent``
    encoder and compared with ``cosine_similarity``.

    Args:
        statements: Text block to search.
        question: Text to compare every sentence against.

    Returns:
        ``(most_similar_sentence, similarity)``. If no sentence has a
        similarity strictly greater than 0 (including when ``statements``
        yields no sentences) the result is ``("", 0)``.
    """
    context_sentences = split_sentences(statements)
    most_similar, highest_sim = "", 0
    if not context_sentences:
        # Nothing to compare against; skip the (expensive) encoder call.
        return most_similar, highest_sim

    # The question embedding does not change between iterations, so encode it
    # once instead of once per candidate sentence.
    question_embedding = infersent.encode([question])[0]
    for sentence in context_sentences:
        sentence_embedding = infersent.encode([sentence])[0]
        similarity = cosine_similarity(question_embedding, sentence_embedding)
        if similarity > highest_sim:
            most_similar = sentence
            highest_sim = similarity
    return most_similar, highest_sim

In [30]:
# Emotional profiles keyed by character name. Each CSV in ``profile_dir`` has
# the character name as its first column header and one column per emotion;
# the first data row supplies that character's value for each emotion.
# "default" receives a uniform weight over the emotion columns of each file.
profiles = {"default": {}}
profile_dir = "../corpora/profiles"
for profile in os.listdir(profile_dir):
    df = pd.read_csv(f"{profile_dir}/{profile}")
    column_names = df.columns.tolist()
    character = column_names[0]
    emotion_columns = column_names[1:]
    profiles[character] = {}
    profiles["default"].update(
        {emotion: 1 / len(emotion_columns) for emotion in emotion_columns}
    )
    profiles[character].update(
        {emotion: df[emotion].tolist()[0] for emotion in emotion_columns}
    )

In [7]:
def get_emotional_composite(emotional_profile, response_length):
    """Split a response of ``response_length`` tokens across 1-3 sampled emotions.

    Between one and three emotions are drawn (with replacement) from
    ``emotional_profile`` in proportion to their weights, and
    ``response_length`` is divided between them as evenly as possible, the
    earliest segments receiving any remainder.

    Args:
        emotional_profile: Mapping of emotion name -> non-negative weight.
            Weights are normalised here, so they do not have to sum to 1
            (``np.random.choice`` itself rejects probabilities that do not).
        response_length: Total length to distribute.

    Returns:
        List of ``(emotion, length)`` tuples whose lengths sum to
        ``response_length``.

    Raises:
        ValueError: If the profile is empty, contains a negative weight, or
            its weights do not add up to a positive finite number.
    """
    if not emotional_profile:
        raise ValueError("emotional_profile must contain at least one emotion")

    emotions, weights = zip(*emotional_profile.items())
    weights = np.asarray(weights, dtype=float)
    total_weight = weights.sum()
    if not np.isfinite(total_weight) or total_weight <= 0 or (weights < 0).any():
        raise ValueError("emotional_profile weights must be non-negative and sum to a positive number")
    probabilities = weights / total_weight

    composite_amount = random.randint(1, 3)
    chosen_emotions = np.random.choice(emotions, composite_amount, p=probabilities)

    # Even split; the first `remainder` segments get one extra unit.
    base_length, remainder = divmod(response_length, composite_amount)
    response_breakdown = [
        (str(emotion), int(base_length) + int(i < remainder))
        for i, emotion in enumerate(chosen_emotions)
    ]
    return response_breakdown

In [21]:
def generate_composite_response(sess, emotional_profile, conversation, character, response_length=30, scene=[]):
    """Generate one line for ``character`` by chaining emotion-specific GPT-2 models.

    The prompt is the conversation so far (``speaker: statement`` per line),
    a ``scene:`` line listing who is present, and ``{character}:``. The
    response length is split across 1-3 emotions by
    ``get_emotional_composite``; for each emotion the ``{emotion}_run1``
    checkpoint is loaded and continues the text produced so far.

    Args:
        sess: An existing gpt-2-simple TensorFlow session. It is closed by
            this function (``gpt2.reset_session`` closes the session it is
            given), so callers must not reuse it for inference afterwards.
        emotional_profile: Mapping of emotion name -> probability.
        conversation: Sequence of ``(speaker, statement)`` pairs.
        character: Name of the speaker to generate for.
        response_length: Total generation length shared between the emotions.
        scene: Names of the characters present (never mutated here).

    Returns:
        The first line of the generated continuation, cut off before the
        next ``name:`` speaker marker and stripped of whitespace.
    """
    response = "\n".join([f"{sentence[0]}: {sentence[1]}" for sentence in conversation]) + f"\nscene:{', '.join(scene)}\n{character}:"
    start_offset = len(response)
    response_breakdown = get_emotional_composite(emotional_profile, response_length)
    try:
        for emotion, length in response_breakdown:
            run_name = f"{emotion}_run1"
            # reset_session closes the old session and RETURNS the fresh one;
            # starting another session on top of it would leak a session.
            sess = gpt2.reset_session(sess)
            gpt2.load_gpt2(sess, run_name=run_name)
            response = gpt2.generate(
                sess,
                length=length,
                temperature=0.7,
                prefix=response,
                nsamples=1,
                batch_size=1,
                run_name=run_name,
                return_as_list=True
            )[0]
    finally:
        # The last session created above is not visible to the caller, so it
        # has to be released here.
        sess.close()
    return re.split(r"[a-z A-Z0-9]+:", response[start_offset:])[0].strip().split("\n")[0]

In [22]:
def generate_character_response(sess, profile, conversation, character, response_length=30, scene=[]):
    """Generate one line for ``character`` with that character's own GPT-2 model.

    The prompt is the conversation so far (``speaker: statement`` per line),
    a ``scene:`` line listing who is present, and ``{character}:``. The
    ``{character}_run1`` checkpoint is loaded to continue it.

    Args:
        sess: An existing gpt-2-simple TensorFlow session. It is closed by
            this function (``gpt2.reset_session`` closes the session it is
            given), so callers must not reuse it for inference afterwards.
        profile: Unused; accepted so all response generators share one
            call signature.
        conversation: Sequence of ``(speaker, statement)`` pairs.
        character: Name of the speaker to generate for.
        response_length: Generation length passed to ``gpt2.generate``.
        scene: Names of the characters present (never mutated here).

    Returns:
        The first line of the generated continuation, cut off before the
        next ``name:`` speaker marker and stripped of whitespace.
    """
    seed = "\n".join([f"{sentence[0]}: {sentence[1]}" for sentence in conversation]) + f"\nscene:{', '.join(scene)}\n{character}:"
    run_name = f"{character}_run1"
    # reset_session closes the old session and RETURNS the fresh one; starting
    # another session on top of it would leak a session.
    sess = gpt2.reset_session(sess)
    try:
        gpt2.load_gpt2(sess, run_name=run_name)
        response = gpt2.generate(
            sess,
            length=response_length,
            temperature=0.7,
            prefix=seed,
            nsamples=1,
            batch_size=1,
            run_name=run_name,
            return_as_list=True
        )[0][len(seed):]
    finally:
        # The session created above is not visible to the caller, so it has
        # to be released here.
        sess.close()
    return re.split(r"[a-z A-Z0-9]+:", response)[0].strip().split("\n")[0]

In [24]:
def generate_holistic_model_response(sess, profile, conversation, character, response_length=30, scene=[]):
    """Generate one line for ``character`` with the single ``full_model_run1`` model.

    The prompt is the conversation so far (``speaker: statement`` per line),
    a ``scene:`` line listing who is present, and ``{character}:``.

    Args:
        sess: An existing gpt-2-simple TensorFlow session. It is closed by
            this function (``gpt2.reset_session`` closes the session it is
            given), so callers must not reuse it for inference afterwards.
        profile: Unused; accepted so all response generators share one
            call signature.
        conversation: Sequence of ``(speaker, statement)`` pairs.
        character: Name of the speaker to generate for.
        response_length: Generation length passed to ``gpt2.generate``.
        scene: Names of the characters present (never mutated here).

    Returns:
        The first line of the generated continuation, cut off before the
        next ``name:`` speaker marker and stripped of whitespace.
    """
    seed = "\n".join([f"{sentence[0]}: {sentence[1]}" for sentence in conversation]) + f"\nscene:{', '.join(scene)}\n{character}:"
    run_name = "full_model_run1"
    # reset_session closes the old session and RETURNS the fresh one; starting
    # another session on top of it would leak a session.
    sess = gpt2.reset_session(sess)
    try:
        gpt2.load_gpt2(sess, run_name=run_name)
        response = gpt2.generate(
            sess,
            length=response_length,
            temperature=0.7,
            prefix=seed,
            nsamples=1,
            batch_size=1,
            run_name=run_name,
            return_as_list=True
        )[0][len(seed):]
    finally:
        # The session created above is not visible to the caller, so it has
        # to be released here.
        sess.close()
    return re.split(r"[a-z A-Z0-9]+:", response)[0].strip().split("\n")[0]

In [74]:
def start_conversation(
    conversation=None,
    scene=None,
    characters=["harry", "ron", "hermione", "snape", "albus dumbledore", "tom riddle", "hagrid", "user", "environment"],
    character_addition_prob=0.2,
    character_removal_prob=0.25,
    env_model=generate_character_response,
    char_model=generate_composite_response,
    length=10,
    print_scene=False,
    profiles=profiles,
    end_on_empty=True,
    write_to_file=False,
    file_name="conversation.txt"
):
    """Run an interactive / generated multi-character conversation.

    On every turn a speaker is picked at random from ``scene`` (avoiding the
    previous speaker when more than one character is present). ``"user"``
    turns read a line from stdin, ``"environment"`` turns call ``env_model``
    and every other character calls ``char_model``. After the first turn
    characters randomly leave and join the scene.

    Args:
        conversation: List of ``(speaker, statement)`` pairs used as the seed.
            A list passed by the caller is extended IN PLACE with every new
            turn. Defaults to a fresh empty list per call.
        scene: List of characters currently present; a list passed by the
            caller is mutated in place. Defaults to a fresh
            ``["harry", "user", "environment"]`` per call.
        characters: Pool of characters that may join the scene.
        character_addition_prob: Per-turn probability that a character joins.
        character_removal_prob: Per-turn probability that a character leaves.
        env_model: Callable ``(sess, profile, conversation, character, scene=...)``
            used for ``"environment"`` turns.
        char_model: Callable with the same signature used for other characters.
        length: Maximum number of turns.
        print_scene: Append the current scene to every printed line.
        profiles: Mapping of character -> emotional profile, with a
            ``"default"`` entry used for characters that have none.
        end_on_empty: Stop when the scene becomes empty; otherwise repopulate
            it with one random character.
        write_to_file: When true, write the seed and every turn to
            ``output/{file_name}`` (the file is not touched otherwise).
        file_name: Name of the transcript file inside ``output/``.

    Returns:
        None. Results are delivered through ``conversation``, stdout and the
        optional transcript file.
    """
    # Fresh containers per call: the original mutable defaults were mutated
    # below and therefore leaked state from one call into the next.
    if conversation is None:
        conversation = []
    if scene is None:
        scene = ["harry", "user", "environment"]

    print("Scene:")
    for character, statement in conversation:
        print(f"\t{character}: {statement}")
    print("")

    sess = gpt2.start_tf_sess()
    with contextlib.ExitStack() as stack:
        output_writer = None
        if write_to_file:
            output_path = f"output/{file_name}"
            os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
            output_writer = stack.enter_context(open(output_path, "w", buffering=1))
            # One seed statement per line (the newline used to be missing,
            # which glued the seed and the first turn together).
            for character, statement in conversation:
                output_writer.write(f"\t{character}: {statement}\n")

        for i in range(length):
            # From the second turn on, characters may leave / join.
            if i and random.random() < character_removal_prob:
                scene.remove(random.choice(scene))
            if i and random.random() < character_addition_prob:
                absent_characters = [candidate for candidate in characters if candidate not in scene]
                if absent_characters:  # nobody left to add otherwise
                    scene.append(random.choice(absent_characters))
            if not scene:
                if end_on_empty:
                    break
                scene = [random.choice(characters)]

            # Avoid letting the same character speak twice in a row, unless
            # they are alone or nothing has been said yet.
            eligible_speakers = [
                candidate for candidate in scene
                if len(scene) < 2 or not conversation or candidate != conversation[-1][0]
            ]
            character = random.choice(eligible_speakers or scene)

            if character == "user":
                # Read outside the stdout redirect so the prompt stays visible.
                response = input("user: ")
            else:
                speaker_profile = profiles[character] if character in profiles else profiles["default"]
                model = env_model if character == "environment" else char_model
                # Silence anything the generation stack prints to stdout.
                with open(os.devnull, "w") as devnull, contextlib.redirect_stdout(devnull):
                    response = model(sess, speaker_profile, conversation[-30:], character, scene=scene)
                print(f"{character.capitalize()}: {response}{' - ' + str(scene) if print_scene else ''}")

            conversation.append((character, response))
            if output_writer is not None:
                output_writer.write(f"{character.capitalize()}: {response}\n")

            if len(conversation) > 5:
                # NOTE(review): this appends the earlier statement most similar
                # to the new response as an extra, unprinted turn by the same
                # speaker (possibly an empty string). Behaviour kept as-is;
                # confirm it is intentional.
                predicted_response, _ = cosine_predict("\n".join([statement for _, statement in conversation[:-1]]), response)
                conversation.append((character, predicted_response))

In [75]:
# Seed conversation as (speaker, statement) pairs. Despite its name, this list
# is passed as the `conversation` argument of start_conversation below (that
# function's own `scene` parameter is the list of characters present).
scene = [
    ("environment", "Diagon Alley was completely empty because of the coronavirus."),
    ("hagrid", "Where is everybody?"),
    ("albus dumbledore", "Hopefully at home."),
]

In [None]:
# Up to 200 turns in which both environment and character lines come from the
# single holistic model; the transcript goes to output/conversation_200.txt.
# `scene` (the seed conversation) is extended in place by the call.
start_conversation(
    conversation=scene, 
    scene=["harry", "ron", "hermione", "harry_as_voldemort", "environment"], 
    characters=["harry", "ron", "hermione", "snape", "albus dumbledore", "tom riddle", "hagrid", "harry_as_voldemort", "environment"],
    character_addition_prob=0.2,
    character_removal_prob=0.15,
    env_model=generate_holistic_model_response, 
    char_model=generate_holistic_model_response, 
    length=200,
    print_scene=True,
    profiles=profiles,
    end_on_empty=False,
    write_to_file=True,
    file_name="conversation_200.txt"
)

Scene:
	environment: Diagon Alley was completely empty because of the coronavirus.
	hagrid: Where is everybody?
	albus dumbledore: Hopefully at home.

Environment: Harry washes his face with a towel. - ['harry', 'ron', 'hermione', 'harry_as_voldemort', 'environment']
Hermione: There's no sign of them. There's just... silence. - ['harry', 'ron', 'hermione', 'harry_as_voldemort', 'environment']
Environment: They were walking through fog and shadows. - ['harry', 'ron', 'hermione', 'harry_as_voldemort', 'environment']
Harry: We've been roundly bitten, haven't we? - ['harry', 'ron', 'hermione', 'environment']
Environment: They stop as they see someone walking around the corner. - ['harry', 'ron', 'hermione', 'environment', 'hagrid']
Harry: That's not good. - ['harry', 'ron', 'hermione', 'environment', 'hagrid']
Ron: That's not good. I can't believe they're going to kill harry. - ['harry', 'ron', 'hermione', 'hagrid']
Harry: That's not good. I can't believe they're going to kill harry. - ['h

In [None]:
# Re-create the seed conversation so this run does not start from the turns a
# previous start_conversation call appended to the old list.
scene = [
    ("environment", "Diagon Alley was completely empty because of the coronavirus."),
    ("hagrid", "Where is everybody?"),
    ("albus dumbledore", "Hopefully at home."),
]

# Up to 200 turns: environment lines come from the holistic model, character
# lines from each character's own `{character}_run1` model.
start_conversation(
    conversation=scene, 
    scene=["harry", "ron", "hermione", "hagrid", "albus dumbledore", "environment"], 
    characters=["harry", "ron", "hermione", "snape", "albus dumbledore", "tom riddle", "hagrid", "harry_as_voldemort", "environment"],
    character_addition_prob=0.21,
    character_removal_prob=0.15,
    env_model=generate_holistic_model_response, 
    char_model=generate_character_response, 
    length=200,
    print_scene=True,
    profiles=profiles,
    end_on_empty=False,
    write_to_file=True,
    file_name="conversation_200_single_char.txt"
)

# Dump the full conversation list. This relies on start_conversation having
# appended every turn to `scene` in place, so the dump also contains the extra
# cosine-matched entries that are not written to the regular transcript.
with open("output/conversation_200_single_char_full_debug.txt", "w") as debug_writer:
    for speaker, statement in scene:
        debug_writer.write(f"{speaker}: {statement}\n")

In [None]:
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    if "english" not in _STOP_WORD_CACHE:
        # Imported here because the notebook's import cell never brings
        # `stopwords` into scope. Presumably requires the NLTK "stopwords"
        # corpus to have been downloaded - TODO confirm on a fresh machine.
        from nltk.corpus import stopwords
        _STOP_WORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOP_WORD_CACHE["english"]


def clean_text(content, stop_words=None):
    """Normalise ``content`` to lowercase alphabetic tokens without stop words.

    Args:
        content: Text to clean. Non-string values are converted with ``str``
            before any string method is applied.
        stop_words: Optional collection of words to drop. When ``None``
            (default) NLTK's English stop-word list is used.

    Returns:
        The remaining tokens joined by single spaces (possibly ``""``).
    """
    if stop_words is None:
        stop_words = _english_stop_words()
    content = str(content).lower().strip()
    # Every non-letter becomes a space; split() then collapses the runs.
    content = re.sub(r"[^a-zA-Z]", " ", content)
    tokens = [word for word in content.split() if word not in stop_words]
    cleaned_text = " ".join(tokens)
    return cleaned_text

# Load the header-less ISEAR corpus: the first column is named "emotion", the
# second "text", and the third column is given an empty name and dropped.
isear_df = pd.read_csv("../corpora/isear.csv", header=None)
isear_df.columns = ["emotion", "text", ""]
isear_df = isear_df.drop(columns=[""])

# Cleaned version of every text, in row order.
cleaned_text = list(map(clean_text, isear_df["text"].tolist()))

In [None]:
# These scikit-learn names are used below but are not imported anywhere else
# in the notebook, so bring them into scope here.
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline

# Bag-of-words -> TF-IDF -> logistic regression trained with SGD (L1 penalty).
# NOTE(review): newer scikit-learn releases spell this loss "log_loss"; "log"
# is kept for the version this notebook was written against - confirm.
count_vectorizer = CountVectorizer()
tfidf_transformer = TfidfTransformer()
log_regression = Pipeline([
        ('vect', count_vectorizer),
        ('tfidf', tfidf_transformer),
        ('clf', SGDClassifier(loss="log",
                              penalty='l1',
                              random_state=1
                             ))
    ])

# Fit the whole pipeline (vectorizer, TF-IDF and classifier). Previously only
# the two transformers were fitted on their own, leaving the classifier
# untrained although later cells call predict / predict_proba on it.
# Assumes isear_df["emotion"] holds the label of each text - TODO confirm.
log_regression.fit(cleaned_text, isear_df["emotion"])

# Kept for any cell that still inspects the raw count matrix.
training_counts = log_regression.named_steps["vect"].transform(cleaned_text)
bag_of_words = training_counts

In [None]:
def get_class_probs(classifier, data):
    """Return the class-probability spread of ``classifier`` for one sample.

    Args:
        classifier: Fitted estimator exposing ``classes_`` and ``predict``,
            and optionally ``predict_proba``.
        data: A single sample; it is wrapped in a one-element list before
            being handed to the classifier.

    Returns:
        A one-row DataFrame with one column per class, ordered from most to
        least probable, followed by a ``"predicted"`` column holding
        ``classifier.predict``'s label. If the classifier has no
        ``predict_proba`` the probability cells are ``None``.
    """
    if hasattr(classifier, "predict_proba"):
        prob_spread = classifier.predict_proba([data]).tolist()[0]
    else:
        prob_spread = None
    probabilities = pd.DataFrame({"class": classifier.classes_, "probability": prob_spread})
    probabilities = probabilities.sort_values(by="probability", ascending=False)
    # One row, one column per class.
    probabilities = probabilities.set_index("class").T.reset_index(drop=True)
    # Use the classifier that was passed in (this used to read the global
    # `log_regression`, silently ignoring the argument).
    probabilities["predicted"] = classifier.predict([data])
    return probabilities

In [None]:
# Parse the transcript by hand: every line is "<Speaker>: <statement>".
# (pd.read_csv has no `columns=` keyword, and a ": " separator would also
# split statements that themselves contain ": ".)
scene_records = []
with open("output/conversation_200.txt") as scene_file:
    for line in scene_file:
        speaker, separator, statement = line.strip().partition(":")
        if not separator:
            continue  # not a "speaker: statement" line
        # Speakers are written capitalised; the lookup below uses lowercase.
        scene_records.append({"character": speaker.strip().lower(), "statement": statement.strip()})
scene_df = pd.DataFrame(scene_records, columns=["character", "statement"])

# Mean predicted emotion probabilities per character, highest first.
generated_character_profiles = {}
for character in ["harry", "ron", "hermione", "snape", "albus dumbledore", "tom riddle", "hagrid", "harry_as_voldemort"]:
    character_statements = scene_df.loc[scene_df.character == character, "statement"]
    if character_statements.empty:
        continue  # pd.concat raises on an empty list
    emotion_breakdown = pd.concat(
        [get_class_probs(log_regression, statement) for statement in character_statements],
        sort=False,
    )
    # numeric_only skips the string "predicted" column.
    character_profile = (
        emotion_breakdown.mean(numeric_only=True)
        .to_frame(name="percent")
        .sort_values(by="percent", ascending=False)
    )
    character_profile.columns.name = character
    generated_character_profiles[character] = character_profile

# NOTE(review): display_dataframes_inline is not defined in the visible part
# of this notebook - confirm where it comes from.
display_dataframes_inline(*generated_character_profiles.values())

In [None]:
# Load the hand-written emotional profile of every character, keyed by the
# CSV file name without its extension.
expected_profiles = {}
profile_dir = "../corpora/profiles"
for profile in sorted(os.listdir(profile_dir)):
    profile_name, extension = os.path.splitext(profile)
    if extension.lower() != ".csv":
        continue  # ignore anything that is not a profile CSV
    expected_profiles[profile_name] = pd.read_csv(os.path.join(profile_dir, profile))

# Show the profiles loaded above (this used to reference `character_profiles`,
# which this notebook never defines).
# NOTE(review): display_dataframes_inline is not defined in the visible part
# of this notebook - confirm where it comes from.
display_dataframes_inline(*expected_profiles.values())