In [20]:
import pandas as pd, numpy as np
import re, os
from nltk.tokenize import WhitespaceTokenizer

Let's read in a sample data file to see what we're dealing with.

In [2]:
df = pd.read_csv("../data/416_TRANSCRIPT.csv", delimiter="\t")

In [3]:
df.head(10)

Unnamed: 0,start_time,stop_time,speaker,value
0,13.4,14.33,Participant,<sync>
1,28.905,50.445,Ellie,IntroV4Confirmation (hi i'm ellie thanks for c...
2,50.72,51.15,Participant,sure
3,51.721,52.181,Ellie,okay_confirm (okay)
4,53.239,55.509,Ellie,how_doingV (so how are you doing today)
5,55.63,56.78,Participant,i'm doing good how are you
6,57.98,59.49,Ellie,great_thanks (i'm great thanks)
7,60.634,61.954,Ellie,where_originally (where are you from originally)
8,62.67,63.32,Participant,chicago
9,64.177,64.767,Ellie,really (really)


As can be seen, this is a transcribed counseling conversation. Let's first join together all strings of consecutive messages.

In [4]:
drop_rows = []
for i in range(1, len(df)):
    # check for consecutive rows with same speaker
    if df.loc[i, "speaker"] == df.loc[i - 1, "speaker"]:
        df.loc[i, "value"] = str(df.loc[i - 1, "value"]) + " " + str(df.loc[i, "value"])
        drop_rows.append(i - 1)
df.drop(drop_rows, inplace=True)
df = df[df.speaker == 'Participant']["value"].apply(lambda x: re.sub("<.*>", "", x))
df = df[df != ""]

In [5]:
df.head(20)

2                                                  sure
5                            i'm doing good how are you
8                                               chicago
10                                                 yeah
12                          about twenty five years ago
14                               yeah it's been a while
16             actually i haven't been back  i like l_a
18                                           i like l_a
20    mm it's a big city so it's kinda similar but t...
23                                 family i was a child
25             it took a little adjustment but not hard
30    um the weather um i can't think of anything ri...
33                                             traffic 
35    not really but i've traveled  traveled a littl...
37    seeing new places different people the way the...
41        or the middle of last year that was very nice
46    um it's a good question i'm not really sure uh...
48    i can't think of anything right off the ba

We've now extracted everything that the user has said. We can apply this process to all of the conversations.

In [6]:
def parse_conversation(df):
    drop_rows = []
    for i in range(1, len(df)):
        # check for consecutive rows with same speaker
        if df.loc[i, "speaker"] == df.loc[i - 1, "speaker"]:
            df.loc[i, "value"] = str(df.loc[i - 1, "value"]) + " " + str(df.loc[i, "value"])
            drop_rows.append(i - 1)
    df = df.drop(drop_rows)
    df = df[df.speaker == 'Participant']["value"].apply(lambda x: re.sub("<.*>", "", x))
    df = df[df != ""]
    return df

In [7]:
convs = []
for file in os.listdir("../data"):
    if file.endswith(".csv"):
        print(file)
        df = pd.read_csv("../data/" + file, delimiter="\t")
        convs.append(parse_conversation(df))

318_TRANSCRIPT.csv
376_TRANSCRIPT.csv
480_TRANSCRIPT.csv
403_TRANSCRIPT.csv
375_TRANSCRIPT.csv
413_TRANSCRIPT.csv
462_TRANSCRIPT.csv
348_TRANSCRIPT.csv
475_TRANSCRIPT.csv
314_TRANSCRIPT.csv
463_TRANSCRIPT.csv
336_TRANSCRIPT.csv
324_TRANSCRIPT.csv
397_TRANSCRIPT.csv
361_TRANSCRIPT.csv
433_TRANSCRIPT.csv
382_TRANSCRIPT.csv
365_TRANSCRIPT.csv
357_TRANSCRIPT.csv
329_TRANSCRIPT.csv
392_TRANSCRIPT.csv
399_TRANSCRIPT.csv
332_TRANSCRIPT.csv
328_TRANSCRIPT.csv
325_TRANSCRIPT.csv
311_TRANSCRIPT.csv
369_TRANSCRIPT.csv
308_TRANSCRIPT.csv
470_TRANSCRIPT.csv
315_TRANSCRIPT.csv
387_TRANSCRIPT.csv
373_TRANSCRIPT.csv
428_TRANSCRIPT.csv
461_TRANSCRIPT.csv
456_TRANSCRIPT.csv
409_TRANSCRIPT.csv
418_TRANSCRIPT.csv
386_TRANSCRIPT.csv
395_TRANSCRIPT.csv
309_TRANSCRIPT.csv
372_TRANSCRIPT.csv
371_TRANSCRIPT.csv
422_TRANSCRIPT.csv
312_TRANSCRIPT.csv
434_TRANSCRIPT.csv
458_TRANSCRIPT.csv
321_TRANSCRIPT.csv
489_TRANSCRIPT.csv
471_TRANSCRIPT.csv
431_TRANSCRIPT.csv
368_TRANSCRIPT.csv
307_TRANSCRIPT.csv
383_TRANSCRI

In [12]:
convs_full = pd.concat(convs).reset_index(drop=True)

In [21]:
tk = WhitespaceTokenizer()
tokenized = []
for conv in convs_full:
    try:
        tokenized.append(["<s>"] + tk.tokenize(conv) + ["</s>"])
    except:
        print(conv)

In [23]:
# Gets all of the ngrams and their counts for the given dataset
def get_ngram_counts(tweets, n):
    counts = {}
    for tweet in tweets:
        for i in range(len(tweet) - n + 1):
            ngram = " ".join(tweet[i:i+n])
            counts[ngram] = counts.get(ngram, 0) + 1
    return counts

In [30]:
counts = get_ngram_counts(tokenized, 4)

In [31]:
ngrams_df = pd.DataFrame(counts.items(), columns=["ngram", "count"])

start_ngrams = ngrams_df[ngrams_df.ngram.str.startswith("<s>")].copy()
start_ngrams["count"] /= start_ngrams["count"].sum()

other_ngrams = ngrams_df[~ngrams_df.ngram.str.startswith("<s>")].copy()

In [32]:
def gen_sentence(start_ngrams, other_ngrams, n, gen_count):
    for i in range(gen_count):
        start = np.random.choice(start_ngrams["ngram"], p=start_ngrams["count"]).split(" ")
        sentence = start

        while sentence[-1] != "</s>":
            curr_choices = other_ngrams[other_ngrams.ngram.str.startswith(" ".join(sentence[-n+1:]) + " ")].copy()
            curr_choices["count"] /= curr_choices["count"].sum()
            curr = np.random.choice(curr_choices["ngram"], p=curr_choices["count"]).split(" ")
            sentence.append(curr[-1])

        print(" ".join(sentence[1:-1]))

In [33]:
gen_sentence(start_ngrams, other_ngrams, 4, 10)

uh i would let me see uh well um i'm trying to get into a uh you might wanna say a a altercation with the rednecks down there because as as you know everything in the world and seeing all the colors in the ocean
philosophy i have my best friend and i drove my car too fast and that that i'm trustworthy
uh business
oh yeah you can always fix what you did in your past otherwise you wouldn't need to get over that so after spending one year in in my life uh my daughter my son my daughter's a great source of pride and um um that's mm that's about it
um in that moment
it's spread out a lot uh not trusting uh suspiciousness of other people i mean probably as somebody who oddly enough somebody who is laid back and chill
i'm doing good
when i'm annoyed
i was i was getting in trouble and i was kind of a difficult decision
uh um probably the one of my weaker points i would say a counselor to some people you know some of them not strong enough and so i found when i got to get together and do things