### 1.
 We want to explore the general trend of the conversation data. For this purpose, create a dataframe for each speaker by gathering all his utterances in a single file.
Then write a script that determines the vocabulary set, vocabulary size, total number of tokens, total number of repetitions of words in the same post, total number of confirmation words (e.g., yes, OK, sure), total number of negation tokens, associated to each speaker. Summarize the result in a table, and then draw a subgraph that shows on the same plot the evolution of number of repetitions, number of negation, number of confirmation-like tokens, with respect to the number of tokens employed for each speaker (You may create some subdivision from the total number of tokens to ensure enough datum are used to represent the graphical illustration). Calculate the overall personality for each speaker by averaging over all instances of the original dataset, and comment on possible similarities and differences between speakers and whether some attributes are more associated with some personality patterns.

In [64]:
# Download dataset from GitHub repository
import pandas as pd
url = "https://raw.githubusercontent.com/preke/PELD/main/data/Dyadic_PELD.tsv"
df = pd.read_csv(url, sep='\t')

In [65]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6510 entries, 0 to 6509
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Speaker_1    6510 non-null   object
 1   Speaker_2    6510 non-null   object
 2   Personality  6510 non-null   object
 3   Utterance_1  6510 non-null   object
 4   Utterance_2  6510 non-null   object
 5   Utterance_3  6510 non-null   object
 6   Emotion_1    6510 non-null   object
 7   Emotion_2    6510 non-null   object
 8   Emotion_3    6510 non-null   object
 9   Sentiment_1  6510 non-null   object
 10  Sentiment_2  6510 non-null   object
 11  Sentiment_3  6510 non-null   object
dtypes: object(12)
memory usage: 610.4+ KB
None


In [66]:
# save each speaker with their utterances in separate files
distinct_speakers1 = df['Speaker_1'].unique()
print('speaker 1: ', distinct_speakers1)
for s in distinct_speakers1:
    lines = (pd.concat([df.loc[df['Speaker_1']==s, 'Utterance_1'],
                        df.loc[df['Speaker_1']==s, 'Utterance_3'],
                        df.loc[df['Speaker_2']==s, 'Utterance_2']])
               .dropna()
               .rename('utterance'))
    lines.to_csv(f'{s}_utterances.csv', index=False)

speaker 1:  ['Chandler' 'Joey' 'Rachel' 'Monica' 'Phoebe' 'Ross']


In [67]:
# load each speaker's utterances with a dataframe from the saved files
df_Chandler = pd.read_csv('Chandler_utterances.csv')
df_Monica = pd.read_csv('Monica_utterances.csv')
df_Ross = pd.read_csv('Ross_utterances.csv')
df_Rachel = pd.read_csv('Rachel_utterances.csv')
df_Joey = pd.read_csv('Joey_utterances.csv')
df_Phoebe = pd.read_csv('Phoebe_utterances.csv')

In [68]:
# generate confirmation words set from seed words
from nltk.corpus import wordnet as wn
seed = {'yes', 'yeah', 'yep', 'sure', 'okay', 'ok', 'indeed', 'exactly', 'right',
        'correct', 'absolutely', 'definitely', 'certainly', 'affirmative'}
confirmation_words = set(seed)
for word in list(seed):
    for syn in wn.synsets(word):
        for lemma in syn.lemmas():
            confirmation_words.add(lemma.name().replace('_', ' '))

print(sorted(confirmation_words))
confirmation_words = list(confirmation_words)

['O.K.', 'OK', 'Oklahoma', 'Sooner State', 'absolutely', 'adjust', 'affirmative', 'affirmatory', 'all right', 'alright', 'approbative', 'approbatory', 'approve', 'approving', 'aright', 'by all odds', 'castigate', 'certain', 'certainly', 'chasten', 'chastise', 'compensate', 'correct', 'correctly', 'counterbalance', 'dead', 'decent', 'decently', 'decidedly', 'decline', 'definitely', 'discipline', 'emphatically', 'even off', 'even out', 'even up', 'exactly', 'fine', 'flop', 'for certain', 'for sure', 'good', 'hunky-dory', 'in good order', 'in spades', 'incisively', 'indeed', 'indisputable', 'just', 'justly', 'make up', 'mightily', 'mighty', 'o.k.', 'objurgate', 'ok', 'okay', 'okeh', 'okey', 'on the button', 'on the dot', 'on the nose', 'optimistic', 'perfectly', 'plausive', 'powerful', 'precisely', 'proper', 'properly', 'rectify', 'redress', 'right', 'right field', 'right hand', 'right on', 'right wing', 'right-hand', 'rightfield', 'rightfulness', 'ripe', 'sanction', 'set', 'slump', 'so',

In [74]:
#  negation word list
negation_words = {
    "no", "not", "n't", "never", "none", "nothing", "nobody", "nowhere",
    "neither", "nor", "barely", "hardly", "scarcely", "seldom", "little",
    "few", "without", "lack", "lacking", "cannot", "can't", "won't",
    "wouldn't", "shouldn't", "couldn't", "didn't", "doesn't", "don't",
    "isn't", "aren't", "wasn't", "weren't", "haven't", "hasn't", "hadn't"
}

In [76]:
# get the vocabulary set and size associated to each speaker.
from lib2to3.pgen2 import token
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r"[A-Za-z']+")

def build_speaker_vocab_info(df, speaker_name):
    utterancesArr  = df['utterance'].str.lower().tolist()
    totalWordRep = 0
    totalVocab = []
    totalTokens = []
    negation_tokens_count = 0
    confirmation_words_in_utterances = []
    for sentence in utterancesArr:
        tokens = tokenizer.tokenize(sentence)
        for token in tokens:
            if token in confirmation_words:                
                confirmation_words_in_utterances.append(token)
            if token in negation_words:
                negation_tokens_count += 1
        vocab = sorted(set(tokens))
        totalVocab.extend(vocab)
        totalTokens.extend(tokens)
        dup = len(tokens) - len(vocab)
        totalWordRep += dup
    return {'tokens': totalTokens, 'vocab': totalVocab, 
            'totalWordRep': totalWordRep, 
            'confirmation_words':confirmation_words_in_utterances,
            'negation_tokens_count': negation_tokens_count}

utterance_features = {}
for speaker_name in distinct_speakers1:
    current_speaker_utterance_df = globals()[f'df_{speaker_name}']
    utterance_features[speaker_name] = build_speaker_vocab_info(current_speaker_utterance_df, speaker_name)
    print(f"""{speaker_name}:
          {len(utterance_features[speaker_name]["tokens"]):,} tokens,
          {len(utterance_features[speaker_name]["vocab"]):,} vocabulary,
          {utterance_features[speaker_name]["totalWordRep"]:,} total repetitions,
          {len(utterance_features[speaker_name]["confirmation_words"]):,} confirmation words,
          {utterance_features[speaker_name]["negation_tokens_count"]:,} negation words.""")

Chandler:
          25,624 tokens,
          23,366 vocabulary,
          2,258 total repetitions,
          1,318 confirmation words,
          853 negation words.
Joey:
          26,148 tokens,
          24,023 vocabulary,
          2,125 total repetitions,
          1,278 confirmation words,
          862 negation words.
Rachel:
          27,155 tokens,
          24,625 vocabulary,
          2,530 total repetitions,
          1,551 confirmation words,
          900 negation words.
Monica:
          23,215 tokens,
          21,479 vocabulary,
          1,736 total repetitions,
          960 confirmation words,
          682 negation words.
Phoebe:
          22,077 tokens,
          20,172 vocabulary,
          1,905 total repetitions,
          1,372 confirmation words,
          751 negation words.
Ross:
          25,296 tokens,
          22,579 vocabulary,
          2,717 total repetitions,
          1,338 confirmation words,
          854 negation words.
