In [4]:
from team_comm_tools.feature_dict import feature_dict
import pytest
import pandas as pd
import numpy as np
from numpy import nan
import logging
import itertools

test_chat_df = pd.read_csv("./output/chat/test_chat_level_chat.csv")
test_conv_df = pd.read_csv("./output/conv/test_conv_level_conv.csv")
test_chat_complex_df = pd.read_csv(
    "./output/chat/test_chat_level_chat_complex.csv")
test_conv_complex_df = pd.read_csv(
    "./output/conv/test_conv_level_conv_complex.csv")
test_conv_complex_df_ts = pd.read_csv(
    "./output/conv/test_conv_level_conv_complex_ts.csv")
test_forward_flow_df = pd.read_csv("./output/chat/test_forward_flow_chat.csv")

# Import the Feature Dictionary

chat_features = [feature_dict[feature]["columns"]
                 for feature in feature_dict.keys() if feature_dict[feature]["level"] == "Chat"]
conversation_features = [feature_dict[feature]["columns"] for feature in feature_dict.keys(
) if feature_dict[feature]["level"] == "Conversation"]

num_features_chat = len(list(itertools.chain(*chat_features)))
num_features_conv = len(list(itertools.chain(*conversation_features))) - 3


num_tested_chat = test_chat_df['expected_column'].nunique() + test_chat_complex_df['feature'].nunique() + test_forward_flow_df['feature'].nunique()
test_chat = pd.concat([test_chat_df['expected_column'], test_chat_complex_df['feature'], test_forward_flow_df['feature']])
test_chat = test_chat.drop_duplicates()

In [10]:
hi = pd.concat([test_chat_df['expected_column'].drop_duplicates(), test_chat_complex_df['feature'].drop_duplicates(), test_forward_flow_df['feature'].drop_duplicates()])
hi.to_csv("hi.csv")

In [1]:
import pandas as pd
test_conv_complex_df = pd.read_csv("./output/conv/test_conv_level_conv_complex.csv")
test_conv_complex_df[['conversation_num', "gini_coefficient_sum_num_messages"]]

  from pandas.core import (


Unnamed: 0,conversation_num,gini_coefficient_sum_num_messages
0,A,0.041667
1,B,0.0
2,C,0.297101
3,D,0.066667
4,E,0.0
5,F,0.0
6,G,0.0
7,H,0.0
8,I,0.0
9,J,0.0


In [94]:
def get_function_words_in_message(text, function_word_reference):
    """
    Extract the function words & non-functions words from a message

    Args:
        text (str): The input text to be analyzed.
        function_word_reference (list): A list of function words to reference against.

    Returns:
        list: A list of function words found in the input text.
    """
    if (pd.isna(text)):
        return []
    return [x for x in text.split() if x in function_word_reference]


def get_content_words_in_message(text, function_word_reference):
    """
    Extract the non-function words in a given message.

    Args:
        text (str): The input text to be analyzed.
        function_word_reference (list): A list of function words to reference against.

    Returns:
        list: A list of content words found in the input text.
    """
    if (pd.isna(text)):
        return []
    return [x for x in text.split() if x not in function_word_reference]


def mimic_words(df, on_column, conversation_id):
    """
    Return a list of words that are also used in the other's previous turn.

    Args:
        df (DataFrame): The dataset that removed all punctuations.
        on_column (str): The column that we want to find mimicry on.
        conversation_id (str): The column name that should be selected as the conversation ID.

    Returns:
        list: A list of lists, where each sublist contains words mimicked from the previous turn.
    """
    word_mimic = [[]]
    for i in range(1, len(df)):
        # only do this if they're in the same conversation
        if df.loc[i, conversation_id] == df.loc[i-1, conversation_id]:
            word_mimic.append([x for x in df.loc[i, on_column]
                              if x in df.loc[(i-1), on_column]])
        else:
            word_mimic.append([])
    return word_mimic


def function_mimicry_score(function_mimic_words):
    """
    Compute the number of mimic words for function words by simply counting the number of mimic words using len().

    Args:
        function_mimic_words (list): Each entry under the `function_word_mimicry` column.

    Returns:
        int: The number of function mimic words.
    """
    return len(function_mimic_words)


def compute_frequency(df, on_column):
    """
    Compute the frequency of each content word across the whole dataset.

    Args:
        df (DataFrame): The input dataframe.
        on_column (str): The column with which we calculate content word frequency.

    Returns:
        dict: A dictionary with content words as keys and their frequencies as values.
    """
    df_temp = df.copy()
    df_temp.reset_index(drop=True, inplace=True)
    return (dict(pd.Series(np.concatenate(df_temp[on_column])).value_counts()))


def computeTF(column_mimc, frequency_dict):
    """
    Compute the term frequency of each content mimic word, then sum them up.

    Args:
        column_mimc (list): Each entry under the `content_word_mimicry` column.
        frequency_dict (dict): A dictionary of content word frequency across the dataset.

    Returns:
        float: The sum of term frequencies for the content mimic words.
    """
    tfdict = {}
    wf = pd.Series(column_mimc, dtype='str').value_counts()
    for i in wf.index:
        tfdict[i] = wf[i]/frequency_dict[i]
    return sum(tfdict.values())


def Content_mimicry_score(df, column_count_frequency, column_count_mimic):
    """
    Combine the steps to compute the content word mimicry score.

    Args:
        df (DataFrame): The input dataframe.
        column_count_frequency (str): The column with content words to calculate frequency.
        column_count_mimic (str): The column with content word mimicry.

    Returns:
        Series: A series with content word accommodation scores.

    """
    # Compute the frequency of each content word across the whole dataset
    ContWordFreq = compute_frequency(df, column_count_frequency)
    # Compute the content_mimicry_score
    return df[column_count_mimic].apply(lambda x: computeTF(x, ContWordFreq)) 
    # content_mimic_scores = []
    # for conv in df['conversation_num'].unique():
    #     df_conv = df[df['conversation_num'] == conv]      
    #     ContWordFreq = compute_frequency(df_conv, column_count_frequency)
    #     content_mimic_scores.append(df_conv[column_count_mimic].apply(lambda x: computeTF(x, ContWordFreq)).tolist())
    # return [item for sublist in content_mimic_scores for item in sublist]

In [1]:
import os
import numpy as np

def get_function_words():
    """
    Returns the list of function words according to Ranganath, Jurafsky, and McFarland (2013).

    Reference: https://web.stanford.edu/~jurafsky/pubs/ranganath2013.pdf

    :return: A list of function words.
    :rtype: list
    """
    # current_dir = os.path.dirname(__file__)
    function_word_file_path = "../src/team_comm_tools/features/lexicons/function_words.txt"
    # function_word_file_path = os.path.abspath(function_word_file_path)

    with open(function_word_file_path, 'r') as file:
        function_word_list = [line.strip() for line in file]
    return function_word_list

func = get_function_words()

In [2]:
import pandas as pd
from word_mimicry import *

test_chat_df = pd.read_csv("./output/chat/test_chat_level_chat.csv")
df = test_chat_df[['conversation_num', "message"]].reset_index(drop=True)
df
# df[-4:]

df["function_words"] = df["message"].apply(
    lambda x: get_function_words_in_message(x, function_word_reference=func))
df["content_words"] = df["message"].apply(
    lambda x: get_content_words_in_message(x, function_word_reference=func))

# Extract the function words / content words that also appears in the immediate previous turn
df["function_word_mimicry"] = mimic_words(
    df, "function_words", "conversation_num")
df["content_word_mimicry"] = mimic_words(
    df, "content_words", "conversation_num")

# Compute the number of function words that also appears in the immediate previous turn
df["function_word_accommodation"] = df["function_word_mimicry"].apply(
    function_mimicry_score)

# Compute the sum of inverse frequency of each content word that also occurred in the other’s immediately prior turn.
df["content_word_accommodation"] = Content_mimicry_score(
    df, "content_words", "content_word_mimicry")

# Drop the function / content word columns -- we don't need them in the output
df = df.drop(columns=['function_words', 'content_words', 'function_word_mimicry', 'content_word_mimicry'])
df


  from pandas.core import (


{'hello': 1, 'i': 1, 'fish': 1, 'sentence': 1, 'has': 1, 'five': 1, 'words': 1}
{'hello': 1, 'i': 1, 'fish': 1, 'sentence': 1, 'has': 1, 'five': 1, 'words': 1}
{'hello': 1, '4': 1, 'word': 1}
{'hello': 1, '4': 1, 'word': 1}
{}
{'point': 14, 'multiple': 12, 'i': 11, '4': 11, 'super': 11, 'long': 11, 'takes': 11, 'lines': 11, 'quotes': 4, 'said': 4, 'quote': 4, 'line': 3, 'third': 3, 'here': 3, 'parentheses': 3, 'item': 3, 'respond': 2, 'linenthis': 2, 'linehere': 2, 'reply': 2, 'elsenthis': 2, 'quoting': 2, 'return': 2, 'character': 2, '1': 2, 'single': 2, '3': 2, 'many': 2, 'hello': 2, 'use': 2, 'httpswwwexamplenet': 2, 'sure': 2, 'two': 2, 'read': 2, 'httpswwwexamplecom': 2, 'httpswwwexamplecouk': 2, 'httpswwwexampleorg': 2, 'httpswwwexampleca': 2, 'way': 1, 'definitely': 1, 'teacher': 1, 'best': 1, 'english': 1, 'according': 1, 'other': 1, 'inside': 1, 'confidently': 1, 'moments': 1, 'properly': 1, 'closed': 1, 'nest': 1, 'useful': 1, 'sometimes': 1, 'replied': 1, 'people': 1, 'there

Unnamed: 0,conversation_num,message,function_word_accommodation,content_word_accommodation
0,1,hello i like fish,0,0.000000
1,1,this sentence has five words,0,0.000000
2,2,hello,0,0.000000
3,2,is 4 a word,0,0.000000
4,3,,0,0.000000
...,...,...,...,...
675,A,get,1,0.000000
676,B,able am are,0,0.000000
677,B,able am are hello,3,0.000000
678,B,able am hello yoyo,2,0.333333


In [3]:
test_chat_df = pd.read_csv("./output/chat/test_chat_level_chat.csv")
test_chat_df['content_word_accommodation']

0      0.0
1      0.0
2      0.0
3      0.0
4      0.0
      ... 
675    0.0
676    0.0
677    0.0
678    0.1
679    0.6
Name: content_word_accommodation, Length: 680, dtype: float64

In [79]:
import pandas as pd
test_chat_df = pd.read_csv("./data/cleaned_data/test_chat_level.csv")
df = test_chat_df[['conversation_num', "message"]].reset_index(drop=True)
df

df["function_words"] = df["message"].apply(
    lambda x: get_function_words_in_message(x, function_word_reference=func))
df["content_words"] = df["message"].apply(
    lambda x: get_content_words_in_message(x, function_word_reference=func))

# Extract the function words / content words that also appears in the immediate previous turn
df["function_word_mimicry"] = mimic_words(
    df, "function_words", "conversation_num")
df["content_word_mimicry"] = mimic_words(
    df, "content_words", "conversation_num")

# Compute the number of function words that also appears in the immediate previous turn
df["function_word_accommodation"] = df["function_word_mimicry"].apply(
    function_mimicry_score)

l = []

for conv in df["conversation_num"].unique():
    # print(conv)
    df_conv = df[df["conversation_num"] == conv]
    # Compute the sum of inverse frequency of each content word that also occurred in the other’s immediately prior turn.
    df_conv["content_word_accommodation"] = Content_mimicry_score(
        df_conv, "content_words", "content_word_mimicry")
    l.append(df_conv['content_word_accommodation'].tolist())
    
# flatten the list
l = [item for sublist in l for item in sublist]
l
df['content_word_accommodation'] = l
df


  conversation_num                        message
0                1             Hello I like fish.
1                1  This sentence has five words.
{'Hello': 1, 'fish.': 1, 'This': 1, 'sentence': 1, 'has': 1, 'five': 1, 'words.': 1}
0    0
1    0
Name: content_word_accommodation, dtype: int64
  conversation_num       message
2                2       Hello??
3                2  Is 4 a word?
{'Hello??': 1, 'Is': 1, '4': 1, 'word?': 1}
0    0
1    0
Name: content_word_accommodation, dtype: int64
  conversation_num message
4                3       .
{'.': 1}
0    0
Name: content_word_accommodation, dtype: int64
   conversation_num                                            message
5                 4   HELLO WORLD, THIS IS A TEST. hi HI. hi HI hi HI"
6                 4    ONE TWO THREE. four five six. sEvEn EiGhT nInE.
7                 4  Check out this [link](https://example.com) and...
8                 4  I like google.com and wikipedia.org but not am...
9                 4  why don

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_conv["content_word_accommodation"] = Content_mimicry_score(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_conv["content_word_accommodation"] = Content_mimicry_score(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_conv["content_word_accommodation"] = Content_mimicry_score(
A value is tryin

Unnamed: 0,conversation_num,message,function_words,content_words,function_word_mimicry,content_word_mimicry,function_word_accommodation,content_word_accommodation
0,B,able am are,"[able, am, are]",[],[],[],0,0.000000
1,B,able am are hello,"[able, am, are]",[hello],"[able, am, are]",[],3,0.000000
2,B,able am hello yoyo,"[able, am]","[hello, yoyo]","[able, am]",[hello],2,0.000000
3,B,hello yoyo,[],"[hello, yoyo]",[],"[hello, yoyo]",0,0.000000
4,A,get,[get],[],[get],[],1,0.000000
...,...,...,...,...,...,...,...,...
675,A,get,[get],[],[get],[],1,0.000000
676,B,able am are,"[able, am, are]",[],[],[],0,0.000000
677,B,able am are hello,"[able, am, are]",[hello],"[able, am, are]",[],3,0.000000
678,B,able am hello yoyo,"[able, am]","[hello, yoyo]","[able, am]",[hello],2,0.333333


In [2]:
import pandas as pd
test_chat_df = pd.read_csv("./output/chat/test_chat_level_chat.csv")
test_info_exchange_zscore_df = pd.read_csv("./output/chat/info_exchange_zscore_chats.csv")
pd.concat([test_chat_df, test_info_exchange_zscore_df], axis=0)

Unnamed: 0,conversation_num,speaker_nickname,message,expected_column,expected_value,message_original,message_lower_with_punc,positive_bert,negative_bert,neutral_bert,...,num_numbered_points,num_line_breaks,num_quotes,num_block_quote_responses,num_ellipses,num_parentheses,num_emoji,mimicry_bert,moving_mimicry,forward_flow
0,1,A,hello i like fish,num_words,4.00000,Hello I like fish.,hello i like fish.,0.897070,0.004484,0.098446,...,0,1,0,0,0,0,0,0.000000,0.000000,0.000000e+00
1,1,B,this sentence has five words,num_words,5.00000,This sentence has five words.,this sentence has five words.,0.041938,0.131370,0.826692,...,0,1,0,0,0,0,0,0.113466,0.113466,8.865345e-01
2,2,A,hello,num_words,1.00000,Hello??,hello??,0.138002,0.053870,0.808129,...,0,1,0,0,0,0,0,0.000000,0.000000,0.000000e+00
3,2,B,is 4 a word,num_words,4.00000,Is 4 a word?,is 4 a word?,0.042429,0.116617,0.840954,...,0,1,0,0,0,0,0,0.124232,0.124232,8.757685e-01
4,3,A,,num_words,0.00000,.,.,0.302318,0.120507,0.577175,...,0,1,0,0,0,0,0,0.000000,0.000000,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,D,2,bought some groceries for dinner,info_exchange_zscore_conversation,1.73205,Bought some groceries for dinner,bought some groceries for dinner,0.528325,0.010219,0.461456,...,0,1,0,0,0,0,0,0.378725,0.378725,6.212753e-01
2,D,3,its raining today,info_exchange_zscore_chats,-1.73205,It's raining today,it's raining today,0.285214,0.111169,0.603617,...,0,1,0,0,0,0,0,0.179277,0.279001,8.309010e-01
3,E,1,i went to the store,info_exchange_zscore_chats,0.00000,I went to the store,i went to the store,0.266087,0.029111,0.704801,...,0,1,0,0,0,0,0,0.000000,0.000000,0.000000e+00
4,E,1,i went to the store,info_exchange_zscore_chats,0.00000,I went to the store,i went to the store,0.266087,0.029111,0.704802,...,0,1,0,0,0,0,0,1.000000,1.000000,-2.220446e-16


In [8]:
import pandas as pd
import re
import nltk
import pyphen

# Define the function to calculate the Dale-Chall score
def count_syllables(word):
    """
    Count the number of syllables in a word.
    
    Args:
        word(str): The input word.

    Returns:
        int: The number of syllables in the word.
    """
    dic = pyphen.Pyphen(lang='en')
    pyphen_result = dic.inserted(word)
    return re.findall(r"-", pyphen_result)

count_syllables("difficulty")

['-', '-']

In [9]:
import re

VOWEL_RUNS = re.compile("[aeiouy]+", flags=re.I)
EXCEPTIONS = re.compile(
    # fixes trailing e issues:
    # smite, scared
    "[^aeiou]e[sd]?$|"
    # fixes adverbs:
    # nicely
    + "[^e]ely$",
    flags=re.I
)
ADDITIONAL = re.compile(
    # fixes incorrect subtractions from exceptions:
    # smile, scarred, raises, fated
    "[^aeioulr][lr]e[sd]?$|[csgz]es$|[td]ed$|"
    # fixes miscellaneous issues:
    # flying, piano, video, prism, fire, evaluate
    + ".y[aeiou]|ia(?!n$)|eo|ism$|[^aeiou]ire$|[^gq]ua",
    flags=re.I
)

def count_syllables(word):
    vowel_runs = len(VOWEL_RUNS.findall(word))
    exceptions = len(EXCEPTIONS.findall(word))
    additional = len(ADDITIONAL.findall(word))
    return max(1, vowel_runs - exceptions + additional)

count_syllables("difficulty")

4

In [4]:
import re
text = "even this is magnificent! even this is magnificent!"
num_words = len(text.split())
num_sentences = re.split(r'[.?!]\s*', text)

# remove empty strings
num_sentences = [x for x in num_sentences if x]
len(num_sentences)

2

In [35]:
import re
first_person = ["i", "me", "my", "mine", "we", "us", "our", "ours"]
first_person_regex = "\\b|\\b".join(first_person)
first_person_regex = "\\b" + first_person_regex + "\\b"
print(first_person_regex)
re.findall(first_person_regex, "hiii")


\bi\b|\bme\b|\bmy\b|\bmine\b|\bwe\b|\bus\b|\bour\b|\bours\b


[]