# Imports

In [85]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Lexicon Helper Functions

In [86]:
import itertools
import re
import os,glob

"""
file: lexical_features.py
---
Defines features that involve bag-of-words counts from a lexicon.
"""

'''
function: get_lexicon_list_from_txt

Takes in a .txt file, in which each line is a lexicon term, and reads it into a list.

@param txt_file: name of the text file
'''
def get_lexicon_list_from_txt(txt_file):
    """Read a lexicon file into a list of anchored regex strings.

    Each line of the file is one lexicon term. Trailing whitespace is
    stripped and the term is rewritten so it can be used as a regex that
    must match a whole word:

    - ``**`` is collapsed to ``*`` (a doubled quantifier is a regex error);
    - every ``*`` wildcard becomes ``.*``;
    - backslashes are removed;
    - the result is wrapped in ``^`` ... ``$``.

    Lines that are empty after this rewriting are skipped. Otherwise they
    would produce the pattern ``^$``, which matches empty tokens and
    inflates match counts.

    @param txt_file: name of the text file
    @return: list of regex strings, one per non-blank lexicon term, in
    file order
    """
    patterns = []
    with open(txt_file) as lexicon:
        for line in lexicon:
            # Plain string replacement, applied in the same order as the
            # regex substitutions it replaces, avoids the invalid escape
            # sequences ("\*") those non-raw regex literals relied on.
            term = (
                line.rstrip()
                .replace("**", "*")
                .replace("*", ".*")
                .replace("\\", "")
            )
            if not term:
                continue
            patterns.append("^" + term + "$")
    return patterns

'''
function: get_lexical_value_from_text

Takes in a lexicon list, and returns the number of matches within a given message or string.

@param text: the message/text that we are searching for lexicon words in.
@param lexicon_list: output of `get_lexicon_list_from_txt`; a list of regexes or words that
we are searching for inside the text.
'''
def get_lexical_value_from_text(text, lexicon_list):
    """Count lexicon matches in a piece of text.

    The text is reduced to lowercase ASCII letters and spaces, split into
    words, and every word is tested against every regex in the lexicon.
    The return value is the total number of matches over all
    (word, regex) pairs.

    @param text: the message/text that we are searching for lexicon words in.
    @param lexicon_list: a list of regexes (or plain words) to search for,
    e.g. the output of `get_lexicon_list_from_txt`.
    @return: total number of matches (int); 0 when the text contains no
    words or the lexicon is empty.
    """
    # preprocess to remove special characters
    # TODO -- remove any feature-level preprocessing, as we are combining them into preprocess.py
    cleaned = re.sub('[^a-zA-Z ]+', '', text).lower()

    # split() rather than split(' '): consecutive spaces or empty text
    # would otherwise yield empty "words" that patterns such as "^$" or
    # "^.*$" count as matches.
    words = cleaned.split()

    total = 0
    for regex in lexicon_list:
        # Compile once per pattern instead of resolving it for every word.
        pattern = re.compile(regex)
        for word in words:
            total += len(pattern.findall(word))
    return total

"""
LIWC Features

Create features drawn from the LIWC lexicons.

@ param text: the text being evaluated.
@ return value: a dictionary, in which each key is the name of the feature, and each value
is the lexical value (count) within the text.
"""
def liwc_features(text, directory='../../features/lexicons/liwc_lexicons/'):
    """Compute one lexical count per lexicon file in a directory.

    Every regular, non-hidden file in `directory` is read as a lexicon
    with `get_lexicon_list_from_txt`, and the text is scored against it
    with `get_lexical_value_from_text`.

    @param text: the text being evaluated.
    @param directory: folder containing one lexicon file per feature.
    Defaults to the LIWC lexicon folder, so existing one-argument calls
    are unaffected.
    @return: a dictionary, in which each key is the lexicon file name and
    each value is the lexical value (count) within the text. Keys are
    inserted in sorted file-name order so the result does not depend on
    the order in which the operating system lists the directory.
    """
    lexical_feature_dictionary = {}

    for filename in sorted(os.listdir(directory)):
        filepath = os.path.join(directory, filename)

        # Skip sub-directories and hidden files (e.g. ".DS_Store"); they
        # are not lexicons and opening them as text would fail.
        if filename.startswith('.') or not os.path.isfile(filepath):
            continue

        lexicon_list = get_lexicon_list_from_txt(filepath)
        lexical_feature_dictionary[filename] = get_lexical_value_from_text(text, lexicon_list)

    return lexical_feature_dictionary


# Dataset

In [87]:
# Load the small jury-conversation sample (named "tiny_for_testing" in its path),
# print its (rows, columns) shape, and preview the first rows.
data = pd.read_csv("../../data/raw_data/juries_tiny_for_testing.csv")
print(data.shape)
data.head()

(97, 10)


Unnamed: 0,batch_num,round_num,speaker_hash,speaker_nickname,timestamp,message,majority_pct,num_flipped,flipped_pct,num_votes
0,0,0,5e7e1e0031f4e454e196c30b,niceRhino,2020-04-20T18:27:20.125Z,Hello!,1.0,1,0.333333,3
1,0,0,5e31d6e4e31c5304c46f1413,culturedCow,2020-04-20T18:27:23.764Z,Hi!,1.0,1,0.333333,3
2,0,0,5e7e4f4c31f4e454e196c9c4,spryBison,2020-04-20T18:27:27.724Z,Hello,1.0,1,0.333333,3
3,0,0,5d482ea421c9be351f762255,youngLion,2020-04-20T18:27:30.410Z,Hi,1.0,1,0.333333,3
4,0,0,5e84cc3c50f6e364321d6265,smallGiraffe,2020-04-20T18:27:35.506Z,hi,1.0,1,0.333333,3


In [88]:
# Load the full conversation dataset, then count non-null values per
# (batch_num, round_num) group. Only the second counted column is kept
# (selected by position with iloc), and the group keys are restored as columns.
big_data = pd.read_csv("../../data/raw_data/jury_conversations_with_outcome_var.csv")
big_data.groupby(by=["batch_num", "round_num"]).count().iloc[:, 1].reset_index()

Unnamed: 0,batch_num,round_num,speaker_nickname
0,0,0,52
1,0,2,45
2,1,0,44
3,1,3,42
4,2,2,48
...,...,...,...
343,173,2,28
344,175,1,96
345,175,3,102
346,177,2,54


In [89]:
# Average number of space-separated tokens per message in the sample data.
len_messages = [len(tokens) for tokens in data["message"].str.split(" ")]
sum(len_messages) / len(len_messages)

15.742268041237113

In [90]:
# Number of entries in the LIWC lexicon directory.
len(os.listdir("../../features/lexicons/liwc_lexicons/"))

51

In [91]:
# Score one sample sentence against every LIWC lexicon, keyed by lexicon file name.
liwc_dir = "../../features/lexicons/liwc_lexicons/"
lexical_feature_dictionary = {}
for filename in os.listdir(liwc_dir):
    filepath = os.path.join(liwc_dir, filename)
    lexicon_list = get_lexicon_list_from_txt(filepath)
    lexical_value = get_lexical_value_from_text('Hello! How are you?', lexicon_list)
    lexical_feature_dictionary.update({filename: lexical_value})

In [92]:
# Average number of terms per LIWC lexicon file.
liwc_dir = "../../features/lexicons/liwc_lexicons/"
len_lex = []
for filename in os.listdir(liwc_dir):
    filepath = os.path.join(liwc_dir, filename)
    lexicon_list = get_lexicon_list_from_txt(filepath)
    len_lex += [len(lexicon_list)]
sum(len_lex) / len(len_lex)

149.64705882352942

In [93]:
# Spot-check a single lexicon: number of matches for the sample sentence in the "social" lexicon.
get_lexical_value_from_text('Hello! How are you?', get_lexicon_list_from_txt("../../features/lexicons/liwc_lexicons/"+"social"))

2

In [94]:
# Build a sample sentence and apply the same cleaning used inside
# get_lexical_value_from_text: drop everything except ASCII letters and
# spaces, then lowercase. The commented-out line is an alternative sample.
#text = "Hello! How are you?"
text = "I believe that I am thinking about me, ourselves, and us"
text = re.sub('[^a-zA-Z ]+', '', text).lower()
text

'i believe that i am thinking about me ourselves and us'

In [95]:
# Load the first-person lexicon as the working pattern list
# (the LIWC "social" lexicon is kept as a commented-out alternative).
#lexicon_list = get_lexicon_list_from_txt("../../features/lexicons/liwc_lexicons/"+"social")
lexicon_list = get_lexicon_list_from_txt("../../features/lexicons/other_lexicons/"+"first_person.txt")

In [96]:
# Print every (pattern, word) pair that matches, together with the matched text.
for word, regex in itertools.product(text.split(" "), lexicon_list):
    hits = re.findall(regex, word)
    if hits:
        print(regex, word, hits)

^i$ i ['i']
^i$ i ['i']
^me$ me ['me']
^ourselves$ ourselves ['ourselves']


In [97]:
# Join all lexicon patterns into one alternation so a word can be tested with a single search.
master_regex = "|".join(lexicon_list)

In [98]:
master_regex

'^i$|^me$|^my$|^myself$|^mine$|^we$|^our$|^ours$|^ourselves$|^lets$'

In [99]:
# Count the words that match at least one alternative of the combined pattern.
value = sum(1 for word in text.split(" ") if re.search(master_regex, word))
value

4

In [100]:
# Strip every "^" and "$" from the combined pattern, leaving a plain,
# unanchored alternation of the lexicon terms.
master_regex_new = master_regex.replace("^", "").replace("$", "")

In [101]:
master_regex_new

'i|me|my|myself|mine|we|our|ours|ourselves|lets'

In [102]:
# Count matches of the unanchored pattern over the whole text. Without
# word anchors this also counts substrings inside longer words (e.g. the
# "i" in "thinking"), so it is not equivalent to per-word matching.
len(re.findall(master_regex_new, text))

7

In [103]:
lexical_feature_dictionary

{'discrepancies': 0,
 'hear': 0,
 'home': 0,
 'conjunction': 1,
 'certainty': 0,
 'inclusive': 0,
 'bio': 0,
 'achievement': 0,
 'adverbs': 1,
 'anxiety': 0,
 'third_person': 0,
 'negation': 0,
 'swear': 0,
 'death': 0,
 'health': 0,
 'see': 0,
 'body': 0,
 'family': 0,
 'negative_affect': 0,
 'quantifier': 0,
 'positive_affect': 0,
 'insight': 0,
 'humans': 0,
 'present_tense': 1,
 'future_tense': 0,
 'past_tense': 0,
 'relative': 0,
 'sexual': 0,
 'inhibition': 0,
 'sadness': 0,
 'social': 2,
 'indefinite_pronoun': 0,
 'religion': 0,
 'work': 0,
 'money': 0,
 'causation': 1,
 'anger': 0,
 'first_person_singular': 0,
 'feel': 0,
 'tentativeness': 0,
 'exclusive': 0,
 'verbs': 1,
 'friends': 0,
 'article': 0,
 'argue': 0,
 'auxiliary_verbs': 1,
 'cognitive_mech': 1,
 'preposition': 0,
 'first_person_plural': 0,
 'percept': 0,
 'second_person': 1}

In [104]:
# Look up the first row's message via the positional index of the "message" column.
message_col = list(data.columns).index("message")
data.iloc[0, message_col]

'Hello! '

In [105]:
# Test liwc_features directly
