# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import re

In [2]:
# Hand-picked test messages, one per suspected edge case of the lexicon counting.
# The trailing notes are the author's observations; they have not all been
# reproduced in the outputs recorded in this notebook.
data = {
    'raw_message': [
        'hello',
        'hi', # observation: triggers unexpectedly large counts for lexicons such as 'bio' and 'cognitive mech' - TODO confirm
        'hi!',
        'I am happy today', # observation: first_person is always 0 - TODO confirm against the counts below
        'i am good', # observation: first_person is always 0 - TODO confirm against the counts below
        '()()()()', # all-symbol message; observation: the match count comes back as len(regex) - TODO confirm
        ':-|', # another all-symbol message
        'I think perhaps this is maybe fine', # hedge words
        'who what when where why',
        'I am good happy wonderful great excellent', # many positive words
        'under the sea and above the waves', # prepositions
        'i can see how the family is upset because they feel the mother was disrespected but i can also understand the guy\'s feelings. why should he have to work as interpreter for his mother in law?', # TODO: the counts from the real pipeline differ from the counts produced in this test
        'i was conflicted because i could understand his frustration however i feel he should have maybe discussed strategies with how to approach the mother in law with his wife first.'
    ]
}

def preprocess_text(text):
    """Normalise one message for lexicon matching.

    Every run of characters other than ASCII letters, digits and the space
    character is deleted (not replaced by a space), and the remainder is
    lower-cased.

    Args:
        text: The raw message string.

    Returns:
        The cleaned, lower-cased string; empty if the message contained
        only symbols.
    """
    alphanumeric_only = re.sub(r"[^a-zA-Z0-9 ]+", '', text)
    return alphanumeric_only.lower()

# Build the test frame and place the preprocessed text next to the raw text.
chat_df = pd.DataFrame(data).assign(
    message=lambda frame: frame["raw_message"].apply(preprocess_text)
)

In [3]:
# Compare each raw message with its preprocessed form (all-symbol messages become empty strings).
chat_df

Unnamed: 0,raw_message,message
0,hello,hello
1,hi,hi
2,hi!,hi
3,I am happy today,i am happy today
4,i am good,i am good
5,()()()(),
6,:-|,
7,I think perhaps this is maybe fine,i think perhaps this is maybe fine
8,who what when where why,who what when where why
9,I am good happy wonderful great excellent,i am good happy wonderful great excellent


In [4]:
# Load the lexicon dictionary; later cells iterate it as (lexicon name, regex pattern) pairs.
# The relative path is resolved against the notebook's working directory.
# NOTE: pickle.load can execute arbitrary code while unpickling - only load this file from a trusted source.
with open("../../features/lexicons_dict.pkl", "rb") as lexicons_pickle_file:
    lexicons_dict = pickle.load(lexicons_pickle_file)

In [5]:
# One column per lexicon type: for every message, the number of matches that
# lexicon's regex yields (len of re.findall, so empty-string matches count too).
test_df = pd.concat(
    [
        chat_df["message"]
        .apply(lambda message_text: len(re.findall(regex, message_text)))
        .rename(lexicon_type)
        for lexicon_type, regex in lexicons_dict.items()
    ],
    axis=1,
)

In [6]:
# Per-message match counts; rows follow chat_df's index, columns are the lexicon types.
test_df

Unnamed: 0,discrepancies,hear,home,conjunction,certainty,inclusive,bio,achievement,adverbs,anxiety,...,auxiliary_verbs,cognitive_mech,preposition,first_person_plural,percept,second_person,positive_words,first_person,nltk_english_stopwords,hedge_words
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,0,2,1,3,0
4,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,2,1,2,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,2,0,...,2,3,0,0,1,0,1,2,5,1
8,0,0,0,1,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,5,0
9,0,0,0,0,0,0,0,2,0,0,...,1,1,0,0,0,0,6,1,3,0


A few potential error cases are shown below. The lexicon regexes appear to return more matches than they should, i.e. false positives.

In [7]:
# There seem to be a number of false positives in the first_person lexicon.
# NOTE(review): the recorded hits include 'is', 'interpreter' and 'in' besides 'i'.
# Presumably each lexicon term is followed by \S* (as in the pattern printed
# further down), so any word starting with a term matches - TODO confirm.
re.findall(lexicons_dict["first_person"], "i can see how the family is upset because they feel the mother was disrespected but i can also understand the guy\'s feelings. why should he have to work as interpreter for his mother in law?")

['i', 'is', 'i', 'interpreter', 'in']

In [8]:
# Same message against the NLTK stopword lexicon. The raw text is passed here
# (not the preprocess_text output), so punctuation such as the apostrophe is still present.
re.findall(lexicons_dict["nltk_english_stopwords"], "i can see how the family is upset because they feel the mother was disrespected but i can also understand the guy\'s feelings. why should he have to work as interpreter for his mother in law?")

['i',
 'can',
 'see',
 'how',
 'the',
 'is',
 'upset',
 'because',
 'they',
 'the',
 'mother',
 'was',
 'disrespected',
 'but',
 'i',
 'can',
 'also',
 'understand',
 'the',
 's',
 'why',
 'should',
 'he',
 'have',
 'to',
 'as',
 'interpreter',
 'for',
 'his',
 'mother',
 'in']

In [9]:
# Same message against cognitive_mech.
# NOTE(review): the recorded result is a list of empty strings. re.findall returns
# the contents of a capture group when the pattern has one, and also reports empty
# matches, so one of those is presumably happening here. Either way len() counts
# each entry as a hit - TODO inspect this pattern.
re.findall(lexicons_dict["cognitive_mech"], "i can see how the family is upset because they feel the mother was disrespected but i can also understand the guy\'s feelings. why should he have to work as interpreter for his mother in law?")

['', '', '', '', '', '', '', '', '', '']

In [10]:
# Same message against the preposition lexicon.
# NOTE(review): the recorded hits include 'upset', 'understand' and 'interpreter',
# presumably prefix matches on shorter prepositions - TODO confirm against the pattern.
re.findall(lexicons_dict["preposition"], "i can see how the family is upset because they feel the mother was disrespected but i can also understand the guy\'s feelings. why should he have to work as interpreter for his mother in law?")

['upset', 'understand', 'to', 'as', 'interpreter', 'for', 'in']

In [11]:
# All lexicon counts for row 4, i.e. the message 'i am good'.
test_df.loc[4, :]

discrepancies             0
hear                      0
home                      0
conjunction               0
certainty                 0
inclusive                 0
bio                       0
achievement               0
adverbs                   0
anxiety                   0
third_person              0
negation                  0
swear                     0
death                     0
health                    0
see                       0
body                      0
family                    0
negative_affect           0
quantifier                0
positive_affect           1
insight                   0
humans                    0
present_tense             2
future_tense              0
past_tense                0
relative                  1
sexual                    0
inhibition                0
sadness                   0
social                    0
indefinite_pronoun        0
religion                  0
work                      0
money                     0
causation           

In [12]:
# Probe only the first lexicon; slicing to a single item replaces the explicit `break`.
for lexicon_type, regex in list(lexicons_dict.items())[:1]:
    text = "hi"
    print(lexicon_type)
    # NOTE(review): re.findall takes (pattern, string). Here "hi" is the pattern and
    # the lexicon's regex source is the string being searched, so this reports whether
    # the literal "hi" occurs inside the pattern text, not whether the lexicon matches
    # "hi". Kept as written; swap the arguments if the latter was intended.
    search_hits = re.findall(text, regex)
    print(search_hits)

discrepancies
['hi']


In [13]:
# `regex` is the variable left bound by the probe loop above, i.e. the first
# lexicon's pattern only. Hidden state: on a fresh kernel this cell fails
# unless that loop cell has been run first.
regex

"\\bbesides\\S*\\b|\\bcould\\S*\\b|\\bcouldnt\\S*\\b|\\bcouldn't\\S*\\b|\\bcouldve\\S*\\b|\\bcould've\\S*\\b|\\bdesir\\S*\\b|\\bexpect\\S*\\b|\\bhope\\S*\\b|\\bhoped\\S*\\b|\\bhopeful\\S*\\b|\\bhopefully\\S*\\b|\\bhopefulness\\S*\\b|\\bhopes\\S*\\b|\\bhoping\\S*\\b|\\bideal\\S*\\b|\\bif\\S*\\b|\\bimpossib\\S*\\b|\\binadequa\\S*\\b|\\black\\S*\\b|\\bliabilit\\S*\\b|\\bmistak\\S*\\b|\\bmust\\S*\\b|\\bmustnt\\S*\\b|\\bmust'nt\\S*\\b|\\bmustn't\\S*\\b|\\bmustve\\S*\\b|\\bmust've\\S*\\b|\\bneed\\S*\\b|\\bneeded\\S*\\b|\\bneeding\\S*\\b|\\bneednt\\S*\\b|\\bneed'nt\\S*\\b|\\bneedn't\\S*\\b|\\bneeds\\S*\\b|\\bnormal\\S*\\b|\\bought\\S*\\b|\\boughta\\S*\\b|\\boughtnt\\S*\\b|\\bought'nt\\S*\\b|\\boughtn't\\S*\\b|\\boughtve\\S*\\b|\\bought've\\S*\\b|\\boutstanding\\S*\\b|\\bprefer\\S*\\b|\\bproblem\\S*\\b|\\brather\\S*\\b|\\bregardless\\S*\\b|\\bregret\\S*\\b|\\bshould\\S*\\b|\\bshouldnt\\S*\\b|\\bshould'nt\\S*\\b|\\bshouldn't\\S*\\b|\\bshoulds\\S*\\b|\\bshouldve\\S*\\b|\\bshould've\\S*\\b|\\bund

In [14]:
# Here "hi" is the pattern and the lexicon's regex source is the string being searched.
# re.match only tries the very start of that string, so with the pattern text shown
# above the result is None and the cell displays nothing.
re.match("hi", regex)

In [15]:
# Conventional argument order (pattern first, then the text): does the lexicon
# pattern held in `regex` match anything in the message "hi"? Recorded result: no hits.
re.findall(regex, "hi")

[]