# Imports

In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import re

In [34]:
data = {
    'raw_message': [
        'hello',
        'hi', # for some reason this triggers really large values in things like 'bio' and 'cognitive mech'
        'hi!',
        'I am happy today', # first_person is always 0
        'i am good', # first_person is always 0
        '()()()()' # all symbols leads to the regex returning len(regex)
    ]
}

def preprocess_text(text):
  	# For each individual message: preprocess to remove anything that is not an alphabet or number from the string
	return(re.sub(r"[^a-zA-Z0-9 ]+", '',text).lower())

chat_df = pd.DataFrame(data)

chat_df["message"] = chat_df["raw_message"].apply(preprocess_text)

In [35]:
chat_df

Unnamed: 0,raw_message,message
0,hello,hello
1,hi,hi
2,hi!,hi
3,I am happy today,i am happy today
4,i am good,i am good
5,()()()(),


In [36]:
with open("../../features/lexicons_dict.pkl", "rb") as lexicons_pickle_file:
    lexicons_dict = pickle.load(lexicons_pickle_file)

In [37]:
test_df = pd.concat(
		# Finding the # of occurances of lexicons of each type for all the messages.
		[pd.DataFrame(chat_df["message"].apply(lambda chat: len(re.findall(regex, chat))))\
			  							.rename({"message": lexicon_type}, axis=1)\
			for lexicon_type, regex in lexicons_dict.items()], 
		axis=1
	)

In [47]:
test_df.loc[4, :]

discrepancies            0
hear                     0
home                     0
conjunction              0
certainty                0
inclusive                0
bio                      0
achievement              0
adverbs                  0
anxiety                  0
third_person             0
negation                 0
swear                    0
death                    0
health                   0
see                      0
body                     0
family                   0
negative_affect          0
quantifier               0
positive_affect          1
insight                  0
humans                   0
present_tense            2
future_tense             0
past_tense               0
relative                 1
sexual                   0
inhibition               0
sadness                  0
social                   0
indefinite_pronoun       0
religion                 0
work                     0
money                    0
causation                0
anger                    0
f

In [29]:
for lexicon_type, regex in lexicons_dict.items():
    print(lexicon_type)
    text = "hi"
    search_hits = re.findall(text, regex)
    print(search_hits)
    break

discrepancies
['hi']


In [30]:
regex

"\\bbesides\\S*\\b|\\bcould\\S*\\b|\\bcouldnt\\S*\\b|\\bcouldn't\\S*\\b|\\bcouldve\\S*\\b|\\bcould've\\S*\\b|\\bdesir\\S*\\b|\\bexpect\\S*\\b|\\bhope\\S*\\b|\\bhoped\\S*\\b|\\bhopeful\\S*\\b|\\bhopefully\\S*\\b|\\bhopefulness\\S*\\b|\\bhopes\\S*\\b|\\bhoping\\S*\\b|\\bideal\\S*\\b|\\bif\\S*\\b|\\bimpossib\\S*\\b|\\binadequa\\S*\\b|\\black\\S*\\b|\\bliabilit\\S*\\b|\\bmistak\\S*\\b|\\bmust\\S*\\b|\\bmustnt\\S*\\b|\\bmust'nt\\S*\\b|\\bmustn't\\S*\\b|\\bmustve\\S*\\b|\\bmust've\\S*\\b|\\bneed\\S*\\b|\\bneeded\\S*\\b|\\bneeding\\S*\\b|\\bneednt\\S*\\b|\\bneed'nt\\S*\\b|\\bneedn't\\S*\\b|\\bneeds\\S*\\b|\\bnormal\\S*\\b|\\bought\\S*\\b|\\boughta\\S*\\b|\\boughtnt\\S*\\b|\\bought'nt\\S*\\b|\\boughtn't\\S*\\b|\\boughtve\\S*\\b|\\bought've\\S*\\b|\\boutstanding\\S*\\b|\\bprefer\\S*\\b|\\bproblem\\S*\\b|\\brather\\S*\\b|\\bregardless\\S*\\b|\\bregret\\S*\\b|\\bshould\\S*\\b|\\bshouldnt\\S*\\b|\\bshould'nt\\S*\\b|\\bshouldn't\\S*\\b|\\bshoulds\\S*\\b|\\bshouldve\\S*\\b|\\bshould've\\S*\\b|\\bund

In [31]:
re.match("hi", regex)

In [32]:
re.findall(regex, "hi")

[]