# Imports

In [30]:
import os
import pickle
import re

# Preprocessing Lexicons

Here, we take the following steps:
- Read all the lexicon files
- Replace and clean up the special characters
- Express all lexicons in the following regular expression format: 
  ```
  <word boundary>lexicon<any run of non-whitespace characters><word boundary>
  ```

- Finally, for each lexicon file, we join all these individual regular expressions into one master regex using the OR (alternation) operator `|`

In [31]:
# Shared registry mapping each lexicon name to its combined regex pattern;
# it is filled in by read_in_lexicons.
lexicons_dict = dict()

In [32]:
def read_in_lexicons(directory, lexicons=None):
    """Build one alternation regex per lexicon file found in ``directory``.

    Every non-empty line of a lexicon file is treated as one term. Wildcard
    asterisks are removed and the term is wrapped as ``\\b<term>\\S*\\b``; the
    wrapped terms of a file are then joined with ``|`` and stored under the
    file name without its ``.txt`` suffix.

    Parameters
    ----------
    directory : str
        Folder containing the lexicon text files (read as ``mac_roman``).
        A trailing path separator is optional.
    lexicons : dict, optional
        Dictionary to populate. Defaults to the notebook-level
        ``lexicons_dict`` so existing calls keep their behaviour.

    Returns
    -------
    dict
        The populated dictionary (the same object that was updated).
    """
    if lexicons is None:
        # Fall back to the notebook-wide registry for backward compatibility.
        lexicons = lexicons_dict

    # Sorted so the insertion order does not depend on the file system.
    for filename in sorted(os.listdir(directory)):
        path = os.path.join(directory, filename)
        # Skip hidden files (e.g. ".DS_Store") and sub-directories *before*
        # trying to open and decode them.
        if filename.startswith(".") or not os.path.isfile(path):
            continue

        with open(path, encoding="mac_roman") as lexicon_file:
            terms = [line.replace("*", "").strip() for line in lexicon_file]

        # Drop empty terms: a blank line would otherwise become r"\b\S*\b",
        # which matches every word and defeats the whole lexicon.
        patterns = [r"\b" + term + r"\S*\b" for term in terms if term]

        # Remove only a trailing ".txt"; the previous unescaped regex ".txt"
        # also deleted things like "atxt" from the middle of a file name.
        if filename.endswith(".txt"):
            clean_name = filename[:-len(".txt")]
        else:
            clean_name = filename
        lexicons[clean_name] = "|".join(patterns)

    return lexicons

In [33]:
# Populate lexicons_dict from each lexicon collection in turn.
for lexicon_dir in (
    "lexicons/liwc_lexicons/",   # LIWC lexicons
    "lexicons/other_lexicons/",  # other lexicons
):
    read_in_lexicons(directory=lexicon_dir)

In [34]:
# Display the full mapping of lexicon name -> combined regex (the output is long).
lexicons_dict

{'discrepancies': "\\bbesides\\S*\\b|\\bcould\\S*\\b|\\bcouldnt\\S*\\b|\\bcouldn't\\S*\\b|\\bcouldve\\S*\\b|\\bcould've\\S*\\b|\\bdesir\\S*\\b|\\bexpect\\S*\\b|\\bhope\\S*\\b|\\bhoped\\S*\\b|\\bhopeful\\S*\\b|\\bhopefully\\S*\\b|\\bhopefulness\\S*\\b|\\bhopes\\S*\\b|\\bhoping\\S*\\b|\\bideal\\S*\\b|\\bif\\S*\\b|\\bimpossib\\S*\\b|\\binadequa\\S*\\b|\\black\\S*\\b|\\bliabilit\\S*\\b|\\bmistak\\S*\\b|\\bmust\\S*\\b|\\bmustnt\\S*\\b|\\bmust'nt\\S*\\b|\\bmustn't\\S*\\b|\\bmustve\\S*\\b|\\bmust've\\S*\\b|\\bneed\\S*\\b|\\bneeded\\S*\\b|\\bneeding\\S*\\b|\\bneednt\\S*\\b|\\bneed'nt\\S*\\b|\\bneedn't\\S*\\b|\\bneeds\\S*\\b|\\bnormal\\S*\\b|\\bought\\S*\\b|\\boughta\\S*\\b|\\boughtnt\\S*\\b|\\bought'nt\\S*\\b|\\boughtn't\\S*\\b|\\boughtve\\S*\\b|\\bought've\\S*\\b|\\boutstanding\\S*\\b|\\bprefer\\S*\\b|\\bproblem\\S*\\b|\\brather\\S*\\b|\\bregardless\\S*\\b|\\bregret\\S*\\b|\\bshould\\S*\\b|\\bshouldnt\\S*\\b|\\bshould'nt\\S*\\b|\\bshouldn't\\S*\\b|\\bshoulds\\S*\\b|\\bshouldve\\S*\\b|\\bshoul

# Saving Preprocessed Lexicons Dictionary

In [35]:
# Serialise the lexicon patterns to disk.
with open("lexicons_dict.pkl", "wb") as lexicons_pickle_file:
    lexicons_pickle_file.write(pickle.dumps(lexicons_dict))

# Read the file straight back. Unpickling is acceptable here because the file
# was written by this notebook a moment ago; never unpickle untrusted data.
with open("lexicons_dict.pkl", "rb") as lexicons_pickle_file:
    lexicons_dict_loaded = pickle.loads(lexicons_pickle_file.read())

# Sanity check: the round-tripped dictionary must equal the in-memory one.
lexicons_dict_loaded == lexicons_dict

True