# Imports

In [10]:
import os
import pickle
import re

# Preprocessing Lexicons

Here, we take the following steps:
- Read all the lexicon files
- Replace and clean up the special characters
- Express all lexicons in the following regular expression format: 
  ```
  <Word boundary>lexicon<Any non-whitespace characters, zero or more times (only for entries containing the * wildcard)><Word boundary>
  ```

- Finally, we join all the individual regular expressions of a lexicon file into one master regex using the OR operator (`"|"`)

In [11]:
# Maps each lexicon name (file name without ".txt") to its combined regex string;
# filled in place by read_in_lexicons().
lexicons_dict = dict()

In [12]:
def read_in_lexicons(directory):
    """Build one alternation regex per lexicon file and store it in ``lexicons_dict``.

    Every regular, non-hidden file in ``directory`` is read as one lexicon.
    Each non-blank line becomes a pattern delimited by ``\\b`` word boundaries;
    entries containing ``*`` additionally allow any run of non-whitespace
    characters after the stem. The patterns of one file are joined with ``|``
    and stored in the module-level ``lexicons_dict`` under the file name with a
    trailing ``.txt`` removed.

    Parameters
    ----------
    directory : str
        Folder holding the lexicon files. A trailing path separator is optional.

    Returns
    -------
    None
        The function only updates ``lexicons_dict`` in place.
    """
    # Sorted so the insertion order of lexicons_dict (and thus the pickle) is reproducible.
    for filename in sorted(os.listdir(directory)):
        path = os.path.join(directory, filename)

        # Skip hidden entries (e.g. .DS_Store, .ipynb_checkpoints) and sub-folders
        # *before* trying to open them.
        if filename.startswith(".") or not os.path.isfile(path):
            continue

        lines = []
        with open(path, encoding="mac_roman") as lexicons:
            for lexicon in lexicons:
                # get rid of parentheses, the line ending and surrounding whitespace
                term = lexicon.replace('(', '').replace(')', '').strip()

                # '*' marks a wildcard; removing every '*' also covers repeats such as '**'
                stem = term.replace("*", "")
                if not stem:
                    # A blank entry would yield r"\b\b" (or r"\b\S*\b"), which
                    # matches at every word boundary, so it is dropped.
                    continue

                # build the final lexicon
                suffix = r"\S*\b" if "*" in term else r"\b"
                lines.append(r"\b" + stem + suffix)

        # Only strip a literal, trailing ".txt" extension.
        clean_name = re.sub(r'\.txt$', '', filename)
        lexicons_dict[clean_name] = "|".join(lines)

In [13]:
# Populate lexicons_dict from both lexicon folders: the LIWC lexicons first,
# then the other lexicons.
for lexicon_dir in ("../lexicons/liwc_lexicons/", "../lexicons/other_lexicons/"):
    read_in_lexicons(directory=lexicon_dir)

# Saving Preprocessed Lexicons Dictionary

In [14]:
# Serialise the preprocessed lexicon regexes to disk.
with open("../lexicons_dict.pkl", "wb") as lexicons_pickle_file:
    lexicons_pickle_file.write(pickle.dumps(lexicons_dict))

# Read the file straight back to confirm the round trip is lossless.
with open("../lexicons_dict.pkl", "rb") as lexicons_pickle_file:
    lexicons_dict_loaded = pickle.loads(lexicons_pickle_file.read())

# Bare expression so the notebook displays the comparison result.
lexicons_dict == lexicons_dict_loaded

True