In [1]:

from multiprocessing import Pool
import re
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import pickle
import itertools
import time
import os
from nltk import word_tokenize


# Ordered (pattern, replacement) pairs that expand English contractions.
# Every apostrophe slot accepts both the straight (') and the typographic (’)
# character, so the expansion no longer depends on which quote style the
# source text happens to use.  Order matters: rules are applied top to bottom.
_CONTRACTION_RULES = (
    (r"what ' s", "what is "),
    (r"he['’]s", "he is"),
    (r"she['’]s", "she is"),
    (r"it['’]s", "it is"),
    (r"let['’]s", "let us"),
    (r"can['’]t", "can not "),
    (r"you['’]ll", "you will"),
    (r"he['’]ll", "he will"),
    (r"she['’]ll", "she will"),
    (r"didn['’]t", "did not"),
    (r"don['’]t", "do not"),
    (r"won['’]t", "will not"),
    (r"wouldn['’]t", "would not"),
    (r"shouldn['’]t", "should not"),
    (r"doesn['’]t", "does not"),
    (r"aren['’]t", "are not"),
    (r"isn['’]t", "is not"),
    # The optional "n" keeps matching the historical misspelling "could't"
    # while also catching the real contraction "couldn't".
    (r"couldn?['’]t", "could not"),
    (r"weren['’]t", "were not"),
    (r"hasn['’]t", "has not"),
)

# Ordered (pattern, replacement) pairs for the general clean-up pass.
# Order matters: e.g. commas and apostrophes are blanked only after the
# rules that rely on them have run.
_CLEANUP_RULES = (
    (r"u\.s\.", "american"),
    (r"4get", "forget "),
    (r"coo[o]+", "cooo"),
    (r"so[o]+", "sooo "),
    (r" [0-9]+ : [0-9]+", " aa:bb "),   # clock times -> placeholder
    (r" [0-9]+", " "),                  # drop free-standing numbers
    (r"\?", " ? "),
    (r"\.\.[\.]+", " a... "),           # three or more dots -> placeholder
    (r"uh[h]+", " uh "),
    (r"zz[z]+", " zzz "),
    (r"loo[o]+l", " lool "),
    (r" \.\.", " "),
    # Contractions that arrive already split into space-separated pieces.
    (r"\' s", " "),
    (r"\' ve", " have "),
    (r"can ' t", "can not "),
    (r"n ' t", " not "),
    (r"i ' m", "i am "),
    (r"\' re", " are "),
    (r"\' d", " would "),
    (r"\' ll", " will "),
    (r",", " "),
    (r" \.", " "),
    (r"!![!]+", " !!! "),
    (r"\/", " "),
    (r"\^", " ^ "),
    (r"\+", " + "),
    (r"e - mail", "email"),
    (r"\-", " - "),
    (r" \=", " "),
    (r"'", " "),
    (r"(\d+)(k)", r"\g<1>000"),         # "5k" -> "5000"
    (r" e g ", " eg "),
    (r" b g ", " bg "),
    (r" u s ", " american "),
    (r"j k", "jk"),
    (r"\s{2,}", " "),                   # collapse runs of whitespace
)

# Characters outside this class are blanked inside each token.
# NOTE(review): "'-=" inside the class is parsed as a *range* (from "'" to
# "="), which also admits characters such as ( ) * : ; < — kept as-is to
# preserve existing output; confirm whether that was intended.
_TOKEN_JUNK = r"[^A-Za-z0-9,!+^.'\/,'-=?-]"


def _english_stopwords():
    """Return the NLTK English stop-word set, loading the corpus only once."""
    cached = getattr(_english_stopwords, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words("english"))
        _english_stopwords._cache = cached
    return cached


def _apply_rules(text, rules):
    """Apply each (pattern, replacement) pair to ``text`` in order."""
    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text)
    return text


def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    """Normalise a raw text into a cleaned, space-separated token string.

    Steps, in order: lower-case and split on whitespace, optionally drop
    English stop words, cut everything from "appeared in the" onwards,
    expand contractions, apply the clean-up substitutions, optionally stem,
    then tokenise with ``word_tokenize`` and scrub each token (digits
    removed, disallowed characters blanked, tokens shorter than two
    characters dropped).

    Parameters
    ----------
    text : str
        The raw text to clean.
    remove_stopwords : bool, optional
        When truthy, whitespace-delimited words found in NLTK's English
        stop-word list are removed before any other cleaning.
    stem_words : bool, optional
        When truthy, every word is reduced to its Snowball (English) stem
        before tokenisation.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces (a string, not a list).
    """
    words = text.lower().split()

    if remove_stopwords:
        stops = _english_stopwords()
        words = [w for w in words if w not in stops]

    text = " ".join(words)

    # Drop everything from "appeared in the" to the end of the text.
    # NOTE(review): with remove_stopwords=True the words "in" and "the" have
    # presumably already been filtered out above, so this pattern likely
    # never fires in that mode — confirm whether the footer should be
    # stripped before stop-word removal instead.
    # A second, narrower "print edition" footer pattern used to run after the
    # clean-up pass; it required commas, which are all blanked by then, so it
    # could never match and has been removed.
    text = re.sub(r"\bappeared in the.*", '', text)

    text = _apply_rules(text, _CONTRACTION_RULES)
    text = _apply_rules(text, _CLEANUP_RULES)

    # Optionally, shorten words to their stems.
    if stem_words:
        stemmer = SnowballStemmer('english')
        text = " ".join(stemmer.stem(word) for word in text.split())

    token_list = []
    for token in word_tokenize(text):
        token = re.sub(r"\d+", "", token)       # strip digits inside tokens
        token = re.sub(r'\.', ' ', token)
        if len(token) > 1:
            token = re.sub(_TOKEN_JUNK, "  ", token)
            # Re-glue the pieces so a blanked character never splits a word.
            token = token.strip().replace(' ', '')
            if len(token) > 1:
                token_list.append(token)
    text = ' '.join(token_list)

    # Re-join negations that the tokenizer split off as "n't".
    text = re.sub(r"do n't", 'do not', text)
    text = re.sub(r"did n't", 'did not', text)

    return text

In [20]:
# Root folder holding one sub-folder per year.  The WSJ_DATA_PATH environment
# variable overrides the default so the notebook is not tied to one machine.
data_path = os.environ.get('WSJ_DATA_PATH', '/Users/mueric35/Downloads/wsj_haha/data')
# Years to process, as strings so they can be compared with folder names.
selected_year = [str(year) for year in range(2016, 2018)]
selected_year

['2016', '2017']

In [21]:
# map function
def tidy_text(key,texts_dict):
    """Clean the 'paragraph' text stored under ``key`` in ``texts_dict``.

    The paragraph is passed to ``text_to_wordlist`` with stop-word removal
    switched on and stemming switched off; the cleaned string is returned.
    """
    record = texts_dict.get(key)
    return text_to_wordlist(record['paragraph'], remove_stopwords=True, stem_words=False)


In [26]:
def clean_year(selected_year,data_path):
    """Tidy the paragraphs of every selected year's pickle and save the result.

    For each entry of ``data_path`` whose name is in ``selected_year``, this
    loads ``<data_path>/<year>/wsj_<year>_dic.pkl``, replaces each record's
    'paragraph' value with the output of ``tidy_text``, and writes the
    updated dictionary to ``<data_path>/<year>/tidy_wsj_<year>_dic.pkl``.
    Progress and elapsed time are printed along the way.

    Parameters
    ----------
    selected_year : collection of str
        Folder names (years) to process.
    data_path : str
        Directory containing one sub-folder per year.

    Returns
    -------
    None
    """
    path = data_path
    # Sorted so years are always processed in the same order.
    for year in sorted(os.listdir(path)):
        if year not in selected_year:
            continue

        print('Working on ' + str(year))

        year_dir = os.path.join(path, year)
        pk_year_path = os.path.join(year_dir, 'wsj_' + year + '_dic.pkl')
        print('Loading: ' + pk_year_path)

        # NOTE: pickle.load can execute arbitrary code; only use it on
        # files that come from a trusted source.
        with open(pk_year_path, 'rb') as handle:
            texts_dict = pickle.load(handle)

        start_time = time.time()
        # Records are updated in place; the loaded dictionary itself is
        # what gets saved below.
        for key, record in texts_dict.items():
            record['paragraph'] = tidy_text(key, texts_dict)

        print("--- %s seconds ---" % (time.time() - start_time))

        print('Finished tidying')
        save_path = os.path.join(year_dir, 'tidy_wsj_' + year + '_dic.pkl')
        print('Saving to ' + save_path + "\n")
        with open(save_path, 'wb') as handle:
            pickle.dump(texts_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)


In [27]:
clean_year(selected_year,data_path)

Working on 2016
Loading: /Users/mueric35/Downloads/wsj_haha/data/2016/wsj_2016_dic.pkl
--- 354.1629378795624 seconds ---
Finished tidying
Saving to /Users/mueric35/Downloads/wsj_haha/data/2016/tidy_wsj_2016_dic.pkl

Working on 2017
Loading: /Users/mueric35/Downloads/wsj_haha/data/2017/wsj_2017_dic.pkl
--- 293.4192051887512 seconds ---
Finished tidying
Saving to /Users/mueric35/Downloads/wsj_haha/data/2017/tidy_wsj_2017_dic.pkl

