In [None]:
import pandas as pd
import re
import spacy
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [None]:
import logging

# Configure the root logger: INFO level, timestamped pipe-separated records.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s | %(name)s | %(levelname)s | %(message)s',
)

In [None]:
# Load the 'en_core_web_sm' spaCy pipeline once; `nlp` is the module-level
# pipeline that lemmatize_spacy calls on every sentence.
nlp = spacy.load('en_core_web_sm')

In [None]:
# Load the input documents. do_data_prep reads the 'text_w_ids' column of this frame.
df = pd.read_csv('docs_sub_name_id.csv')

### Setting up functions for data preparation

In [None]:
# Text patterns to be removed.
#
# Every pattern is a raw string so that regex escapes (\s, \d, \(, ...) are not
# parsed as (invalid) Python string escapes. Keys must be unique: a repeated key
# in a dict literal silently discards the earlier pattern.
# do_data_prep applies the patterns in the order they are listed here, each
# match being replaced by a single space.
month_clause = '(Jan(uary)?|Feb(ruary)?|Mar(ch)?|Apr(il)?|May|Jun(e)?|Jul(y)?|Aug(ust)?|Sep(tember)?|Oct(ober)?|Nov(ember)?|Dec(ember)?)'

regex_templates = {
    # Date in the format "Month day, Year / time /"
    'reuters_date_time': r'(((' + month_clause + r'\s+\d{1,2},\s+\d{4})))\s/\s\d{1,2}:\d{1,2}\s\w{2}\s/',
    'another_reuters_regex': r'((' + month_clause + r'\s+\d{1,2},\s+\d{4}))\s\/\s\d{1,2}:\d{1,2}\s\w{2}\s',
    # Everything before /PRNewswire/: "WHITE PLAINS, N.Y., May 23, 2018 /PRNewswire/ -- Bunge Limited (NYSE: BG) today"
    'cnbc_regex': r'.*/PRNewswire/\s--\s',
    # "Updated N hours(minutes, days) ago"
    'updated_regex': r'\s?(Updated )?\d{1,2} (hours|minutes|days) ago',
    # in N minutes(hours, days)
    'in_minutes': r'\s?in\s\d{1,2}\s(minutes|hours|days)',
    # Everything before (GLOBE NEWSWIRE) --
    'globe_newswire': r'(.*)\(GLOBE NEWSWIRE\)\s--',
    # Another date format - '13 April 2018 - '
    'date_regex': r'\d{1,2}\s' + month_clause + r'\s\d{4}\s-\s',
    # Everything before --(BUSINESS WIRE)--
    'business_wire': r'(.*)--\(BUSINESS WIRE\)--',
    # "N COMMENTS"
    'comments': r'\d{1,6}\sCOMMENTS\s',
    # Place + date at the beginning
    'place_date': r'\A(.*),\s((' + month_clause + r'\s+\d{1,2},\s+\d{4}))\s-',
    # Updated an hour ago
    'hour_ago': r'\A\s?Updated an hour ago',
    # N Min Read
    'min_read': r'\s\d{1,2} Min Read',
    # time + in N minutes
    'time_in_minutes': r'\s\d{1,2}\s\w{2}\s\/\sin\s\d{1,2}\sminutes',
    # by with time - By Phil Wahba 7:55 AM EST
    'by_time': r'\A\s?By\s(.*)\s\d{1,2}:\d{1,2}\s\w{2}\s\w{3}',
    # by with date without the year, with (Reuters)
    'by_without_year': r'\A\s?By\s(.*)' + month_clause + r'\s+\d{1,2}\s\(Reuters\)\s-',
    # by with year
    'by_with_year': r'\A\s?By\s(.*)' + month_clause + r'\s+\d{1,2},\s\d{4}',
    # City, date (Reuters)
    'city_date_reuters': r'\A\w+,\s' + month_clause + r'\s+\d{1,2}\s\(Reuters\)\s-',
    # a minute ago
    'minute_ago': r'\A\s?a minute ago',
    # 10:30 AM ET Sun,
    'time': r'\d{1,2}:\d{2}\s(\w{2}\s)*(Mon|Tue|Wed|Thu|Fri|Sat|Sun)?(,\s)?',
    # another date
    'another_date': r'(\d{1,2}\s)?(' + month_clause + r')(\s+\d{1,2},)?\s+\d{2,4}',
    # phone format
    'phone_3': r'(\d+(/|-){1}\d+(/|-){1}\d{2,4})',
    # phone with letters
    'phone_with_letter': r'1-800-\d{3}-\w{4}',
    # just URL starting with HTTP(S)
    'url_regex': r'https?:\/\/(www\.)?(\S+)',
    # View original content with multimedia: {URL}
    'orig_content_with_multimedia': r'View original content with multimedia: https?:\/\/.*\n(.*)',
    # View original content
    'orig_content': r'View original content: https?:\/\/.*\n(.*)',
    # Related Articles - URL
    'related': r'Related Articles\nhttps?:\/\/.*',
    # About Us: contact details
    'about_us': r'About Us:(\n(.*))+',
    # View on business wire
    'view_on_bw': r'View source version on businesswire\.com :',
    # For further information contact:
    'for_further': r'For further information contact:(\n(.*))+',
    # For more information, visit {URL}
    'for_more_info': r'For more information, visit (https?:\/\/)?(www\.)?(\S+)(\s\.)?',
    # For additional information
    'for_additional': r'For additional information,(\splease\s)?visit (https?:\/\/)?(www\.)?(\S+)(\s\.)?',
    # This press release features multimedia. View the full release here: {URL}
    'press_release': r'This press release features multimedia\. View the full release here:\s(https?:\/\/)?(www\.)?(\S+)(\s\.)?',
    # For more information, (please) visit {URL}
    'for_more_info_please': r'For more information,(\splease)?\svisit\s(https?:\/\/)?(www\.)?(\S+)(\s\.)?',
    # Email
    'email': r'\S*@\S*\s?',
    # Visit at or follow
    'visit_or_follow': r'((V|v)isit)(.*)(at)?follow us on(.*)',
    # For more information <anything>, visit <anything up to the next '.'>
    # This entry used to reuse the key 'for_more_info', which overwrote the
    # URL-specific pattern above; it now has its own key so both are applied.
    'for_more_info_generic': r'For more information[^,]+,\svisit[^.]+.',
    # Follow us on
    'follow_us_on': r'Follow us on[^.]+.',
    # Follow us on Twitter
    'follow_us_on_twitter': r'follow us on Twitter(:)?\s@[^\s]+',
    # Like us on Facebook (key name kept as-is)
    'go_to': r'Like (U|u)s on Facebook',
    # Like this story
    # NOTE(review): the '?' is unescaped, so it makes the 'y' optional instead
    # of matching a literal question mark - confirm intent.
    'like_this_story': r'Like this story?(.*)',
    # phone
    'phone': r'(\+?\s?\d{1,3}\s)?(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})',
    # contact telephone email
    'contact_telephone_email': r'Contact:(.*)?(Date:)?(.*)?((Tele)?phone)?(.*)?Email:',
    # Matches any URL
    'url_all': r'((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:".,<>?«»“”‘’]))'

}

In [None]:
# Blanking out regex matches in a text

def find_regex(text, template):
    """Return `text` with every match of the regex `template` replaced by one space."""
    return re.compile(template).sub(' ', text)

In [None]:
# Whitespace normalization and lower-casing

def cleanup_and_to_lower(text):
    """
    Collapse every run of whitespace (spaces, tabs, new lines, ...) into a
    single space, strip leading/trailing whitespace, and lower-case the result.
    """
    words = text.split()
    return " ".join(words).lower()

In [None]:
# Punctuation removal
# Two variants exist; 'remove_all_punct' is the one 'do_data_prep' uses. Swap the call there to use the other.

def remove_all_punct(sentences):
    """
    Replace each punctuation character listed below with a space in every
    sentence and return the results as a new list.

    Characters not listed (e.g. $, &, _) are left untouched.
    """
    punct = "\"|\';,[]:?!=%#()—\\/~*+<>@^{}-.“”‘’"
    # One translation table mapping every listed character to a space.
    to_space = str.maketrans(punct, ' ' * len(punct))
    return [sentence.translate(to_space) for sentence in sentences]

def remove_some_punct(sentences):
    """
    Replace a reduced set of punctuation characters with spaces.

    Compared with 'remove_all_punct' this keeps ! " % . { } and the curly
    double quotes “ ” (in addition to $, & and _).

    sentences: iterable of strings.
    Returns a new list with one cleaned string per input sentence.
    """
    # Only valid escapes are used here (\' is not needed inside double quotes,
    # and ( ) / need no escaping at all), which avoids invalid-escape warnings.
    punct = "|;,'-[]:?=—#()\\/~*+<>@^‘’"
    to_space = str.maketrans(punct, ' ' * len(punct))
    return [sent.translate(to_space) for sent in sentences]

In [None]:
# Stopword filtering

def remove_stopwords(sentences):
    """
    Drop every whitespace-separated word that is in the module-level
    `stop_words` collection and re-join the remaining words with single spaces.

    Returns a new list with one filtered string per input sentence.
    """
    def _strip(sentence):
        kept = filter(lambda word: word not in stop_words, sentence.split())
        return " ".join(kept)

    return [_strip(sentence) for sentence in sentences]

In [None]:
# Lemmatization
# Two implementations are provided; 'lemmatize_spacy' is the one 'do_data_prep' uses. Swap the call there to use the other.

def lemmatize_spacy(sentences):
    """
    Run each sentence through the module-level spaCy pipeline `nlp` and
    join the `lemma_` of every token with single spaces.

    Returns a new list with one lemmatized string per input sentence.
    """
    return [
        " ".join(tok.lemma_ for tok in nlp(sentence))
        for sentence in sentences
    ]


lemmatizer = WordNetLemmatizer()
def lemmatize_nltk(sentences):
    """
    Lemmatize each sentence with NLTK.

    Every sentence is split with `word_tokenize`, each token is passed through
    the module-level `lemmatizer`, and the results are joined with single spaces.

    sentences: iterable of strings.
    Returns a new list with one lemmatized string per input sentence.
    """
    new_sents = []
    for sent in sentences:
        lemmas = [lemmatizer.lemmatize(word) for word in word_tokenize(sent)]
        new_sents.append(" ".join(lemmas))
    return new_sents

### Data preparation function

In [None]:
stop_words = set(stopwords.words('english'))
def do_data_prep(df):
    """
    Build the 'prep_text' column from 'text_w_ids'.

    Steps:
    1. removing unnecessary text patterns (every entry of `regex_templates`, in order)
    2. removing extra white spaces (tabs, new lines, spaces, etc.) and bringing to lower case
    3. tokenizing sentences
    4. removing punctuation
    5. removing stopwords
    6. lemmatizing sentences

    df: DataFrame with a 'text_w_ids' column. It is modified in place: a
        'prep_text' column is added (or overwritten).
    Returns the same DataFrame; each 'prep_text' value is a list of
    processed sentence strings.
    """
    logging.info('Creating a new column...')
    # Missing texts (NaN, e.g. from empty CSV cells) become empty strings so
    # the regex step below does not fail on non-string values.
    df['prep_text'] = df['text_w_ids'].fillna('')

    logging.info('Removing extra text patterns...')
    for pattern in regex_templates.values():
        df['prep_text'] = df['prep_text'].apply(find_regex, args=(pattern,))

    logging.info('Removing extra spaces and bringing text to lower case...')
    df['prep_text'] = df['prep_text'].apply(cleanup_and_to_lower)

    logging.info('Tokenizing text into sentences...')
    df['prep_text'] = df['prep_text'].apply(sent_tokenize)

    logging.info('Removing text punctuation...')
    df['prep_text'] = df['prep_text'].apply(remove_all_punct)

    logging.info('Removing stopwords...')
    df['prep_text'] = df['prep_text'].apply(remove_stopwords)

    logging.info('Performing lemmatization...')
    df['prep_text'] = df['prep_text'].apply(lemmatize_spacy)

    logging.info('Finished')
    return df

### Performing data preparation and exporting results to .csv

In [None]:
# Run the full preparation pipeline. do_data_prep adds the 'prep_text' column
# to the frame it is given and returns that same frame.
df = do_data_prep(df)

In [None]:
# Export the prepared frame to CSV, omitting the index column.
df.to_csv('docs_preprocessed.csv',index=False)