## NLP Scratchpad for Data Preparation

In [1]:
# imports

import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

import acquire

### Exercises
- The end result of this exercise should be a file named prepare.py that defines the requested functions.

- In this exercise we will be defining some functions to prepare textual data. These functions should apply equally well to both the codeup blog articles and the news articles that were previously acquired.

### 1. Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:
- Lowercase everything
- Normalize unicode characters
- Replace anything that is not a letter, number, whitespace or a single quote.

In [2]:
# let's establish some original text to build our function on
original = "Paul Erdős and George Pólya are influential Hungarian mathematicians who contributed \
a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), \
but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"
original

"Paul Erdős and George Pólya are influential Hungarian mathematicians who contributed a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"

In [3]:
#lowercase all letters in the text

article = original.lower()

article

"paul erdős and george pólya are influential hungarian mathematicians who contributed a lot to the field. erdős's name contains the hungarian letter 'ő' ('o' with double acute accent), but is often incorrectly written as erdos or erdös either by mistake or out of typographical necessity"

In [4]:
# Normalizaton: Remove inconsistencies in unicode charater encoding.
# encode the strings into ASCII byte-strings (ignore non-ASCII characters)
# decode the byte-string back into a string

article = unicodedata.normalize('NFKD', article)\
.encode('ascii', 'ignore')\
.decode('utf-8')

article

"paul erdos and george polya are influential hungarian mathematicians who contributed a lot to the field. erdos's name contains the hungarian letter 'o' ('o' with double acute accent), but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

In [5]:
# remove anything that is not a through z, a number, a single quote, or whitespace

article = re.sub(r"[^a-z0-9'\s]", '', article)

article

"paul erdos and george polya are influential hungarian mathematicians who contributed a lot to the field erdos's name contains the hungarian letter 'o' 'o' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

In [6]:
def basic_clean(original):
    '''
    This funtion will take in a single string, 
    - lowercase all of the characters, 
    - normalize unicode characters, 
    - replace anything that is not a letter/number/whitespace/single quote.
    '''
    
    #lowercase all letters in the text
    article = original.lower()
    
    # Normalizaton: Remove inconsistencies in unicode charater encoding.
    # encode the strings into ASCII byte-strings (ignore non-ASCII characters)
    # decode the byte-string back into a string
    article = unicodedata.normalize('NFKD', article)\
    .encode('ascii', 'ignore')\
    .decode('utf-8')
    
    # remove anything that is not a through z, a number, a single quote, or whitespace
    article = re.sub(r"[^a-z0-9'\s]", '', article)
    
    return article

In [7]:
prepped_article = basic_clean(original)

### 2. Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [8]:
# Create the tokenizer
tokenizer = nltk.tokenize.ToktokTokenizer()

# Use the tokenizer
article = tokenizer.tokenize(article, return_str = True)

article

"paul erdos and george polya are influential hungarian mathematicians who contributed a lot to the field erdos ' s name contains the hungarian letter ' o ' ' o ' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

In [9]:
def tokenize(prepped_article):
    '''
    This function takes in the result of my basic_clean function (a single, cleaned string) and tokenizes all the words in the string.
    It returns the tokenized string as a list
    '''
    
    # Create the tokenizer
    tokenizer = nltk.tokenize.ToktokTokenizer()

    # Use the tokenizer
    tokenized_article = tokenizer.tokenize(prepped_article)
    
    return tokenized_article

In [10]:
tokenized_article = tokenize(prepped_article)

### 3. Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [11]:
def stem(tokenized_article):
    '''
    This function will take in a single string, perform a PorterStemmer, and return the stemmed string.
        
    *** This function is set up to run AFTER using the 'tokenize' function. ***
    If the 'tokenize' function has not been called, then we need to use a .split() in the for loop.
    '''
    
    # Create porter stemmer.
    ps = nltk.porter.PorterStemmer()
    
    # Apply the stemmer to each word in our string.
    stems = [ps.stem(word) for word in tokenized_article] # Need to add .split() after tokenized_article if tokenization has not occured
    
    article_stemmed = ' '.join(stems)

    
    return article_stemmed

In [12]:
stem(tokenized_article)

"paul erdo and georg polya are influenti hungarian mathematician who contribut a lot to the field erdo ' s name contain the hungarian letter ' o ' ' o ' with doubl acut accent but is often incorrectli written as erdo or erdo either by mistak or out of typograph necess"

### 4. Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [13]:
# first I need to download 'wordnet'
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/thxmanu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [14]:
def lemmatize(tokenized_article):
    '''
    This function will take in a single string, perform lemmatization, and return the lemmatized string.
    
    *** This function is set up to run AFTER using the 'tokenize' function. ***
    If the 'tokenize' function has not been called, then we need to use a .split() in the for loop.
    '''
    
    
    # Create the Lemmatizer.
    wnl = nltk.stem.WordNetLemmatizer()
    
    # Use the lemmatizer on each word in the list of tokenized words.
    lemmas = [wnl.lemmatize(word) for word in tokenized_article] # Need to add .split() after tokenized_article if tokenization has not occured
    
    
    # Join our list of words into a string again; assign to a variable to save changes.
    article_lemmatized = ' '.join(lemmas)
    
    return article_lemmatized

In [15]:
please_work = lemmatize(tokenized_article)

In [16]:
please_work

"paul erdos and george polya are influential hungarian mathematician who contributed a lot to the field erdos ' s name contains the hungarian letter ' o ' ' o ' with double acute accent but is often incorrectly written a erdos or erdos either by mistake or out of typographical necessity"

### 5. Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.

- This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [17]:
# first let's build the base function, then let's add in the additional arguments

def remove_stopwords(input_string, extra_words=None, exclude_words=None):
    '''
    This function will take in a single string ('input_string') that has already been prepped, remove all stop words, and return the string minus the stopwords.
    - [extra_words] = list of additional words to add to stopword_list; default=None
    - [exclude_words] = list of words to remove from stopword_list, and leave in 'output_string'; default=None
    - each list must be defined outside of the function
    - *** if 'input_string' has not been lemmatized, will need to do so before function can run properly.
    '''
    
    # lemmatize if necessary
#     words = lemmatize(input_string).split()

    # if 'input_string' already lemmatized,
    words = input_string.split()
    
    # define stopwords
    stopword_list = stopwords.words('english')
    
    if extra_words == None:
        stopword_list = stopword_list
        
    else:
        for word in extra_words:
            stopword_list.append(word)
            
    if exclude_words == None:
        stopword_list = stopword_list
    else:
        for word in exclude_words:
            stopword_list.remove(word)
        
    # create a list of words from my string with stopwords removed
    filtered_words = [word for word in words if word not in stopword_list]
    
    # join words in list back into strings
    article_without_stopwords = ' '.join(filtered_words)
    
    return article_without_stopwords

In [18]:
remove = ['influential', 'mathematician']
keep = ['i', 'you']

In [19]:
please_work

"paul erdos and george polya are influential hungarian mathematician who contributed a lot to the field erdos ' s name contains the hungarian letter ' o ' ' o ' with double acute accent but is often incorrectly written a erdos or erdos either by mistake or out of typographical necessity"

### 6. Use your data from the acquire to produce a dataframe of the news articles. Name the dataframe news_df.

In [20]:
# categories of news articles
categories = ["business", "sports", "technology", "entertainment", "science", "world"]

In [21]:
news_df = acquire.get_all_news_articles(categories)

In [22]:
news_df

Unnamed: 0,title,content,category
0,Amazon job posting fuels speculations about pl...,A new job posting by Amazon has fuelled specul...,business
1,China's ex-teacher turned billionaire no more ...,"China's Larry Chen, a former teacher who becam...",business
2,"Musk takes a jibe at rival car companies, says...",Tesla CEO and the world's second-richest perso...,business
3,Mahua Moitra writes to FM to look into 'over-i...,Lok Sabha MP Mahua Moitra has shared a letter ...,business
4,"Unemployment rate rises in both urban, rural a...",India's unemployment rate soared to 7.14% in t...,business
...,...,...,...
142,US offers further air support to Afghan troops...,The US will continue to carry out airstrikes a...,world
143,Afghan Army chief postpones India visit amid T...,Afghan Army chief General Wali Mohammad Ahmadz...,world
144,46 Afghan soldiers flee to Pakistan in retreat...,The Pakistani Army on Monday said that 46 Afgh...,world
145,New Zealand agrees to accept alleged Islamic S...,New Zealand on Monday agreed to repatriate an ...,world


### 7. Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.

In [23]:
url_list = ['https://codeup.com/codeups-data-science-career-accelerator-is-here/', 
           'https://codeup.com/data-science-myths/',
           'https://codeup.com/data-science-vs-data-analytics-whats-the-difference/',
           'https://codeup.com/10-tips-to-crush-it-at-the-sa-tech-job-fair/',
           'https://codeup.com/competitor-bootcamps-are-closing-is-the-model-in-danger/']

In [24]:
codeup_df = acquire.get_blog_articles(url_list)

In [25]:
codeup_df = pd.DataFrame(codeup_df)

In [26]:
codeup_df

Unnamed: 0,title,content
0,Codeup’s Data Science Career Accelerator is Here!,The rumors are true! The time has arrived. Cod...
1,Data Science Myths,By Dimitri Antoniou and Maggie Giust\nData Sci...
2,Data Science VS Data Analytics: What’s The Dif...,"By Dimitri Antoniou\nA week ago, Codeup launch..."
3,10 Tips to Crush It at the SA Tech Job Fair,SA Tech Job Fair\nThe third bi-annual San Anto...
4,Competitor Bootcamps Are Closing. Is the Model...,Competitor Bootcamps Are Closing. Is the Model...


### 8. For each dataframe, produce the following columns:
- title to hold the title
- original to hold the original article/post content
- clean to hold the normalized and tokenized original with the stopwords removed.
- stemmed to hold the stemmed version of the cleaned data.
- lemmatized to hold the lemmatized version of the cleaned data.

### News Articles
- [x] title to hold the title
- [x] original to hold the original article/post content
- [x] clean to hold the normalized and tokenized original with the stopwords removed.
- [x] stemmed to hold the stemmed version of the cleaned data.
- [] lemmatized to hold the lemmatized version of the cleaned data.

In [27]:
# let's see what it looks like...
news_df

Unnamed: 0,title,content,category
0,Amazon job posting fuels speculations about pl...,A new job posting by Amazon has fuelled specul...,business
1,China's ex-teacher turned billionaire no more ...,"China's Larry Chen, a former teacher who becam...",business
2,"Musk takes a jibe at rival car companies, says...",Tesla CEO and the world's second-richest perso...,business
3,Mahua Moitra writes to FM to look into 'over-i...,Lok Sabha MP Mahua Moitra has shared a letter ...,business
4,"Unemployment rate rises in both urban, rural a...",India's unemployment rate soared to 7.14% in t...,business
...,...,...,...
142,US offers further air support to Afghan troops...,The US will continue to carry out airstrikes a...,world
143,Afghan Army chief postpones India visit amid T...,Afghan Army chief General Wali Mohammad Ahmadz...,world
144,46 Afghan soldiers flee to Pakistan in retreat...,The Pakistani Army on Monday said that 46 Afgh...,world
145,New Zealand agrees to accept alleged Islamic S...,New Zealand on Monday agreed to repatriate an ...,world


In [28]:
def create_norm_list(df, target):
    '''
    This function takes in a pandas DataFrame, and a target variable column. 
    It returns a list of the target variable that has been normalized.
    '''

    norm_list = []
    for i in df[target]:
        norm_list.append(basic_clean(i))
    
    return norm_list

In [29]:
def token_norm_list(df, target):
    '''
    This function takes in a pandas DataFrame, and a target variable column, normalizes it, and tokenizes it.
    It returns a list of the target variable that has been normalized, and tokenized.
    '''
    norm_list = create_norm_list(df=df, target=target)
    norm_token_list = []
    for i in norm_list:
        norm_token_list.append(tokenize(i))
        
    return norm_token_list

In [40]:
def easy_stop_remove(input_string):
    '''
 
    '''
    
    # define stopwords
    stopword_list = stopwords.words('english')
        
    # create a list of words from my string with stopwords removed
    filtered_words = [word for word in input_string if word not in stopword_list]
    
    # join words in list back into strings
    without_stops = ' '.join(filtered_words)
    
    return without_stops

In [46]:
def normalize_col(df, target):
    '''
    This function takes in a pandas DataFrame, and a target variable column. 
    It calls upon my create_norm_list, and token_norm_list functions to first normalize, then tokenize the target variable.
    It then removes stop words, and creates a new column in the DataFrame. Returns a pandas DataFrame.
    '''
    norm_token_list = token_norm_list(df, target)
    
    norm_tok_stop_list = []
    for i in norm_token_list:
        norm_tok_stop_list.append(easy_stop_remove(i))
    
#     df[f'{target}_normalized'] = norm_tok_stop_list
    
    return norm_tok_stop_list

In [48]:
def stem_col(df, target):
    norm_tok_stop_list = normalize_col(df, target)
    
    stem_list = []
    for i in norm_tok_stop_list:
        stem_list.append(stem(i))
        
    df[f'{target}_stemmed'] = stem_list
    
    return stem_list

In [49]:
def lem_col(df, target):
    norm_tok_stop_list = normalize_col(df, target)
    
    lem_list = []
    for i in norm_tok_stop_list:
        lem_list.append(lemmatize(i))
        
    df[f'{target}_lemmatized'] = lem_list
    
    return lem_list
    

In [50]:
def add_the_cols(df, target):
    df[f'{target}_normalized'] = normalize_col(df, target)
    df[f'{target}_stemmed'] = stem_col(df, target)
    df[f'{target}_lemmatized'] = lem_col(df, target)
    
    return df

In [51]:
add_the_cols(news_df, 'content')

Unnamed: 0,title,content,category,content_normalized,content_stemmed,content_lemmatized
0,Amazon job posting fuels speculations about pl...,A new job posting by Amazon has fuelled specul...,business,new job posting amazon fuelled speculations ec...,n e w j o b p o s t i n g a m a z o n ...,n e w j o b p o s t i n g a m a z o n ...
1,China's ex-teacher turned billionaire no more ...,"China's Larry Chen, a former teacher who becam...",business,china ' larry chen former teacher became billi...,c h i n a ' l a r r y c h e n f o r m ...,c h i n a ' l a r r y c h e n f o r m ...
2,"Musk takes a jibe at rival car companies, says...",Tesla CEO and the world's second-richest perso...,business,tesla ceo world ' secondrichest person elon mu...,t e s l a c e o w o r l d ' s e c o n ...,t e s l a c e o w o r l d ' s e c o n ...
3,Mahua Moitra writes to FM to look into 'over-i...,Lok Sabha MP Mahua Moitra has shared a letter ...,business,lok sabha mp mahua moitra shared letter wrote ...,l o k s a b h a m p m a h u a m o i t ...,l o k s a b h a m p m a h u a m o i t ...
4,"Unemployment rate rises in both urban, rural a...",India's unemployment rate soared to 7.14% in t...,business,india ' unemployment rate soared 714 week endi...,i n d i a ' u n e m p l o y m e n t r a ...,i n d i a ' u n e m p l o y m e n t r a ...
...,...,...,...,...,...,...
142,US offers further air support to Afghan troops...,The US will continue to carry out airstrikes a...,world,us continue carry airstrikes taliban support a...,u s c o n t i n u e c a r r y a i r s t ...,u s c o n t i n u e c a r r y a i r s t ...
143,Afghan Army chief postpones India visit amid T...,Afghan Army chief General Wali Mohammad Ahmadz...,world,afghan army chief general wali mohammad ahmadz...,a f g h a n a r m y c h i e f g e n e r ...,a f g h a n a r m y c h i e f g e n e r ...
144,46 Afghan soldiers flee to Pakistan in retreat...,The Pakistani Army on Monday said that 46 Afgh...,world,pakistani army monday said 46 afghan soldiers ...,p a k i s t a n i a r m y m o n d a y s ...,p a k i s t a n i a r m y m o n d a y s ...
145,New Zealand agrees to accept alleged Islamic S...,New Zealand on Monday agreed to repatriate an ...,world,new zealand monday agreed repatriate alleged i...,n e w z e a l a n d m o n d a y a g r e ...,n e w z e a l a n d m o n d a y a g r e ...
