## NLP Scratchpad for Data Preparation

In [1]:
# imports

import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

import acquire

### Exercises
- The end result of this exercise should be a file named prepare.py that defines the requested functions.

- In this exercise we will be defining some functions to prepare textual data. These functions should apply equally well to both the codeup blog articles and the news articles that were previously acquired.

### 1. Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:
- Lowercase everything
- Normalize unicode characters
- Replace anything that is not a letter, number, whitespace or a single quote.

In [2]:
# let's establish some original text to build our function on
original = "Paul Erdős and George Pólya are influential Hungarian mathematicians who contributed \
a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), \
but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"
original

"Paul Erdős and George Pólya are influential Hungarian mathematicians who contributed a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"

In [3]:
#lowercase all letters in the text

article = original.lower()

article

"paul erdős and george pólya are influential hungarian mathematicians who contributed a lot to the field. erdős's name contains the hungarian letter 'ő' ('o' with double acute accent), but is often incorrectly written as erdos or erdös either by mistake or out of typographical necessity"

In [4]:
# Normalizaton: Remove inconsistencies in unicode charater encoding.
# encode the strings into ASCII byte-strings (ignore non-ASCII characters)
# decode the byte-string back into a string

article = unicodedata.normalize('NFKD', article)\
.encode('ascii', 'ignore')\
.decode('utf-8')

article

"paul erdos and george polya are influential hungarian mathematicians who contributed a lot to the field. erdos's name contains the hungarian letter 'o' ('o' with double acute accent), but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

In [5]:
# remove anything that is not a through z, a number, a single quote, or whitespace

article = re.sub(r"[^a-z0-9'\s]", '', article)

article

"paul erdos and george polya are influential hungarian mathematicians who contributed a lot to the field erdos's name contains the hungarian letter 'o' 'o' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

In [6]:
def basic_clean(string):
    '''
    This funtion will take in a single string, 
    - lowercase all of the characters, 
    - normalize unicode characters, 
    - replace anything that is not a letter/number/whitespace/single quote.
    '''
    
    #lowercase all letters in the text
    string = string.lower()
    
    # Normalizaton: Remove inconsistencies in unicode charater encoding.
    # encode the strings into ASCII byte-strings (ignore non-ASCII characters)
    # decode the byte-string back into a string
    string = unicodedata.normalize('NFKD', string)\
    .encode('ascii', 'ignore')\
    .decode('utf-8', 'ignore')
    
    # remove anything that is not a through z, a number, a single quote, or whitespace
    string = re.sub(r'[^\w\s]', '', string)
    
    return string

In [7]:
prepped_article = basic_clean(original)

### 2. Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [8]:
# Create the tokenizer
tokenizer = nltk.tokenize.ToktokTokenizer()

# Use the tokenizer
article = tokenizer.tokenize(article, return_str = True)

article

"paul erdos and george polya are influential hungarian mathematicians who contributed a lot to the field erdos ' s name contains the hungarian letter ' o ' ' o ' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

In [9]:
def tokenize(string):
    '''
    This function takes in the result of my basic_clean function (a single, cleaned string) and tokenizes all the words in the string.
    It returns the tokenized string as a list
    '''
    
    # Create the tokenizer
    tokenizer = nltk.tokenize.ToktokTokenizer()

    # Use the tokenizer
    string = tokenizer.tokenize(string, return_str=True)
    
    return string

### 3. Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [10]:
def stem(string):
    '''
    This function will take in a single string, perform a PorterStemmer, and return the stemmed string.
    '''
    
    # Create porter stemmer.
    ps = nltk.porter.PorterStemmer()
    
    # Apply the stemmer to each word in our string.
    stems = [ps.stem(word) for word in string.split()]
    
    string = ' '.join(stems)

    
    return string

### 4. Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [11]:
# first I need to download 'wordnet'
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/thxmanu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [12]:
def lemmatize(string):
    '''
    This function will take in a single string, perform lemmatization, and return the lemmatized string.
    '''
    
    
    # Create the Lemmatizer.
    wnl = nltk.stem.WordNetLemmatizer()
    
    # Use the lemmatizer on each word in the list of tokenized words.
    lemmas = [wnl.lemmatize(word) for word in string.split()] 
    
    # Join our list of words into a string again; assign to a variable to save changes.
    string = ' '.join(lemmas)
    
    return string

### 5. Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.

- This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [13]:
# first let's build the base function, then let's add in the additional arguments

def remove_stopwords(string, extra_words=[], exclude_words=[]):
    '''
    This function will take in a single string ('input_string') that has already been prepped, 
    remove all stop words, and return the string minus the stopwords.
    '''

    # define stopwords
    stopword_list = stopwords.words('english')
    
    # Remove 'exclude_words' from stopword_list to keep in my text.
    stopword_list = set(stopword_list) - set(exclude_words)
    
    # Add 'extra_words' to stopword_list
    stopword_list = stopword_list.union(set(extra_words))
    
    # split words in string
    words = string.split()
        
    # create a list of words from my string with stopwords removed
    filtered_words = [word for word in words if word not in stopword_list]
    
    # join words in list back into strings
    string_without_stopwords = ' '.join(filtered_words)
    
    return string_without_stopwords

### 6. Use your data from the acquire to produce a dataframe of the news articles. Name the dataframe news_df.

In [14]:
# categories of news articles
categories = ["business", "sports", "technology", "entertainment", "science", "world"]

In [15]:
news_df = acquire.get_all_news_articles(categories)
news_df.head()

Unnamed: 0,title,content,category
0,"Reliance Industries vaccinates 98% of workers,...",Reliance Industries has said in a statement th...,business
1,Speculation around our plans for crypto not tr...,Amazon on Monday denied speculations that it w...,business
2,"Musk criticises Apple's 'walled garden', cobal...",Tesla's billionaire CEO Elon Musk criticised A...,business
3,I will most likely not be on future earnings c...,Tesla CEO and the world's second-richest perso...,business
4,Factually incorrect: INOX on report of Amazon ...,INOX Leisure denied a report that claimed Amaz...,business


### 7. Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.

In [16]:
url_list = ['https://codeup.com/codeups-data-science-career-accelerator-is-here/', 
           'https://codeup.com/data-science-myths/',
           'https://codeup.com/data-science-vs-data-analytics-whats-the-difference/',
           'https://codeup.com/10-tips-to-crush-it-at-the-sa-tech-job-fair/',
           'https://codeup.com/competitor-bootcamps-are-closing-is-the-model-in-danger/']

In [17]:
codeup_df = acquire.get_blog_articles(url_list)

In [18]:
codeup_df = pd.DataFrame(codeup_df)

In [19]:
codeup_df

Unnamed: 0,title,content
0,Codeup’s Data Science Career Accelerator is Here!,The rumors are true! The time has arrived. Cod...
1,Data Science Myths,By Dimitri Antoniou and Maggie Giust\nData Sci...
2,Data Science VS Data Analytics: What’s The Dif...,"By Dimitri Antoniou\nA week ago, Codeup launch..."
3,10 Tips to Crush It at the SA Tech Job Fair,SA Tech Job Fair\nThe third bi-annual San Anto...
4,Competitor Bootcamps Are Closing. Is the Model...,Competitor Bootcamps Are Closing. Is the Model...


### 8. For each dataframe, produce the following columns:
- title to hold the title
- original to hold the original article/post content
- clean to hold the normalized and tokenized original with the stopwords removed.
- stemmed to hold the stemmed version of the cleaned data.
- lemmatized to hold the lemmatized version of the cleaned data.

### News Articles
- [x] title to hold the title
- [x] original to hold the original article/post content
- [x] clean to hold the normalized and tokenized original with the stopwords removed.
- [x] stemmed to hold the stemmed version of the cleaned data.
- [] lemmatized to hold the lemmatized version of the cleaned data.

In [20]:
# let's see what it looks like...
news_df

Unnamed: 0,title,content,category
0,"Reliance Industries vaccinates 98% of workers,...",Reliance Industries has said in a statement th...,business
1,Speculation around our plans for crypto not tr...,Amazon on Monday denied speculations that it w...,business
2,"Musk criticises Apple's 'walled garden', cobal...",Tesla's billionaire CEO Elon Musk criticised A...,business
3,I will most likely not be on future earnings c...,Tesla CEO and the world's second-richest perso...,business
4,Factually incorrect: INOX on report of Amazon ...,INOX Leisure denied a report that claimed Amaz...,business
...,...,...,...
143,6 killed after rains trigger landslides in ref...,Bangladeshi officials on Tuesday said that at ...,world
144,China's claims in South China Sea have no basi...,China's territorial claims in the South China ...,world
145,Myanmar military cancels 2020 polls results wo...,Myanmar's military government on Monday cancel...,world
146,Cuba says its embassy in France targeted with ...,Cuba on Tuesday said that its embassy in Franc...,world


In [23]:
def prep_article_data(df, column, extra_words=[], exclude_words=[]):
    '''
    This function take in a df and the string name for a text column with 
    option to pass lists for extra_words and exclude_words and
    returns a df with the text article title, original text, stemmed text,
    lemmatized text, cleaned, tokenized, & lemmatized text with stopwords removed.
    '''
    df[f'cleaned_{column}'] = df[column].apply(basic_clean)\
                            .apply(tokenize)\
                            .apply(remove_stopwords, 
                                   extra_words=extra_words, 
                                   exclude_words=exclude_words)
    
    df[f'stemmed_{column}'] = df[column].apply(basic_clean)\
                            .apply(tokenize)\
                            .apply(stem)\
                            .apply(remove_stopwords, 
                                   extra_words=extra_words, 
                                   exclude_words=exclude_words)
    
    df[f'lemmatized_{column}'] = df[column].apply(basic_clean)\
                            .apply(tokenize)\
                            .apply(lemmatize)\
                            .apply(remove_stopwords, 
                                   extra_words=extra_words, 
                                   exclude_words=exclude_words)
    
    return df

In [24]:
prep_article_data(news_df, 'content')

Unnamed: 0,title,content,category,cleaned_content,stemmed_content,lemmatized_{column},lemmatized_content
0,"Reliance Industries vaccinates 98% of workers,...",Reliance Industries has said in a statement th...,business,reliance industries said statement 98 workers ...,relianc industri ha said statement 98 worker r...,reliance industry ha said statement 98 worker ...,reliance industry ha said statement 98 worker ...
1,Speculation around our plans for crypto not tr...,Amazon on Monday denied speculations that it w...,business,amazon monday denied speculations looking acce...,amazon monday deni specul wa look accept bitco...,amazon monday denied speculation wa looking ac...,amazon monday denied speculation wa looking ac...
2,"Musk criticises Apple's 'walled garden', cobal...",Tesla's billionaire CEO Elon Musk criticised A...,business,teslas billionaire ceo elon musk criticised ap...,tesla billionair ceo elon musk criticis appl t...,tesla billionaire ceo elon musk criticised app...,tesla billionaire ceo elon musk criticised app...
3,I will most likely not be on future earnings c...,Tesla CEO and the world's second-richest perso...,business,tesla ceo worlds secondrichest person elon mus...,tesla ceo world secondrichest person elon musk...,tesla ceo world secondrichest person elon musk...,tesla ceo world secondrichest person elon musk...
4,Factually incorrect: INOX on report of Amazon ...,INOX Leisure denied a report that claimed Amaz...,business,inox leisure denied report claimed amazon indi...,inox leisur deni report claim amazon india dis...,inox leisure denied report claimed amazon indi...,inox leisure denied report claimed amazon indi...
...,...,...,...,...,...,...,...
143,6 killed after rains trigger landslides in ref...,Bangladeshi officials on Tuesday said that at ...,world,bangladeshi officials tuesday said least six p...,bangladeshi offici tuesday said least six peop...,bangladeshi official tuesday said least six pe...,bangladeshi official tuesday said least six pe...
144,China's claims in South China Sea have no basi...,China's territorial claims in the South China ...,world,chinas territorial claims south china sea basi...,china territori claim south china sea basi int...,china territorial claim south china sea basis ...,china territorial claim south china sea basis ...
145,Myanmar military cancels 2020 polls results wo...,Myanmar's military government on Monday cancel...,world,myanmars military government monday cancelled ...,myanmar militari govern monday cancel result 2...,myanmar military government monday cancelled r...,myanmar military government monday cancelled r...
146,Cuba says its embassy in France targeted with ...,Cuba on Tuesday said that its embassy in Franc...,world,cuba tuesday said embassy france attacked petr...,cuba tuesday said embassi franc attack petrol ...,cuba tuesday said embassy france attacked petr...,cuba tuesday said embassy france attacked petr...
