In [1]:
import pandas as pd
import numpy as np

import re
import unicodedata
import nltk
from nltk.corpus import stopwords

import acquire

### Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

- Lowercase everything
- Normalize unicode characters
- Replace anything that is not a letter, number, whitespace or a single quote.

In [2]:
def basic_clean(string):
    
    string = string.lower()
    string = unicodedata.normalize('NFKD', string).encode('ascii', 'ignore').decode('utf-8')
    string = re.sub(r'[^a-z0-9\'\s]', '', string)
    
    return string

### Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [3]:
def tokenize(string):
    
    tokenize = nltk.tokenize.ToktokTokenizer()
    string = tokenize.tokenize(string, return_str=True)
    
    return string

### Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [4]:
def stem(string):
    ps = nltk.porter.PorterStemmer()
    stems = [ps.stem(word) for word in string.split()]
    string = ' '.join(stems)
    
    return string

### Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [5]:
def lemmatize(string):
    
    wnl = nltk.stem.WordNetLemmatizer()
    lemmas = [wnl.lemmatize(word) for word in string.split()]
    string = ' '.join(lemmas)

    return string

### Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords. This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [6]:
list1 = [1,2,3,4,5]
list2 = []

In [7]:
set(list1)

{1, 2, 3, 4, 5}

In [8]:
set(list2)

set()

In [9]:
set(list1) - set(list2)

{1, 2, 3, 4, 5}

In [10]:
numbers = set(list1).union(set(list2))

In [11]:
[numb for numb in numbers]

[1, 2, 3, 4, 5]

In [12]:
def remove_stopwords(string, extra_words=[], exclude_words=[]):
    stopwords_ls = stopwords.words('english')
    
    stopwords_ls = set(stopwords_ls) - set(exclude_words)
    stopwords_ls = stopwords_ls.union(set(extra_words))
    
    words = string.split()
    filtered_words = [word for word in words if word not in stopwords_ls]
    string = ' '.join(filtered_words)
    
    return string

### Use your data from the acquire to produce a dataframe of the news articles. Name the dataframe news_df.

In [13]:
articles = acquire.get_news_articles()

In [14]:
news_df = pd.DataFrame(articles)
news_df

Unnamed: 0,title,content,category
0,Bandhan Bank onboards Sourav Ganguly as brand ...,Bandhan Bank has announced Sourav Ganguly as i...,business
1,Musk is under federal probe over his conduct i...,Twitter has claimed that the world's richest p...,business
2,Layoffs will be the absolute last thing at Zoh...,Software startup Zoho's CEO Sridhar Vembu said...,business
3,Infosys let go of employees working for two co...,Infosys CEO Salil Parekh has revealed that the...,business
4,Centre announces one-time aide for paddy straw...,Centre said it'll provide one-time financial s...,business
...,...,...,...
94,"Was told I did 'Soch', 'Joker' for sympathy; c...",Singer Harrdy Sandhu said that he got messages...,entertainment
95,"When father takes care of child, society blows...","Actress Neha Dhupia said she finds it ""problem...",entertainment
96,"Mind-blowing, a must-watch: Dhanush on Rishab ...",Actor Dhanush took to Twitter to praise Rishab...,entertainment
97,Working with Salman Khan again in 'Tiger 3' is...,Speaking about working with actor Salman Khan ...,entertainment


### Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.

In [15]:

blogs = acquire.get_blog_articles()

In [16]:
codeup_df = pd.DataFrame(blogs)
codeup_df

Unnamed: 0,title,date_published,category,content
0,Diversity Equity and Inclusion Report,"Oct 7, 2022",[Codeup News],\nCodeup is excited to launch our first Divers...
1,Codeup Honored as SABJ Diversity and Inclusion...,"Oct 7, 2022",[Codeup News],\nCodeup has been named the 2022 Diversity and...
2,How Can I Finance My Career Transition?,"Sep 29, 2022","[Cloud Administration, Data Science, Featured,...",\nDeciding to transition into a tech career is...
3,Tips for Women Beginning a Career in Tech,"Sep 23, 2022",[Tips for Prospective Students],"\nCodeup strongly values diversity, and inclus..."
4,What is Cloud Computing and AWS?,"Sep 13, 2022","[Cloud Administration, Tips for Prospective St...",\nWith many companies switching to cloud servi...


### For each dataframe, produce the following columns:

- title to hold the title
- original to hold the original article/post content
- clean to hold the normalized and tokenized original with the stopwords removed.
- stemmed to hold the stemmed version of the cleaned data.
- lemmatized to hold the lemmatized version of the cleaned data.

In [17]:
news_df = news_df.rename(columns={'content':'original'})
news_df.head()

Unnamed: 0,title,original,category
0,Bandhan Bank onboards Sourav Ganguly as brand ...,Bandhan Bank has announced Sourav Ganguly as i...,business
1,Musk is under federal probe over his conduct i...,Twitter has claimed that the world's richest p...,business
2,Layoffs will be the absolute last thing at Zoh...,Software startup Zoho's CEO Sridhar Vembu said...,business
3,Infosys let go of employees working for two co...,Infosys CEO Salil Parekh has revealed that the...,business
4,Centre announces one-time aide for paddy straw...,Centre said it'll provide one-time financial s...,business


In [18]:
news_df = news_df[['title','original']]
news_df.head()

Unnamed: 0,title,original
0,Bandhan Bank onboards Sourav Ganguly as brand ...,Bandhan Bank has announced Sourav Ganguly as i...
1,Musk is under federal probe over his conduct i...,Twitter has claimed that the world's richest p...
2,Layoffs will be the absolute last thing at Zoh...,Software startup Zoho's CEO Sridhar Vembu said...
3,Infosys let go of employees working for two co...,Infosys CEO Salil Parekh has revealed that the...
4,Centre announces one-time aide for paddy straw...,Centre said it'll provide one-time financial s...


In [19]:
news_df['clean'] = news_df.original.apply(basic_clean)
news_df.head()

Unnamed: 0,title,original,clean
0,Bandhan Bank onboards Sourav Ganguly as brand ...,Bandhan Bank has announced Sourav Ganguly as i...,bandhan bank has announced sourav ganguly as i...
1,Musk is under federal probe over his conduct i...,Twitter has claimed that the world's richest p...,twitter has claimed that the world's richest p...
2,Layoffs will be the absolute last thing at Zoh...,Software startup Zoho's CEO Sridhar Vembu said...,software startup zoho's ceo sridhar vembu said...
3,Infosys let go of employees working for two co...,Infosys CEO Salil Parekh has revealed that the...,infosys ceo salil parekh has revealed that the...
4,Centre announces one-time aide for paddy straw...,Centre said it'll provide one-time financial s...,centre said it'll provide onetime financial su...


In [20]:
news_df['stemmed'] = news_df.clean.apply(stem)
news_df['lemmatized'] = news_df.clean.apply(lemmatize)
news_df.head()

LookupError: 
**********************************************************************
  Resource [93mwordnet[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('wordnet')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/wordnet[0m

  Searched in:
    - '/Users/vincentbanuelos/nltk_data'
    - '/opt/homebrew/anaconda3/nltk_data'
    - '/opt/homebrew/anaconda3/share/nltk_data'
    - '/opt/homebrew/anaconda3/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [None]:
def clean_df(df, extra_words=[], exclude_words=[]):
    df = df[['title','original']]
    
    df['clean'] = df.original\
                        .apply(basic_clean)\
                        .apply(tokenize)\
                        .apply(remove_stopwords, 
                                    extra_words=extra_words,
                                    exclude_words=exclude_words)
    df['stemmed'] = df.clean.apply(stem)
    df['lemmatized'] = df.clean.apply(lemmatize)
    
    return df

In [None]:
news_df_new = news_df.rename(columns={'content':'original'})
codeup_df_new = codeup_df.rename(columns={'content':'original'})

In [None]:
clean_df(news_df_new, extra_words=['infosys'])

Unnamed: 0,title,original,clean,stemmed,lemmatized
0,"Infosys' attrition drops to 27.1%, net employe...",Infosys on Thursday reported a 1.3% QoQ drop i...,thursday reported 13 qoq drop voluntary attrit...,thursday report 13 qoq drop voluntari attrit r...,thursday reported 13 qoq drop voluntary attrit...
1,We do not support dual employment: Infosys on ...,Infosys CEO Salil Parekh spoke on the moonligh...,ceo salil parekh spoke moonlighting debate ind...,ceo salil parekh spoke moonlight debat industr...,ceo salil parekh spoke moonlighting debate ind...
2,Mukesh Ambani visits Kedarnath & Badrinath shr...,Reliance Industries Chairman Mukesh Ambani on ...,reliance industries chairman mukesh ambani thu...,relianc industri chairman mukesh ambani thursd...,reliance industry chairman mukesh ambani thurs...
3,Centre announces one-time aide for paddy straw...,Centre said it'll provide one-time financial s...,centre said ' provide onetime financial suppor...,centr said ' provid onetim financi support set...,centre said ' provide onetime financial suppor...
4,"IKEA lays off 10,000 employees after halting R...",Swedish ready-to-assemble furniture retailer I...,swedish readytoassemble furniture retailer ike...,swedish readytoassembl furnitur retail ikea sa...,swedish readytoassemble furniture retailer ike...
...,...,...,...,...,...
95,Wonderful experience to work with Rani Mukerji...,"Recalling his Bollywood debut film 'Aiyyaa', M...",recalling bollywood debut film ' aiyyaa ' mala...,recal bollywood debut film ' aiyyaa ' malayala...,recalling bollywood debut film ' aiyyaa ' mala...
96,"Always wanted to be like Big B, he's still goi...",Actor Ranveer Singh dedicated his Lokmat Mahar...,actor ranveer singh dedicated lokmat maharasht...,actor ranveer singh dedic lokmat maharashtrian...,actor ranveer singh dedicated lokmat maharasht...
97,"'Ponniyin...' beats 'Vikram', becomes highest ...",Mani Ratnam's 'Ponniyin Selvan: I' has beaten ...,mani ratnam ' ' ponniyin selvan ' beaten kamal...,mani ratnam ' ' ponniyin selvan ' beaten kamal...,mani ratnam ' ' ponniyin selvan ' beaten kamal...
98,SC questions need for pre-screening committee ...,The Supreme Court questioned the need for a pr...,supreme court questioned need prescreening com...,suprem court question need prescreen committe ...,supreme court questioned need prescreening com...


In [None]:
clean_df(codeup_df)

Unnamed: 0,title,original,clean,stemmed,lemmatized
0,What are the Math and Stats Principles You Nee...,"Coming into our Data Science program, you will...",coming into our data science program you will ...,come into our data scienc program you will nee...,coming into our data science program you will ...
1,Diversity Equity and Inclusion Report,Codeup is excited to launch our first Diversit...,codeup is excited to launch our first diversit...,codeup is excit to launch our first divers equ...,codeup is excited to launch our first diversit...
2,Tips for Women Beginning a Career in Tech,"Codeup strongly values diversity, and inclusio...",codeup strongly values diversity and inclusion...,codeup strongli valu divers and inclus in hono...,codeup strongly value diversity and inclusion ...
3,What Jobs Can You Get After a Coding Bootcamp?...,If you are interested in embarking on a career...,if you are interested in embarking on a career...,if you are interest in embark on a career in t...,if you are interested in embarking on a career...
4,Why You Should Become a Data Scientist,"What do you look for in a career? Chances are,...",what do you look for in a career chances are y...,what do you look for in a career chanc are you...,what do you look for in a career chance are yo...
5,How Can I Finance My Career Transition?,Deciding to transition into a tech career is a...,deciding to transition into a tech career is a...,decid to transit into a tech career is a big s...,deciding to transition into a tech career is a...


### Ask yourself:

- If your corpus is 493KB, would you prefer to use stemmed or lemmatized text?
- If your corpus is 25MB, would you prefer to use stemmed or lemmatized text?
- If your corpus is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text?