In [3]:
import pandas as pd
import re
import unicodedata
import numpy as np
import nltk
import acquire

In [2]:
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /Users/olivertaylor/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /Users/olivertaylor/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /Users/olivertaylor/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /Users/olivertaylor/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /Users/olivertaylor/nltk_data...
[nltk_data]    |   Unzipping grammars/basque_grammars.zip.
[nltk_data]    | Downloading package bcp47 to
[nltk_data]    |     /U

True

In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/olivertaylor/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
#function to clean up the article text
def basic_clean(article):
    article = article.lower()
    article = unicodedata.normalize('NFKD', article).encode('ascii', 'ignore').decode('utf-8', 'ignore') #normalize unicode, get rid of weird characters, and then re-encode (turn back into string)
    article = re.sub(r"[^a-z0-9'\s]", '', article)
    return article


In [12]:
#function to tokenize the article
def tokenize(article):
    tokenizer = nltk.tokenize.ToktokTokenizer() # Create tokenizer
    article = tokenizer.tokenize(article, return_str=True) # Use tokenizer
    return article

In [13]:
#function to stem the words in the article
def stem(article):
    ps = nltk.porter.PorterStemmer() # Create stemmer
    stems = [ps.stem(word) for word in article.split()] # loops to Stem each word in the article after splitting
    article_stemmed = ' '.join(stems) # joins the stemmed words back into a string
    return article_stemmed

In [14]:
def lemmatize(string):
    
    wnl = nltk.stem.WordNetLemmatizer() #creating my lemmatizer
    lemmas = [wnl.lemmatize(word) for word in string.split()] #splitting my string into words and applying the lemma
    string = ' '.join(lemmas) #joining back into one string

    return string

In [15]:
#function to remove stopwords
def remove_stopwords(article, extra_words=[], exclude_words=[]):
    # Tokenize the article
    article = tokenize(article)
    words = article.split()
    stopword_list = nltk.corpus.stopwords.words('english')
    # Remove the excluded words from the stopword list
    stopword_list = set(stopword_list) - set(exclude_words)
    # Add in the user specified extra words
    stopword_list = stopword_list.union(set(extra_words))
    filtered_words = [w for w in words if w not in stopword_list]
    article_without_stopwords = ' '.join(filtered_words)
    return article_without_stopwords

In [4]:
topics_list= ['business', 'sports', 'technology', 'entertainment']

In [5]:
#function to get article
articles = acquire.get_news_articles(topics_list)

In [6]:
#change into a dataframe
news_df = pd.DataFrame(articles)
news_df

Unnamed: 0,category,title,content
0,business,"Sreenivasa, who worked at Microsoft & Apple, t...",Google is set to appoint Sreenivasa Reddy as i...
1,business,"What is 'Threads', Meta's Twitter rival?",Mark Zuckerberg-led Meta is set to launch 'Thr...
2,business,China curbs export of key computer chip materials,China has imposed curbs on exports of some pro...
3,business,World's first ammonia-powered car engine unvei...,China's state-owned automobile manufacturer GA...
4,business,Green chilli prices soar to ₹400/kg,The prices of green chilli and ginger have soa...
...,...,...,...
94,entertainment,"CBFC orders 21 cuts in Diljit, Arjun Rampal's ...",CBFC has reportedly granted A certificate and ...
95,entertainment,K3G and DDLJ have spoilt Karwa Chauth for men...,Kajol said that Kabhi Khushi Kabhie Gham and D...
96,entertainment,Was tired but satisfied: Alia on doing 'Tum......,"Alia Bhatt, when asked about her experience of..."
97,entertainment,"Neetu Kapoor shares childhood pic of Kareena, ...",Actress Neetu Kapoor took to Instagram to shar...


In [7]:
blogs = acquire.get_blog_articles()

In [8]:
codeup_df = pd.DataFrame(blogs)
codeup_df

Unnamed: 0,title,link,date_published,content
0,Spotlight on APIDA Voices: Celebrating Heritag...,https://codeup.com/featured/apida-heritage-month/,"May 24, 2023",\nMay is traditionally known as Asian American...
1,Women in tech: Panelist Spotlight – Magdalena ...,https://codeup.com/featured/women-in-tech-pane...,"Mar 28, 2023",\nWomen in tech: Panelist Spotlight – Magdalen...
2,Women in tech: Panelist Spotlight – Rachel Rob...,https://codeup.com/featured/women-in-tech-rach...,"Mar 20, 2023",\nWomen in tech: Panelist Spotlight – Rachel R...
3,Women in Tech: Panelist Spotlight – Sarah Mellor,https://codeup.com/codeup-news/women-in-tech-p...,"Mar 13, 2023",\nWomen in tech: Panelist Spotlight – Sarah Me...
4,Women in Tech: Panelist Spotlight – Madeleine ...,https://codeup.com/events/women-in-tech-madele...,"Mar 6, 2023",\nWomen in tech: Panelist Spotlight – Madelein...
5,Black Excellence in Tech: Panelist Spotlight –...,https://codeup.com/codeup-news/panelist-spotli...,"Feb 16, 2023",\nBlack excellence in tech: Panelist Spotlight...


In [9]:
news_df = news_df.rename(columns={'content':'original'}).drop(columns='category')
news_df.head()

Unnamed: 0,title,original
0,"Sreenivasa, who worked at Microsoft & Apple, t...",Google is set to appoint Sreenivasa Reddy as i...
1,"What is 'Threads', Meta's Twitter rival?",Mark Zuckerberg-led Meta is set to launch 'Thr...
2,China curbs export of key computer chip materials,China has imposed curbs on exports of some pro...
3,World's first ammonia-powered car engine unvei...,China's state-owned automobile manufacturer GA...
4,Green chilli prices soar to ₹400/kg,The prices of green chilli and ginger have soa...


In [16]:
news_df['clean'] = news_df.original.apply(basic_clean).apply(tokenize).apply(remove_stopwords)


In [17]:
news_df['stem'] = news_df.clean.apply(stem)
news_df['lemma'] = news_df.clean.apply(lemmatize)

In [18]:
news_df

Unnamed: 0,title,original,clean,stem,lemma
0,"Sreenivasa, who worked at Microsoft & Apple, t...",Google is set to appoint Sreenivasa Reddy as i...,google set appoint sreenivasa reddy top govern...,googl set appoint sreenivasa reddi top govern ...,google set appoint sreenivasa reddy top govern...
1,"What is 'Threads', Meta's Twitter rival?",Mark Zuckerberg-led Meta is set to launch 'Thr...,mark zuckerbergled meta set launch ' threads '...,mark zuckerbergl meta set launch ' thread ' ju...,mark zuckerbergled meta set launch ' thread ' ...
2,China curbs export of key computer chip materials,China has imposed curbs on exports of some pro...,china imposed curbs exports products made gall...,china impos curb export product made gallium g...,china imposed curb export product made gallium...
3,World's first ammonia-powered car engine unvei...,China's state-owned automobile manufacturer GA...,china ' stateowned automobile manufacturer gac...,china ' stateown automobil manufactur gac said...,china ' stateowned automobile manufacturer gac...
4,Green chilli prices soar to ₹400/kg,The prices of green chilli and ginger have soa...,prices green chilli ginger soared nearly 400 p...,price green chilli ginger soar nearli 400 per ...,price green chilli ginger soared nearly 400 pe...
...,...,...,...,...,...
94,"CBFC orders 21 cuts in Diljit, Arjun Rampal's ...",CBFC has reportedly granted A certificate and ...,cbfc reportedly granted certificate ordered 21...,cbfc reportedli grant certif order 21 cut film...,cbfc reportedly granted certificate ordered 21...
95,K3G and DDLJ have spoilt Karwa Chauth for men...,Kajol said that Kabhi Khushi Kabhie Gham and D...,kajol said kabhi khushi kabhie gham dilwale du...,kajol said kabhi khushi kabhi gham dilwal dulh...,kajol said kabhi khushi kabhie gham dilwale du...
96,Was tired but satisfied: Alia on doing 'Tum......,"Alia Bhatt, when asked about her experience of...",alia bhatt asked experience shooting ' tum kya...,alia bhatt ask experi shoot ' tum kya mile ' s...,alia bhatt asked experience shooting ' tum kya...
97,"Neetu Kapoor shares childhood pic of Kareena, ...",Actress Neetu Kapoor took to Instagram to shar...,actress neetu kapoor took instagram share chil...,actress neetu kapoor took instagram share chil...,actress neetu kapoor took instagram share chil...
