In [1]:
import pandas as pd

import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
import acquire

In [2]:
# clean the text, remove text that is not condition sets in the exercise
# lower all text letters
def basic_clean(text):
    text = text.lower()
    text = (unicodedata.normalize('NFKD', text)
                         .encode('ascii', 'ignore')
                         .decode('utf-8', 'ignore')
             )
    text = re.sub(r"[^a-z0-9'\s]", '', text)
    return text

In [3]:
# tokenize the text
def tokenize(text):
    """Run ``text`` through NLTK's Toktok tokenizer and return the result.

    ``return_str=True`` is passed so the tokenizer hands back one string
    rather than a list of tokens.
    """
    toktok = ToktokTokenizer()
    tokenized_text = toktok.tokenize(text, return_str=True)
    return tokenized_text

In [4]:
# stem every word of the text
def stem(text):
    """Return ``text`` with each whitespace-separated word Porter-stemmed.

    The words are re-joined with single spaces, so the original spacing
    between words is not preserved.
    """
    porter = nltk.porter.PorterStemmer()
    return ' '.join(map(porter.stem, text.split()))

In [5]:
# lemmatize the text
def lemma(text):
    """Return ``text`` with each whitespace-separated word lemmatized.

    Uses NLTK's ``WordNetLemmatizer`` with its default arguments and re-joins
    the results with single spaces.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    lemmatized_words = []
    for token in text.split():
        lemmatized_words.append(lemmatizer.lemmatize(token))
    return ' '.join(lemmatized_words)

In [6]:
# remove the stop words
def remove_stopwords(text, extra_words=None, exclude_words=None):
    """Return ``text`` with English stop words removed.

    Parameters
    ----------
    text : str
        Text to filter; it is split on whitespace.
    extra_words : iterable of str, optional
        Additional words to treat as stop words.
    exclude_words : iterable of str, optional
        Words to keep even though they are stop words. Entries that are not
        stop words are ignored instead of raising ``ValueError``.

    Returns
    -------
    str
        The remaining words joined with single spaces.
    """
    # A set gives O(1) membership tests and makes add/remove tolerant of
    # duplicates and of words that are not in the list at all.
    stopword_set = set(stopwords.words('english'))
    stopword_set.update(extra_words or ())
    # Exclusions are applied after additions, so a word present in both
    # arguments ends up kept in the text.
    stopword_set.difference_update(exclude_words or ())

    return ' '.join(word for word in text.split() if word not in stopword_set)

In [7]:
def prepare_data(df, column):
    """Add cleaned, stemmed and lemmatized versions of a text column to ``df``.

    The text in ``df[column]`` is passed through ``basic_clean``, ``tokenize``
    and ``remove_stopwords`` (with its default arguments). Four columns are
    then written to ``df`` itself:

    - ``clean``      : the cleaned, tokenized, stop-word-free text
    - ``original``   : a copy of ``df[column]``
    - ``stemmed``    : ``stem`` applied to ``clean``
    - ``lemmatized`` : ``lemma`` applied to ``clean``

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; it is modified in place.
    column : str
        Name of the column holding the raw text.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenient display or chaining.
    """
    clean_text = (df[column].apply(basic_clean)
                            .apply(tokenize)
                            .apply(remove_stopwords))

    df['clean'] = clean_text
    df['original'] = df[column].copy()
    df['stemmed'] = clean_text.apply(stem)
    df['lemmatized'] = clean_text.apply(lemma)
    return df

In [8]:
# Load the news articles through acquire.get_news_articles(); the displayed
# result further down has title, content and category columns.
news_df=acquire.get_news_articles()

In [9]:
# Display the loaded news data to inspect its columns and contents.
news_df

Unnamed: 0,title,content,category
0,Business News: World and India Business News i...,[Red Bull Co-founder Dietrich Mateschitz has p...,business
1,Business News: World and India Business News i...,[Bengaluru-based IT services firm Happiest Min...,business
2,Business News: World and India Business News i...,[Reacting to a report on Unacademy's losses ju...,business
3,Business News: World and India Business News i...,"[After a report, while citing employees, calle...",business
4,Business News: World and India Business News i...,[Ola CEO Bhavish Aggarwal at the company's rec...,business
5,Business News: World and India Business News i...,"[A plane carrying businessman Rainer Schaller,...",business
6,Business News: World and India Business News i...,"[N Ranga Rao, the Founder of Cycle Brand Agarb...",business
7,Business News: World and India Business News i...,[Taking to Twitter to congratulate Virat Kohli...,business
8,Business News: World and India Business News i...,[Google has removed 16 apps from Play Store wh...,business
9,Business News: World and India Business News i...,[Netflix has decided to open its fifth gaming ...,business


In [10]:
# Build the clean / original / stemmed / lemmatized columns from 'content'
# (prepare_data writes them onto news_df in place and returns it).
# NOTE(review): the recorded output for this cell is a TypeError from
# unicodedata.normalize receiving a list, i.e. 'content' held lists, not str.
prepare_data(news_df,'content')

TypeError: normalize() argument 2 must be str, not list

In [None]:
# Load the blog articles through acquire.get_blog_articles().
# NOTE(review): presumably this has a 'content' column like news_df, since it is
# passed to prepare_data(..., 'content') below — confirm against acquire.
codeup_df=acquire.get_blog_articles()

In [None]:
# Display the loaded blog data to inspect its columns and contents.
codeup_df

In [None]:
# Build the clean / original / stemmed / lemmatized columns from 'content'
# (prepare_data writes them onto codeup_df in place and returns it).
prepare_data(codeup_df,'content')