In [1]:
import pandas as pd
import numpy as np
import unicodedata
import re
import json
import acquire as a
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

In [2]:
# Sample sentence containing accented characters; it is fed through each
# preparation function in the cells that follow.
text = (
    "Paul Erdős and George Pólya were influential Hungarian mathematicians who contributed "
    "a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), "
    "but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"
)

# 1.  Lowercase function

In [3]:
def basic_clean(words):
    """Return ``words`` converted to lowercase.

    Only the case is changed; no accent stripping or punctuation removal
    is performed here.
    """
    return words.lower()

In [4]:
# Overwrite `text` with its lowercased form and display the result.
text = basic_clean(text)
text

"paul erdős and george pólya were influential hungarian mathematicians who contributed a lot to the field. erdős's name contains the hungarian letter 'ő' ('o' with double acute accent), but is often incorrectly written as erdos or erdös either by mistake or out of typographical necessity"

# 2. Tokenize

In [5]:
def tokenize(words):
    """Tokenize ``words`` with NLTK's ToktokTokenizer.

    ``return_str=True`` is passed to the tokenizer so the result comes back
    as a single string rather than a list of tokens.
    """
    toktok = nltk.tokenize.ToktokTokenizer()
    return toktok.tokenize(words, return_str=True)

In [6]:
# Overwrite `text` with the tokenized string and display the result.
text = tokenize(text)
text

"paul erdős and george pólya were influential hungarian mathematicians who contributed a lot to the field. erdős ' s name contains the hungarian letter ' ő ' ( ' o ' with double acute accent ) , but is often incorrectly written as erdos or erdös either by mistake or out of typographical necessity"

# 3. Stemming

In [7]:
def stem(words):
    """Stem every whitespace-separated token of ``words`` with the Porter stemmer.

    The stemmed tokens are joined back together with single spaces and
    returned as one string.
    """
    porter = nltk.porter.PorterStemmer()
    return ' '.join(porter.stem(token) for token in words.split())

In [8]:
# Overwrite `text` with the stemmed string and display the result.
text = stem(text)
text

"paul erdő and georg pólya were influenti hungarian mathematician who contribut a lot to the field. erdő ' s name contain the hungarian letter ' ő ' ( ' o ' with doubl acut accent ) , but is often incorrectli written as erdo or erdö either by mistak or out of typograph necess"

# 4. Lemmatize

In [9]:
def lemmatize(words):
    """Lemmatize every whitespace-separated token of ``words`` with WordNet.

    Each token is passed to ``WordNetLemmatizer.lemmatize`` on its own; the
    resulting lemmas are joined with single spaces and returned as one string.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(token) for token in words.split())

In [10]:
# NOTE(review): `text` was already stemmed in the previous step, so the
# lemmatizer is applied to stems here rather than to the original words.
text = lemmatize(text)
text

"paul erdő and georg pólya were influenti hungarian mathematician who contribut a lot to the field. erdő ' s name contain the hungarian letter ' ő ' ( ' o ' with doubl acut accent ) , but is often incorrectli written a erdo or erdö either by mistak or out of typograph necess"

# 5. Remove Stopwords

In [11]:
def remove_stopwords(x):
    """Remove NLTK's English stopwords from a whitespace-separated string.

    NOTE(review): a later cell defines ``remove_stopwords`` again with
    ``extra_words`` / ``exclude_words`` options; on a top-to-bottom run that
    later definition replaces this one.

    Parameters
    ----------
    x : str
        Text to filter. It is split on whitespace and tokens are compared
        to the stopword list exactly as written (no case folding here).

    Returns
    -------
    str
        The remaining tokens joined with single spaces.
    """
    # A set gives constant-time membership checks instead of scanning the
    # whole stopword list once per token.
    stopword_set = set(stopwords.words('english'))

    return ' '.join(word for word in x.split() if word not in stopword_set)

In [12]:
def remove_stopwords(x, extra_words=None, exclude_words=None):
    """Remove stopwords from a whitespace-separated string.

    The stopword set starts from NLTK's English list, then ``exclude_words``
    are taken out of it and ``extra_words`` are added to it, in that order
    (so a word present in both arguments ends up being removed from the text).

    Parameters
    ----------
    x : str
        Text to filter. It is split on whitespace and tokens are compared
        to the stopword set exactly as written (no case folding here).
    extra_words : iterable of str, optional
        Additional words to treat as stopwords. Defaults to none.
    exclude_words : iterable of str, optional
        Words to keep even though they are in the English stopword list.
        Defaults to none.

    Returns
    -------
    str
        The remaining tokens joined with single spaces.
    """
    # None sentinels replace the former mutable list defaults; omitting the
    # arguments behaves exactly as passing empty lists did.
    if extra_words is None:
        extra_words = ()
    if exclude_words is None:
        exclude_words = ()

    stopword_set = set(stopwords.words('english'))
    stopword_set -= set(exclude_words)
    stopword_set |= set(extra_words)

    return ' '.join(word for word in x.split() if word not in stopword_set)

In [13]:
# Overwrite `text` with stopwords removed (default arguments: plain English
# stopword list) and display the result.
text = remove_stopwords(text)
text

"paul erdő georg pólya influenti hungarian mathematician contribut lot field. erdő ' name contain hungarian letter ' ő ' ( ' ' doubl acut accent ) , often incorrectli written erdo erdö either mistak typograph necess"

# 6. Acquire news articles

In [14]:
# `a` is the `acquire` module imported at the top of the notebook.
# NOTE(review): despite the `_df` name, the value displayed below is a list
# of dicts; it is turned into a DataFrame in the next cell.
news_df = a.get_news_articles()
news_df

[{'category': 'business',
  'title': 'Bill Gates meets Ratan Tata, N Chandrasekaran; pics surface',
  'content': 'Microsoft Co-founder Bill Gates met with Tata Sons Chairman Emeritus Ratan Tata and Tata Sons Chairman Natarajan Chandrasekaran. "Bill had an enriching discussion with Ratan Tata and N Chandrasekaran about their philanthropic initiatives," Gates Foundation India said in a tweet. "We look forward to strengthening our work together & partnering for health, diagnostics, and nutrition," it added.'},
 {'category': 'business',
  'title': 'SoftBank sells shares worth ₹954 crore in logistics firm Delhivery',
  'content': "SoftBank sold shares worth ₹954 crore in logistics company Delhivery in a bulk deal at ₹340/piece, BSE data showed. With Wednesday's sale of 2.8 crore shares, SoftBank has pared down its stake in Delhivery to around 14%. SoftBank had invested around ₹3,100 crore in Delhivery, offloaded shares worth ₹618 crore in its IPO and held over 18% stake in December-end."},


In [15]:
# Rebind `news_df` from the list of article records to a DataFrame.
news_df = pd.DataFrame(news_df)
news_df

Unnamed: 0,category,title,content
0,business,"Bill Gates meets Ratan Tata, N Chandrasekaran;...",Microsoft Co-founder Bill Gates met with Tata ...
1,business,SoftBank sells shares worth ₹954 crore in logi...,SoftBank sold shares worth ₹954 crore in logis...
2,business,Smriti Irani's 2011 tweet on LPG price hike re...,Hours after the central government raised the ...
3,business,"Indian-Americans Renjen, Subramaniam to be mem...",Indian-Americans Punit Renjen and Rajesh Subra...
4,business,Adani denies report of securing $3 bn from sov...,Adani Group has denied a report which claimed ...
...,...,...,...
95,entertainment,Sometimes truth does more harm than a lie in r...,"Actor Ranbir Kapoor, speaking about relationsh..."
96,entertainment,Actors today focus on gym and social media mor...,Actor Govind Namdev has said that the current ...
97,entertainment,It takes guts: Rajatava on Akshay accepting fa...,Bengali actor Rajatava Dutta praised Akshay Ku...
98,entertainment,I want my character in 'Zwigato' to be appreci...,"Actor-comedian Kapil Sharma, speaking about hi..."


# 7. Acquire Codeup blog posts

In [18]:
# `a` is the `acquire` module imported at the top of the notebook.
# NOTE(review): despite the `_df` name, the value displayed below is a list
# of dicts; it is turned into a DataFrame in the next cell.
codeup_df = a.get_blogs()
codeup_df


[{'title': 'Black Excellence in Tech: Panelist Spotlight – Wilmarie De La Cruz Mejia',
  'content': 'Black excellence in tech: Panelist Spotlight – Wilmarie De La Cruz Mejia\n\nCodeup is hosting a Black Excellence in Tech Panel in honor of Black History Month on February 22, 2023! To further celebrate, we’d like to spotlight each of our panelists leading up to the discussion to learn a bit about their respective experiences as black leaders in the tech industry!\xa0\xa0\nMeet Wilmarie!\nWilmarie De\xa0La Cruz Mejia is a current Codeup student on the path to becoming a Full-Stack Web Developer at our Dallas, TX campus.\xa0\nWilmarie is a veteran expanding her knowledge of programming languages and technologies on her journey with Codeup.\xa0\nWe asked Wilmarie to share more about her experience at Codeup. She shares, “I was able to meet other people who were passionate about coding and be in a positive learning environment.”\nWe hope you can join us on February 22nd to sit in on an insi

In [19]:
# Rebind `codeup_df` from the list of blog records to a DataFrame.
codeup_df = pd.DataFrame(codeup_df)
codeup_df

Unnamed: 0,title,content
0,Black Excellence in Tech: Panelist Spotlight –...,Black excellence in tech: Panelist Spotlight –...
1,Black excellence in tech: Panelist Spotlight –...,Black excellence in tech: Panelist Spotlight –...
2,Black excellence in tech: Panelist Spotlight –...,Black excellence in tech: Panelist Spotlight –...
3,Black excellence in tech: Panelist Spotlight –...,Black excellence in tech: Panelist Spotlight –...
4,Coding Bootcamp or Self-Learning? Which is Bes...,If you’re interested in embarking on a career ...
5,Codeup Among Top 58 Best Coding Bootcamps of 2023,Codeup is pleased to announce we have been ran...


In [20]:
# Rename the raw text column to 'original' in both frames. The result of
# `rename` is assigned back instead of using `inplace=True`, so the cell
# reads as an explicit rebinding and stays chainable.
news_df = news_df.rename(columns={'content': 'original'})
codeup_df = codeup_df.rename(columns={'content': 'original'})

In [21]:
# Display the news frame to confirm the 'content' -> 'original' rename.
news_df

Unnamed: 0,category,title,original
0,business,"Bill Gates meets Ratan Tata, N Chandrasekaran;...",Microsoft Co-founder Bill Gates met with Tata ...
1,business,SoftBank sells shares worth ₹954 crore in logi...,SoftBank sold shares worth ₹954 crore in logis...
2,business,Smriti Irani's 2011 tweet on LPG price hike re...,Hours after the central government raised the ...
3,business,"Indian-Americans Renjen, Subramaniam to be mem...",Indian-Americans Punit Renjen and Rajesh Subra...
4,business,Adani denies report of securing $3 bn from sov...,Adani Group has denied a report which claimed ...
...,...,...,...
95,entertainment,Sometimes truth does more harm than a lie in r...,"Actor Ranbir Kapoor, speaking about relationsh..."
96,entertainment,Actors today focus on gym and social media mor...,Actor Govind Namdev has said that the current ...
97,entertainment,It takes guts: Rajatava on Akshay accepting fa...,Bengali actor Rajatava Dutta praised Akshay Ku...
98,entertainment,I want my character in 'Zwigato' to be appreci...,"Actor-comedian Kapil Sharma, speaking about hi..."


In [22]:
# Display the blog frame to confirm the 'content' -> 'original' rename.
codeup_df

Unnamed: 0,title,original
0,Black Excellence in Tech: Panelist Spotlight –...,Black excellence in tech: Panelist Spotlight –...
1,Black excellence in tech: Panelist Spotlight –...,Black excellence in tech: Panelist Spotlight –...
2,Black excellence in tech: Panelist Spotlight –...,Black excellence in tech: Panelist Spotlight –...
3,Black excellence in tech: Panelist Spotlight –...,Black excellence in tech: Panelist Spotlight –...
4,Coding Bootcamp or Self-Learning? Which is Bes...,If you’re interested in embarking on a career ...
5,Codeup Among Top 58 Best Coding Bootcamps of 2023,Codeup is pleased to announce we have been ran...


In [23]:
def prep_article_data(df, column, extra_words=None, exclude_words=None):
    '''
    Add cleaned, stemmed and lemmatized versions of a text column to a df.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the articles. It must contain a 'title' column in
        addition to ``column``. The frame is modified in place: the columns
        'clean', 'stemmed' and 'lemmatized' are added to (or overwritten on) it.
    column : str
        Name of the column holding the raw text.
    extra_words : list of str, optional
        Additional words to treat as stopwords (forwarded to remove_stopwords).
    exclude_words : list of str, optional
        Stopwords to keep in the text (forwarded to remove_stopwords).

    Returns
    -------
    pandas.DataFrame
        The columns 'title', ``column``, 'clean', 'stemmed' and 'lemmatized',
        where
        * 'clean' is the text lowercased, tokenized and with stopwords removed,
        * 'stemmed' is 'clean' passed through stem,
        * 'lemmatized' is 'clean' passed through lemmatize.
    '''
    # None sentinels replace the former mutable list defaults; empty lists
    # are what gets forwarded, exactly as before.
    if extra_words is None:
        extra_words = []
    if exclude_words is None:
        exclude_words = []

    df['clean'] = (
        df[column]
        .apply(basic_clean)
        .apply(tokenize)
        .apply(remove_stopwords,
               extra_words=extra_words,
               exclude_words=exclude_words)
    )

    # Both derived columns start from 'clean', so stopwords are already gone
    # before stemming / lemmatizing.
    df['stemmed'] = df['clean'].apply(stem)

    df['lemmatized'] = df['clean'].apply(lemmatize)

    return df[['title', column, 'clean', 'stemmed', 'lemmatized']]

In [24]:
# Treat 'ha' as an extra stopword and keep 'no' in the text.
# NOTE(review): the returned frame is only displayed, not assigned; the call
# also adds the 'clean', 'stemmed' and 'lemmatized' columns to news_df itself.
prep_article_data(news_df, 'original', extra_words= ['ha'], exclude_words= ['no'])

Unnamed: 0,title,original,clean,stemmed,lemmatized
0,"Bill Gates meets Ratan Tata, N Chandrasekaran;...",Microsoft Co-founder Bill Gates met with Tata ...,microsoft co-founder bill gates met tata sons ...,microsoft co-found bill gate met tata son chai...,microsoft co-founder bill gate met tata son ch...
1,SoftBank sells shares worth ₹954 crore in logi...,SoftBank sold shares worth ₹954 crore in logis...,softbank sold shares worth ₹ 954 crore logisti...,softbank sold share worth ₹ 954 crore logist c...,softbank sold share worth ₹ 954 crore logistic...
2,Smriti Irani's 2011 tweet on LPG price hike re...,Hours after the central government raised the ...,hours central government raised price commerci...,hour central govern rais price commerci lpg cy...,hour central government raised price commercia...
3,"Indian-Americans Renjen, Subramaniam to be mem...",Indian-Americans Punit Renjen and Rajesh Subra...,indian-americans punit renjen rajesh subramani...,indian-american punit renjen rajesh subramania...,indian-americans punit renjen rajesh subramani...
4,Adani denies report of securing $3 bn from sov...,Adani Group has denied a report which claimed ...,adani group denied report claimed secured $ 3 ...,adani group deni report claim secur $ 3 billio...,adani group denied report claimed secured $ 3 ...
...,...,...,...,...,...
95,Sometimes truth does more harm than a lie in r...,"Actor Ranbir Kapoor, speaking about relationsh...","actor ranbir kapoor , speaking relationships ,...","actor ranbir kapoor , speak relationship , sai...","actor ranbir kapoor , speaking relationship , ..."
96,Actors today focus on gym and social media mor...,Actor Govind Namdev has said that the current ...,actor govind namdev said current actors focuse...,actor govind namdev said current actor focus g...,actor govind namdev said current actor focused...
97,It takes guts: Rajatava on Akshay accepting fa...,Bengali actor Rajatava Dutta praised Akshay Ku...,bengali actor rajatava dutta praised akshay ku...,bengali actor rajatava dutta prais akshay kuma...,bengali actor rajatava dutta praised akshay ku...
98,I want my character in 'Zwigato' to be appreci...,"Actor-comedian Kapil Sharma, speaking about hi...","actor-comedian kapil sharma , speaking charact...","actor-comedian kapil sharma , speak charact ' ...","actor-comedian kapil sharma , speaking charact..."


In [25]:
# Same preparation for the blog posts; only the first rows are displayed.
# NOTE(review): the call also adds the 'clean', 'stemmed' and 'lemmatized'
# columns to codeup_df itself.
prep_article_data(codeup_df, 'original', extra_words = ['ha'], exclude_words = ['no']).head()


Unnamed: 0,title,original,clean,stemmed,lemmatized
0,Black Excellence in Tech: Panelist Spotlight –...,Black excellence in tech: Panelist Spotlight –...,black excellence tech : panelist spotlight – w...,black excel tech : panelist spotlight – wilmar...,black excellence tech : panelist spotlight – w...
1,Black excellence in tech: Panelist Spotlight –...,Black excellence in tech: Panelist Spotlight –...,black excellence tech : panelist spotlight – s...,black excel tech : panelist spotlight – stepha...,black excellence tech : panelist spotlight – s...
2,Black excellence in tech: Panelist Spotlight –...,Black excellence in tech: Panelist Spotlight –...,black excellence tech : panelist spotlight – j...,black excel tech : panelist spotlight – jame c...,black excellence tech : panelist spotlight – j...
3,Black excellence in tech: Panelist Spotlight –...,Black excellence in tech: Panelist Spotlight –...,black excellence tech : panelist spotlight – j...,black excel tech : panelist spotlight – jeanic...,black excellence tech : panelist spotlight – j...
4,Coding Bootcamp or Self-Learning? Which is Bes...,If you’re interested in embarking on a career ...,"’ interested embarking career tech , likely ta...","’ interest embark career tech , like taken loo...","’ interested embarking career tech , likely ta..."


# 9. Stemmed or lemmatized text?

* If your corpus is 493KB, would you prefer to use stemmed or lemmatized text?
    * I would use lemmatized text: lemmatizing can take longer, but the result is more refined.


* If your corpus is 25MB, would you prefer to use stemmed or lemmatized text?
    * I would use stemmed text because of the size: I would want to reduce a lot of text in a small amount of time. If lemmatizing did not take very long, I would use lemmatized text instead.


* If your corpus is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text?
    * Stemmed, all day: at that scale, and when paying for compute, the cheaper method wins.