In [141]:
#standard imports
import pandas as pd
import numpy as np
import unicodedata
import re
import nltk
from bs4 import BeautifulSoup
import requests

In [2]:
original = "Paul Erdős and George Pólya were influential Hungarian mathematicians who contributed \
a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), \
but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"

In [3]:
original

"Paul Erdős and George Pólya were influential Hungarian mathematicians who contributed a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"

## 1 Function: basic_clean

### lowercase

In [4]:
lowered = original.lower()

In [5]:
lowered

"paul erdős and george pólya were influential hungarian mathematicians who contributed a lot to the field. erdős's name contains the hungarian letter 'ő' ('o' with double acute accent), but is often incorrectly written as erdos or erdös either by mistake or out of typographical necessity"

### normalize to ASCII (NFKD-decompose, then drop any remaining non-ASCII characters)

In [6]:
normalized = unicodedata.normalize('NFKD', lowered).encode('ascii','ignore').decode('utf-8')

In [7]:
normalized

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field. erdos's name contains the hungarian letter 'o' ('o' with double acute accent), but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

### remove every character that is not a letter, digit, whitespace or single quote

In [8]:
re.sub(r'[^a-zA-Z0-9\'\s]', '', normalized)

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field erdos's name contains the hungarian letter 'o' 'o' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

In [23]:
basicly_clean = re.sub(r'[^a-zA-Z0-9\'\s]', '', normalized)

In [24]:
basicly_clean

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field erdos's name contains the hungarian letter 'o' 'o' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

### now functionalize this...

In [11]:
def basic_clean(string):
    '''
    takes in a string and outputs a basic-cleaned version:
                    -NFKD-normalized and folded to ASCII (accents stripped,
                     e.g. 'ő' -> 'o'; characters with no ASCII form, such as
                     emoji, are dropped)
                    -lowercase
                    -every char that is not a letter, digit, whitespace or
                     single quote removed

    Lowercasing is done *after* normalization: some symbols only turn into
    uppercase ASCII letters once they are decomposed (the trademark sign
    becomes 'TM', the degree-Celsius sign becomes 'C'), so lowering first
    would let capitals leak into the result.

    string: str to clean
    returns: the cleaned str
    '''
    # Decompose, then discard whatever cannot be represented in ASCII.
    ascii_only = (unicodedata.normalize('NFKD', string)
                  .encode('ascii', 'ignore')
                  .decode('ascii'))
    lowered = ascii_only.lower()
    # Text is already lowercase here, so a-z is enough for the letter class.
    return re.sub(r"[^a-z0-9'\s]", '', lowered)

In [17]:
test = 'here sis some wsrttroing of sa./23;l3r2892390*(*()*\U0001F601())'

In [18]:
test

'here sis some wsrttroing of sa./23;l3r2892390*(*()*😁())'

In [28]:
test_clean = basic_clean(test)

In [29]:
test_clean

'here sis some wsrttroing of sa23l3r2892390'

## 2 Function: tokenize

In [20]:
tokenizer = nltk.tokenize.ToktokTokenizer()
tokenizer

<nltk.tokenize.toktok.ToktokTokenizer at 0x14c248940>

In [25]:
tokenized = tokenizer.tokenize(basicly_clean, return_str=True)

In [26]:
tokenized

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field erdos ' s name contains the hungarian letter ' o ' ' o ' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

### now functionalize this...

In [31]:
def tokenize(string):
    '''
    Tokenize `string` with NLTK's ToktokTokenizer.

    string: str to tokenize
    returns: a single str (not a list), because the tokenizer is called
             with return_str=True
    '''
    return nltk.tokenize.ToktokTokenizer().tokenize(string, return_str=True)

In [39]:
test_token = tokenize(test_clean)

In [40]:
test_token

'here sis some wsrttroing of sa23l3r2892390'

## 3 Function: stem

In [33]:
stemmer = nltk.porter.PorterStemmer()

In [34]:
stemmer

<PorterStemmer>

In [35]:
# NOTE: the whole sentence is passed as a single "word" here, so only its tail
# gets stemmed (see the output below); the next cell stems token by token.
stemmer.stem(tokenized)

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field erdos ' s name contains the hungarian letter ' o ' ' o ' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necess"

In [36]:
' '.join([stemmer.stem(word) for word in tokenized.split()])

"paul erdo and georg polya were influenti hungarian mathematician who contribut a lot to the field erdo ' s name contain the hungarian letter ' o ' ' o ' with doubl acut accent but is often incorrectli written as erdo or erdo either by mistak or out of typograph necess"

### now functionalize this...

In [37]:
def stem(string):
    '''
    Apply the Porter stemmer to every whitespace-separated token of `string`.

    string: str of tokens separated by whitespace
    returns: str with each token replaced by its stem, joined by single spaces
    '''
    porter = nltk.porter.PorterStemmer()
    stems = []
    for token in string.split():
        stems.append(porter.stem(token))
    return ' '.join(stems)

In [41]:
stem(test_token)

'here si some wsrttro of sa23l3r2892390'

## 4 Function: lemmatize

In [42]:
lemmatizer = nltk.stem.WordNetLemmatizer()

In [43]:
lemmatized = [lemmatizer.lemmatize(word) for word in tokenized.split()]

In [44]:
lemmatized

['paul',
 'erdos',
 'and',
 'george',
 'polya',
 'were',
 'influential',
 'hungarian',
 'mathematician',
 'who',
 'contributed',
 'a',
 'lot',
 'to',
 'the',
 'field',
 'erdos',
 "'",
 's',
 'name',
 'contains',
 'the',
 'hungarian',
 'letter',
 "'",
 'o',
 "'",
 "'",
 'o',
 "'",
 'with',
 'double',
 'acute',
 'accent',
 'but',
 'is',
 'often',
 'incorrectly',
 'written',
 'a',
 'erdos',
 'or',
 'erdos',
 'either',
 'by',
 'mistake',
 'or',
 'out',
 'of',
 'typographical',
 'necessity']

### now functionalize this...

In [45]:
def lemmatize(string):
    '''
    Lemmatize every whitespace-separated token of `string` with NLTK's
    WordNetLemmatizer.

    string: str of tokens separated by whitespace
    returns: list of str, one lemma per token (a list, not a joined string)
    '''
    wordnet = nltk.stem.WordNetLemmatizer()
    lemmas = []
    for token in string.split():
        lemmas.append(wordnet.lemmatize(token))
    return lemmas

In [54]:
test_lemma = lemmatize(test_token)

In [55]:
test_lemma

['here', 'si', 'some', 'wsrttroing', 'of', 'sa23l3r2892390']

## 5 Function: remove_stopwords

In [47]:
stopwords = nltk.corpus.stopwords

In [48]:
stopwords

<WordListCorpusReader in '/Users/richardalcabes/nltk_data/corpora/stopwords'>

In [49]:
stopwords.words('english')[:20]

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his']

In [93]:
garbage = ['words','word','wording']

In [108]:
stopper = stopwords.words('english')

In [109]:
type(stopper)

list

In [113]:
# list.append mutates `stopper` in place and returns None, hence NoneType below.
# Re-running this cell appends 'ws' again; use `stopper + ['ws']` for a new list.
type(stopper.append('ws'))

NoneType

In [111]:
# NOTE(review): `stopper2` is not assigned in any cell shown in this notebook, so
# this cell depends on leftover kernel state — TODO define it or delete this cell.
type(stopper2)

NoneType

In [50]:
# Load the stopword list once; calling stopwords.words('english') inside the
# comprehension repeats that call for every token. A set gives fast lookups.
english_stopwords = set(stopwords.words('english'))
dunzo_text = ' '.join([word for word in lemmatized
                       if word not in english_stopwords])

In [51]:
dunzo_text

"paul erdos george polya influential hungarian mathematician contributed lot field erdos ' name contains hungarian letter ' ' ' ' double acute accent often incorrectly written erdos erdos either mistake typographical necessity"

### now functionalize this...

In [137]:
def remove_stopwords(token_list, extra=None, exclude=None):
    '''
    takes in a list of tokens and outputs a string with standard english
    stopwords removed

    token_list: iterable of str tokens (e.g. the list returned by lemmatize)
    extra:      optional iterable of str to treat as additional stopwords
    exclude:    optional iterable of str to take out of the stopword set,
                so those words are kept; applied after `extra`, so a word
                listed in both is kept

    returns: str of the surviving tokens joined by single spaces
    '''
    # A set makes each membership test constant-time instead of scanning a list.
    stopw = set(nltk.corpus.stopwords.words('english'))
    stopw.update(extra or [])
    stopw.difference_update(exclude or [])
    return ' '.join(word for word in token_list if word not in stopw)

In [140]:
remove_stopwords(test_lemma,extra=['si','wsrttroing'])

'sa23l3r2892390'

## 6 acquire news_df

In [146]:
url = 'https://inshorts.com/en/read/'
# Without a timeout requests.get can block indefinitely on a stalled server.
response = requests.get(url, timeout=10)
# Fail loudly on an HTTP error instead of silently parsing an error page
# (which would just yield an empty news_div).
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
news_div = soup.find_all('div', class_='news-card-content news-right-box')

In [149]:
# Reuse the cards already parsed into `news_div` rather than downloading and
# re-parsing the same page once per card; re-fetching inside the loop also
# risked an IndexError if the page changed between requests.
content = [card.text for card in news_div]

In [151]:
# Pull the headlines from the page already held in `soup` instead of
# requesting it again for every card.
headline = soup.find_all('span', itemprop='headline')
# Cap at the number of cards so `titles` stays index-aligned with `content`.
titles = [tag.text for tag in headline[:len(news_div)]]

In [152]:
# Map each headline to its article text, pairing entries by card position.
# Articles that share a headline collapse to one key (the later one wins).
dictionary = dict((titles[idx], content[idx]) for idx in range(len(news_div)))