# Prepare

In [1]:
import pandas as pd

import unicodedata
import re

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from nltk import PorterStemmer, word_tokenize, WordNetLemmatizer
from acquire import get_new_links, get_news_article, get_article_data

### 1) 
Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:  

Lowercase everything  
Normalize unicode characters  
Replace anything that is not a letter, number, whitespace or a single quote.

In [49]:
string = "Dark energy is a mysterious and hypothetical form of energy that is believed to make up a large portion of the total mass-energy content of the universe. It is one of the most significant and perplexing discoveries in modern cosmology and was first proposed to explain the observed accelerated expansion of the universe. cceleration of the Universe: In the late 1990s, astronomers observed that the expansion of the universe is not slowing down due to gravity, as previously thought, but instead accelerating. "

In [3]:
def basic_clean(string):
    string = string.lower()
    string = unicodedata.normalize('NFKD', string)\
            .encode('ascii', 'ignore')\
            .decode('utf-8', 'ignore')
    string = re.sub(r"[^a-z0-9'\s]", "", string)
    return string

In [4]:
basic_clean(string)

'dark energy is a mysterious and hypothetical form of energy that is believed to make up a large portion of the total massenergy content of the universe it is one of the most significant and perplexing discoveries in modern cosmology and was first proposed to explain the observed accelerated expansion of the universe cceleration of the universe in the late 1990s astronomers observed that the expansion of the universe is not slowing down due to gravity as previously thought but instead accelerating '

### 2) 
Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [5]:
def tokenize(string):
    tokenizer = ToktokTokenizer()
    string = tokenizer.tokenize(string, return_str = True)
    return string

In [6]:
tokenize(string)

'Dark energy is a mysterious and hypothetical form of energy that is believed to make up a large portion of the total mass-energy content of the universe. It is one of the most significant and perplexing discoveries in modern cosmology and was first proposed to explain the observed accelerated expansion of the universe. cceleration of the Universe : In the late 1990s , astronomers observed that the expansion of the universe is not slowing down due to gravity , as previously thought , but instead accelerating.'

### 3) 
Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [7]:
def stem(string):
    stopwords_list = stopwords.words('english')
    words = [word for word in string.split() if word not in stopwords_list]
    new_data = ' '.join(words)
    ps = nltk.porter.PorterStemmer()
    stems = [ps.stem(word) for word in words]
    stemmed_data = ' '.join(stems)
    return stemmed_data

In [8]:
stem(string)

'dark energi mysteri hypothet form energi believ make larg portion total mass-energi content universe. it one signific perplex discoveri modern cosmolog first propos explain observ acceler expans universe. cceler universe: in late 1990s, astronom observ expans univers slow due gravity, previous thought, instead accelerating.'

### 4) 
Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [9]:
def lemmatize(string):
    stopwords_list = stopwords.words('english')
    words = [word for word in string.split() if word not in stopwords_list]
    new_data = ' '.join(words)
    wnl = nltk.stem.WordNetLemmatizer()
    lemmatize = [wnl.lemmatize(word) for word in words]
    lemmatized_data = ' '.join(lemmatize)
    return lemmatized_data

In [10]:
lemmatize(string)

'Dark energy mysterious hypothetical form energy believed make large portion total mass-energy content universe. It one significant perplexing discovery modern cosmology first proposed explain observed accelerated expansion universe. cceleration Universe: In late 1990s, astronomer observed expansion universe slowing due gravity, previously thought, instead accelerating.'

### 5)
Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.  

This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [11]:
def remove_stopwords(string):
    stopwords_list = stopwords.words('english')
    words = [word for word in string.split() if word not in stopwords_list]
    new_data = ' '.join(words)
    return new_data

In [12]:
remove_stopwords(string)

'Dark energy mysterious hypothetical form energy believed make large portion total mass-energy content universe. It one significant perplexing discoveries modern cosmology first proposed explain observed accelerated expansion universe. cceleration Universe: In late 1990s, astronomers observed expansion universe slowing due gravity, previously thought, instead accelerating.'

### 6) 
Use your data from the acquire to produce a dataframe of the news articles. Name the dataframe news_df.

In [13]:
# Define the URL to the webpage you want to scrape
url = 'https://inshorts.com/'
# Define the User-Agent header to be used in the HTTP request
headers = {'User-Agent': 'Codeup Data Science'}

In [14]:
links = get_new_links(url, headers, 'div', 'a', 'href')

https://blog.inshorts.com/
https://blog.inshorts.com/
https://blog.inshorts.com/
https://blog.inshorts.com/
https://blog.inshorts.com/
/en/read
https://itunes.apple.com/us/app/news-in-shorts/id892146527
https://itunes.apple.com/us/app/news-in-shorts/id892146527
https://itunes.apple.com/us/app/news-in-shorts/id892146527
https://itunes.apple.com/us/app/news-in-shorts/id892146527
https://itunes.apple.com/us/app/news-in-shorts/id892146527
https://itunes.apple.com/us/app/news-in-shorts/id892146527
/tnc
/tnc
https://facebook.com/inshortsapp


In [15]:
news_df = get_news_article(links, headers)

## 7) 
Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.

In [16]:
# Define the URL to the webpage you want to scrape
url = 'https://codeup.edu/blog'
# Define the User-Agent header to be used in the HTTP request
headers = {'User-Agent': 'Codeup Data Science'}

In [17]:
links = get_new_links(url, headers, 'h2', 'a', 'href')

https://codeup.edu/featured/apida-heritage-month/
https://codeup.edu/featured/women-in-tech-panelist-spotlight/
https://codeup.edu/featured/women-in-tech-rachel-robbins-mayhill/
https://codeup.edu/codeup-news/women-in-tech-panelist-spotlight-sarah-mellor/
https://codeup.edu/events/women-in-tech-madeleine/
https://codeup.edu/codeup-news/panelist-spotlight-4/


In [18]:
codeup_df = get_article_data(links, headers)

### 8)
For each dataframe, produce the following columns:

title to hold the title  
original to hold the original article/post content  
clean to hold the normalized and tokenized original with the stopwords removed.  
stemmed to hold the stemmed version of the cleaned data.  
lemmatized to hold the lemmatized version of the cleaned data.  

### news_df

In [47]:
#title to hold the title 
title = news_df['title']

In [48]:
#original to hold the original article/post content 
original = news_df['content']

In [53]:
#clean to hold the normalized and tokenized original with the stopwords removed.  
clean = basic_clean(original)
clean = tokenize(clean)
clean = remove_stopwords(clean)

In [55]:
#stemmed to hold the stemmed version of the cleaned data. 
stemmed = stem(clean)

In [57]:
#lemmatized to hold the lemmatized version of the cleaned data.  
lemmatized = lemmatize(clean)

### codeup_df

In [70]:
df_titles = [item['title'] for item in codeup_df]
for title in df_titles:
    return title

SyntaxError: 'return' outside function (3921353425.py, line 3)