## Data Acquisition Exercises

In [1]:
import requests
import bs4
import os
import pandas as pd

### 1. Scrape the article text from the following pages:
- https://codeup.com/codeups-data-science-career-accelerator-is-here/
- https://codeup.com/data-science-myths/
- https://codeup.com/data-science-vs-data-analytics-whats-the-difference/
- https://codeup.com/10-tips-to-crush-it-at-the-sa-tech-job-fair/
- https://codeup.com/competitor-bootcamps-are-closing-is-the-model-in-danger/

In [2]:
# Make the HTTP request and turn the response into a BeautifulSoup object.

url = 'https://codeup.com/codeups-data-science-career-accelerator-is-here/'
# Send a custom User-Agent header with the request.
headers = {'User-Agent': 'Codeup Data Science'}
response = requests.get(url, headers=headers)
html = response.text
# Name the parser explicitly so the parse tree does not depend on which
# optional parsers happen to be installed in the environment.
soup = bs4.BeautifulSoup(html, 'html.parser')

In [3]:
# Keep the first element that matches the '.jupiterx-content' CSS class selector.
accelerator_div = soup.select('.jupiterx-content')[0]

In [4]:
# title = soup.find('title').text
# title

In [5]:
# Text of the first <h1> found inside the selected container.
accelerator_div.find('h1').text

'Codeup’s Data Science Career Accelerator is Here!'

In [6]:
def get_blog_articles(urls, cached=False):
    '''
    Scrape the title and article text of each page in `urls` and
    return them as a DataFrame with one row per url.

    Parameters
    ----------
    urls : iterable of str
        Page urls to request, in the order the rows should appear.
    cached : bool, default False
        Accepted for interface compatibility only; it is not used,
        so every call performs a fresh scrape.

    Returns
    -------
    pandas.DataFrame
        Columns 'title' (string of the page's <title> tag) and 'content'
        (text of the div with class 'jupiterx-post-content'). The two
        columns are present even when `urls` is empty.

    Raises
    ------
    requests.HTTPError
        If a page answers with an error status code.
    ValueError
        If a page has no div with class 'jupiterx-post-content'.
    '''
    # The same header is sent with every request, so build it once.
    headers = {'User-Agent': 'Codeup Data Science'}

    # One {'title': ..., 'content': ...} record per url.
    articles = []

    for url in urls:
        response = requests.get(url, headers=headers)
        # Fail loudly on 4xx/5xx instead of parsing an error page.
        response.raise_for_status()

        # Name the parser explicitly so results do not depend on which
        # optional parsers are installed.
        soup = bs4.BeautifulSoup(response.text, 'html.parser')

        post = soup.find('div', class_='jupiterx-post-content')
        if post is None:
            raise ValueError(f"no 'jupiterx-post-content' div found at {url}")

        title_tag = soup.title
        articles.append({
            'title': title_tag.string if title_tag is not None else None,
            'content': post.text,
        })

    # Fix the column order so an empty result still has the expected schema.
    return pd.DataFrame(articles, columns=['title', 'content'])

In [10]:
# The five Codeup blog posts listed in the exercise prompt above.
urls = ['https://codeup.com/codeups-data-science-career-accelerator-is-here/',
        'https://codeup.com/data-science-myths/', 
        'https://codeup.com/data-science-vs-data-analytics-whats-the-difference/', 
        'https://codeup.com/10-tips-to-crush-it-at-the-sa-tech-job-fair/',
        'https://codeup.com/competitor-bootcamps-are-closing-is-the-model-in-danger/']

# cached=False is spelled out here; it is also the parameter's default.
codeup_df = get_blog_articles(urls, cached=False)
codeup_df

Unnamed: 0,title,content
0,Codeup’s Data Science Career Accelerator is He...,The rumors are true! The time has arrived. Cod...
1,Data Science Myths - Codeup,By Dimitri Antoniou and Maggie Giust\nData Sci...
2,Data Science VS Data Analytics: What’s The Dif...,"By Dimitri Antoniou\nA week ago, Codeup launch..."
3,10 Tips to Crush It at the SA Tech Job Fair - ...,SA Tech Job Fair\nThe third bi-annual San Anto...
4,Competitor Bootcamps Are Closing. Is the Model...,Competitor Bootcamps Are Closing. Is the Model...


In [53]:
# Spot-check one title; it comes from the page's <title> tag, so it carries the ' - Codeup' suffix.
codeup_df.title[3]

'10 Tips to Crush It at the SA Tech Job Fair - Codeup'

### 2. We will now scrape text data from inshorts, a website that provides brief overviews of many different topics.

In [8]:
# Request the inshorts entertainment page and check that the request succeeded.
url = 'https://inshorts.com/en/read/entertainment'

# NOTE: unlike the other requests in this notebook, no User-Agent header is sent here.
response = requests.get(url)
response.ok

True

In [11]:
# Parse the response body with the 'html.parser' parser.
soup = bs4.BeautifulSoup(response.text, 'html.parser')

In [12]:
# Scrape a ResultSet of all the news cards on the page and look at the first card.
# A news card is a <div> carrying the 'news-card' class.

cards = soup.find_all('div', class_='news-card')
print(type(cards))
cards[0]

<class 'bs4.element.ResultSet'>


<div class="news-card z-depth-1" itemscope="" itemtype="http://schema.org/NewsArticle">
<span content="" itemid="https://inshorts.com/en/news/2-designers-pledge-to-never-work-with-kangana-after-twitter-suspends-her-account-1620141915434" itemprop="mainEntityOfPage" itemscope="" itemtype="https://schema.org/WebPage"></span>
<span itemprop="author" itemscope="itemscope" itemtype="https://schema.org/Person">
<span content="Daisy Mowke" itemprop="name"></span>
</span>
<span content="2 designers pledge to never work with Kangana after Twitter suspends her account" itemprop="description"></span>
<span itemprop="image" itemscope="" itemtype="https://schema.org/ImageObject">
<meta content="https://static.inshorts.com/inshorts/images/v1/variants/jpg/m/2021/05_may/4_tue/img_1620140528612_710.jpg?" itemprop="url"/>
<meta content="864" itemprop="width"/>
<meta content="483" itemprop="height"/>
</span>
<span itemprop="publisher" itemscope="itemscope" itemtype="https://schema.org/Organization">
<spa

In [13]:
# Collect the headline text of every news card in the list `titles`.

titles = [card.find('span', itemprop='headline').text for card in cards]

# Preview the first five headlines.
titles[:5]

['2 designers pledge to never work with Kangana after Twitter suspends her account',
 "Pia Bajpiee's brother dies of COVID hrs after her appeal for ventilator bed",
 'YRF requests Maha CM to help them vaccinate 30,000 cine workers',
 "I'm doing well with mild symptoms: Allu Arjun after testing COVID +ve",
 "Permanent relief: Kubbra on Twitter suspending Kangana's account"]

In [14]:
# Collect the author text of every news card in the list `authors`.

authors = [card.find('span', class_='author').text for card in cards]

# Preview the first five authors.
authors[:5]

['Daisy Mowke', 'Bulbul Sharma', 'Udit Gupta', 'Udit Gupta', 'Bulbul Sharma']

In [15]:
# Collect the article body text of every news card in the list `texts`.

texts = [card.find('div', itemprop='articleBody').text for card in cards]

# Preview the first two article bodies.
texts[:2]

['After Twitter permanently suspended Kangana Ranaut\'s account, fashion designers Anand Bhushan and Rimzim Dadu issued statements saying they\'re disassociating themselves and their brands from the actress. "We as a brand don\'t support hate speech," said Anand. "We\'re removing all posts of past collaboration with Kangana...and pledge to not engage in any future association with her," wrote Rimzim.',
 'Actress Pia Bajpiee on Tuesday informed that she has lost her brother due to COVID-19. She tweeted the news of his death only hours after she appealed for a ventilator bed for him in the Farrukhabad district of Uttar Pradesh. "I need urgent help...my brother is dying...we are already in mess," Pia wrote at around 7 am. ']

In [16]:
# Build one dictionary per news card (title, author, content) and
# gather them in the list `articles`, in page order.
articles = [
    {
        'title': card.find('span', itemprop='headline').text,
        'author': card.find('span', class_='author').text,
        'content': card.find('div', itemprop='articleBody').text,
    }
    for card in cards
]

In [17]:
# Print how many article dictionaries were collected (one per news card)
# and preview the first two.

print(len(articles))
articles[:2]

25


[{'title': '2 designers pledge to never work with Kangana after Twitter suspends her account',
  'author': 'Daisy Mowke',
  'content': 'After Twitter permanently suspended Kangana Ranaut\'s account, fashion designers Anand Bhushan and Rimzim Dadu issued statements saying they\'re disassociating themselves and their brands from the actress. "We as a brand don\'t support hate speech," said Anand. "We\'re removing all posts of past collaboration with Kangana...and pledge to not engage in any future association with her," wrote Rimzim.'},
 {'title': "Pia Bajpiee's brother dies of COVID hrs after her appeal for ventilator bed",
  'author': 'Bulbul Sharma',
  'content': 'Actress Pia Bajpiee on Tuesday informed that she has lost her brother due to COVID-19. She tweeted the news of his death only hours after she appealed for a ventilator bed for him in the Farrukhabad district of Uttar Pradesh. "I need urgent help...my brother is dying...we are already in mess," Pia wrote at around 7 am. '}]

In [18]:
def get_news_articles(cache=False):
    '''
    Return a DataFrame of inshorts articles with the columns
    topic, title, author and content.

    The flag reads opposite to what its name might suggest:

    - cache == False (default): read the previously saved 'articles.csv'.
      If that file does not exist yet, fall back to a fresh scrape rather
      than raising FileNotFoundError.
    - cache == True: do a fresh scrape of the inshorts pages for the topics
      business, sports, technology and entertainment, and write the result
      to 'articles.csv' (replacing any earlier file).

    Raises requests.HTTPError if a topic page answers with an error status,
    so a failed request never overwrites the saved csv with partial data.
    '''
    csv_path = 'articles.csv'

    # Default path: reuse the saved csv when there is one.
    if not cache and os.path.isfile(csv_path):
        return pd.read_csv(csv_path, index_col=0)

    # Base url and headers used for every topic request.
    base_url = 'https://inshorts.com/en/read/'
    headers = {'User-Agent': 'Codeup Data Science'}

    # Topics to scrape, in the order their rows appear in the result.
    topics = ['business', 'sports', 'technology', 'entertainment']

    # One dictionary per news card, across all topics.
    articles = []

    for topic in topics:
        response = requests.get(base_url + topic, headers=headers)
        response.raise_for_status()

        soup = bs4.BeautifulSoup(response.text, 'html.parser')

        # Every news card on the page is a <div class="news-card ...">.
        for card in soup.find_all('div', class_='news-card'):
            articles.append({
                'topic': topic,
                'title': card.find('span', itemprop='headline').text,
                'author': card.find('span', class_='author').text,
                'content': card.find('div', itemprop='articleBody').text,
            })

    df = pd.DataFrame(articles)

    # Save for future calls; read back above with index_col=0.
    df.to_csv(csv_path)

    return df

In [19]:
# Test our function with cache == True to do a fresh scrape and write to `articles.csv`
# (cache=False would read the saved csv instead of scraping).

news_df = get_news_articles(cache=True)
news_df.head()

Unnamed: 0,topic,title,author,content
0,business,India underestimated the coronavirus: Raghuram...,Kiran Khatri,"Speaking about India's second COVID-19 wave, f..."
1,business,Air India pilots demand vaccination on priorit...,Kiran Khatri,Indian Commercial Pilots Association (ICPA) on...
2,business,World's biggest jeweller says it will no longe...,Kiran Khatri,"Pandora, the world's biggest jeweller, has sai..."
3,business,South Korea's richest woman gets fortune worth...,Anmol Sharma,South Korea’s richest woman Hong Ra-hee added ...
4,business,M&M advances annual maintenance plant shutdown...,Krishna Raj,Mahindra & Mahindra (M&M) said that it has adv...


In [21]:
# Count the scraped articles per topic.
news_df.topic.value_counts()

entertainment    25
business         25
sports           25
technology       24
Name: topic, dtype: int64

In [49]:
# Spot-check a single title by its index label.
news_df.title[96]

"Mohanlal's 'Drishyam 2' to get a Hindi remake"

## Data Prep Exercises

In [24]:
import numpy as np

import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import warnings
warnings.filterwarnings("ignore")

In [59]:
# Sample sentence (mixed case, with punctuation) reused as input for the prep functions below.

string_thing = "A non-capturing version of regular parentheses. Matches whatever regular expression is inside the parentheses, but the substring matched by the group cannot be retrieved after performing a match or referenced later in the pattern."

### 1. Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it

In [60]:
def basic_clean(string):
    '''
    Normalize a string for text processing and return the result.

    Steps, in order:
    1. Unicode-normalize with NFKD and drop every non-ASCII character.
    2. Remove every character that is not a word character or whitespace.
    3. Lowercase the text.
    '''
    # NFKD splits an accented letter into its base letter plus a combining
    # mark, so the ASCII step below keeps the base letter ('é' -> 'e').
    # With a composed form such as NFKC the whole letter would be dropped.
    ascii_only = (unicodedata.normalize('NFKD', string)
                  .encode('ascii', 'ignore')
                  .decode('utf-8', 'ignore'))

    # \w also matches digits and the underscore, so those are kept.
    return re.sub(r'[^\w\s]', '', ascii_only).lower()

In [61]:
# The result is lowercased and has punctuation removed (so 'non-capturing' becomes 'noncapturing').
basic_clean(string_thing)

'a noncapturing version of regular parentheses matches whatever regular expression is inside the parentheses but the substring matched by the group cannot be retrieved after performing a match or referenced later in the pattern'

In [27]:
def tokenize(string):
    '''
    Run NLTK's ToktokTokenizer over `string` and return the tokenized
    text as a single string (return_str=True) rather than a list.
    '''
    toktok = nltk.tokenize.ToktokTokenizer()
    return toktok.tokenize(string, return_str=True)

In [62]:
# In the output, the comma and the final period are split off as separate tokens.
tokenize(string_thing)

'A non-capturing version of regular parentheses. Matches whatever regular expression is inside the parentheses , but the substring matched by the group cannot be retrieved after performing a match or referenced later in the pattern .'

In [29]:
def stem(string):
    '''
    Stem every whitespace-separated word of `string` with NLTK's
    PorterStemmer and return the stems joined by single spaces.
    '''
    porter = nltk.porter.PorterStemmer()
    return ' '.join(porter.stem(token) for token in string.split())

In [63]:
# Stemming is applied to the raw string here, so punctuation stays attached to the words.
stem(string_thing)

'A non-captur version of regular parentheses. match whatev regular express is insid the parentheses, but the substr match by the group cannot be retriev after perform a match or referenc later in the pattern.'

In [31]:
# Download the WordNet corpus the first time this notebook runs;
# presumably required by the WordNetLemmatizer used in lemmatize() below.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/stevekane/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [32]:
def lemmatize(string):
    '''
    Lemmatize every whitespace-separated word of `string` with NLTK's
    WordNetLemmatizer and return the lemmas joined by single spaces.
    '''
    wordnet = nltk.stem.WordNetLemmatizer()
    return ' '.join(wordnet.lemmatize(token) for token in string.split())

In [64]:
# On this raw, uncleaned string the output below is identical to the input.
lemmatize(string_thing)

'A non-capturing version of regular parentheses. Matches whatever regular expression is inside the parentheses, but the substring matched by the group cannot be retrieved after performing a match or referenced later in the pattern.'

In [34]:
# Download the NLTK stopwords corpus read by remove_stopwords() below.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/stevekane/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [35]:
def remove_stopwords(string, extra_words=[], exclude_words=[]):
    '''
    Return `string` with stopwords removed.

    The removal set is NLTK's English stopword list, minus `exclude_words`
    (words to keep in the text), plus `extra_words` (additional words to
    remove). Words are split on whitespace, compared exactly as written
    (case-sensitive), and joined back with single spaces.
    '''
    # Set algebra: (english stopwords - words to keep) | extra words to drop.
    to_remove = (set(stopwords.words('english')) - set(exclude_words)) | set(extra_words)

    kept = (token for token in string.split() if token not in to_remove)
    return ' '.join(kept)

In [44]:
# 'cannot' is removed as an extra word and 'or' is kept as an excluded word.
# The capitalized 'A' survives, presumably because the comparison is case-sensitive.
remove_stopwords('A non-capturing version of regular parentheses. Matches whatever regular expression is inside the parentheses, but the substring matched by the group cannot be retrieved after performing a match or referenced later in the pattern.', extra_words=['cannot'], exclude_words=['or'])

'A non-capturing version regular parentheses. Matches whatever regular expression inside parentheses, substring matched group retrieved performing match or referenced later pattern.'

In [42]:
def prep_article_data(df, column, extra_words=[], exclude_words=[]):
    '''
    Build prepared text columns from df[column] and return a new DataFrame
    with the columns 'title', `column`, 'clean', 'stemmed' and 'lemmatized'.
    The DataFrame passed in is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'title' column and the text column named by `column`.
    column : str
        Name of the text column to prepare.
    extra_words, exclude_words : list of str
        Passed through to remove_stopwords: additional words to remove and
        stopwords to keep, respectively.

    Derived columns
    ---------------
    clean      : basic_clean -> tokenize -> remove_stopwords -> lemmatize
    stemmed    : basic_clean -> stem
    lemmatized : basic_clean -> lemmatize
    '''
    # All three derived columns start from the same normalized text,
    # so run basic_clean once and reuse the result.
    normalized = df[column].apply(basic_clean)

    clean = (normalized.apply(tokenize)
                       .apply(remove_stopwords,
                              extra_words=extra_words,
                              exclude_words=exclude_words)
                       .apply(lemmatize))

    # Selecting with a list and then assign() both produce new frames,
    # so the caller's df keeps its original columns.
    return df[['title', column]].assign(
        clean=clean,
        stemmed=normalized.apply(stem),
        lemmatized=normalized.apply(lemmatize),
    )

In [56]:
# Build the clean, stemmed and lemmatized versions of the Codeup article content.
prep_article_data(codeup_df, 'content')

Unnamed: 0,title,content,clean,stemmed,lemmatized
0,Codeup’s Data Science Career Accelerator is He...,The rumors are true! The time has arrived. Cod...,rumor true time arrived codeup officially open...,the rumor are true the time ha arriv codeup ha...,the rumor are true the time ha arrived codeup ...
1,Data Science Myths - Codeup,By Dimitri Antoniou and Maggie Giust\nData Sci...,dimitri antoniou maggie giust data science big...,by dimitri antoni and maggi giust data scienc ...,by dimitri antoniou and maggie giust data scie...
2,Data Science VS Data Analytics: What’s The Dif...,"By Dimitri Antoniou\nA week ago, Codeup launch...",dimitri antoniou week ago codeup launched imme...,by dimitri antoni a week ago codeup launch our...,by dimitri antoniou a week ago codeup launched...
3,10 Tips to Crush It at the SA Tech Job Fair - ...,SA Tech Job Fair\nThe third bi-annual San Anto...,sa tech job fair third biannual san antonio te...,sa tech job fair the third biannual san antoni...,sa tech job fair the third biannual san antoni...
4,Competitor Bootcamps Are Closing. Is the Model...,Competitor Bootcamps Are Closing. Is the Model...,competitor bootcamps closing model danger prog...,competitor bootcamp are close is the model in ...,competitor bootcamps are closing is the model ...
