# Data Acquisition

In [6]:
from requests import get
from bs4 import BeautifulSoup
import os

In [7]:
# looking at an article from Codeup's blog
url = 'https://codeup.com/data-science/math-in-data-science/'
headers = {'User-Agent': 'Codeup Data Science'} # some website don't accept the python-request
response = get(url, headers=headers)

In [9]:
# perform a quick check to make sure we are looking at html 
print(response.text[:400])

<!DOCTYPE html>
<html lang="en-US">
<head>
	<meta charset="UTF-8" />
<meta http-equiv="X-UA-Compatible" content="IE=edge">
	<link rel="pingback" href="https://codeup.edu/xmlrpc.php" />

	<script type="text/javascript">
		document.documentElement.className = 'js';
	</script>
	
	<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin /><script id="diviarea-loader">window.DiviPopupData=wi


In [16]:
# look at the website and inspect its HTML tag and identify the content; class and id
soup = BeautifulSoup(response.content, 'html.parser')

##### beautiful soup methods and properties

* soup.title.string gets the page's title (the same text in the browser tab for a page, this is the < title> element
  
* soup.prettify() is useful to print in case you want to see the HTML
    
* soup.find_all("a") find all the anchor tags, or whatever argument is specified.
    
* soup.find("h1") finds the first matching element
    
* soup.get_text() gets the text from within a matching piece of soup/HTML
    
* The soup.select() method takes in a CSS selector as a string and returns all matching elements.

see also `soup.find_all`

beautiful soup uses `class_` as the keyword argument for searching for a class because `class` is a reserved word in python

we'll use the class name that we identified from looking in the inspector in chrome

In [18]:
article = soup.find('div', id='main-content')
article

<div id="main-content">
<div class="container">
<div class="clearfix" id="content-area">
<div id="left-area">
<article class="et_pb_post post-6579 post type-post status-publish format-standard has-post-thumbnail hentry category-data-science" id="post-6579">
<div class="et_post_meta_wrapper">
<h1 class="entry-title">What are the Math and Stats Principles You Need for Data Science?</h1>
<p class="post-meta"><span class="published">Oct 21, 2020</span> | <a href="https://codeup.edu/category/data-science/" rel="category tag">Data Science</a></p><img alt="Is there math in data science? Read our blog to learn how much math you need!" class="" height="675" sizes="(min-width: 0px) and (max-width: 480px) 480px, (min-width: 481px) and (max-width: 980px) 980px, (min-width: 981px) 1080px, 100vw" src="https://codeup.edu/wp-content/uploads/2020/10/Blog_MathStatsReg_1200x628-1-1080x628.png" srcset="https://codeup.edu/wp-content/uploads/2020/10/Blog_MathStatsReg_1200x628-1-1080x628.png 1080w, https://c

In [20]:
# store article text for future use 
with open('article.txt', 'w') as f: #this create a file name article.txt in write mode ('w')
    f.write(article.text)   #write the text content of the article to file 

##### now we will wrap all the code above into a function in acquire.py

# Data Preparation

lower all text | remove non-ASCII characters | remove special characters | stem or lemmatize the words | Remove stopwords | store the clean text for future use 

In [24]:
import nltk; nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lordvoldemort/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [26]:
import unicodedata
import re
import json
import nltk

from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

#import acquire if acquire exist

In [27]:
article = "Coming into our Data Science program, you will need to know some math and \
stats. However, many of our applicants actually learn in the application process – you \
don’t need to be an expert before applying! Data science is a very accessible field to \
anyone dedicated to learning new skills, and we can work with any applicant to help them \
learn what they need to know. But what “skills” do we mean, exactly? Just what exactly \
are the data science math and stats principles you need to know?', 'What are the main \
math principles you need to know to get into Codeup’s Data Science program?'"
article

"Coming into our Data Science program, you will need to know some math and stats. However, many of our applicants actually learn in the application process – you don’t need to be an expert before applying! Data science is a very accessible field to anyone dedicated to learning new skills, and we can work with any applicant to help them learn what they need to know. But what “skills” do we mean, exactly? Just what exactly are the data science math and stats principles you need to know?', 'What are the main math principles you need to know to get into Codeup’s Data Science program?'"

In [28]:
# convert text to all lower case for normalcy
article.lower()

"coming into our data science program, you will need to know some math and stats. however, many of our applicants actually learn in the application process – you don’t need to be an expert before applying! data science is a very accessible field to anyone dedicated to learning new skills, and we can work with any applicant to help them learn what they need to know. but what “skills” do we mean, exactly? just what exactly are the data science math and stats principles you need to know?', 'what are the main math principles you need to know to get into codeup’s data science program?'"

In [32]:
#remove accented character
# 1. unicodedata.normalize removes any inconsistencies in unicode character encoding.
# 2. .encode to convert the resulting string to the ASCII character set. We'll ignore any errors in conversion, meaning we'll drop anything that isn't an ASCII character.
# 3. .decode to turn the resulting bytes object back into a string.

article = unicodedata.normalize('NFKD', article)\
    .encode('ascii', 'ignore')\
    .decode('utf-8', 'ignore')
article[0:100]

'Coming into our Data Science program, you will need to know some math and stats. However, many of ou'

In [35]:
# removing special characters that is not a-z, a number, a single quote, or whitespace
article = re.sub(r"[^a-z0-9'\s]",'', article)

In [37]:
# Tokenization by breaking the words or punctuation into discrete units 
tokenizer = nltk.tokenize.ToktokTokenizer()
tokenizer.tokenize(article,return_str=True)[0:100]

'oming into our ata cience program you will need to know some math and stats owever many of our appli'

In [44]:
# stemming ; may not produce the right english 
ps = nltk.porter.PorterStemmer()
ps.stem(article)[:100]

'oming into our ata cience program you will need to know some math and stats owever many of our appli'

In [49]:
# use the code above to transform all in the article
stems = [ps.stem(word) for word in article.split()]
article_stemmed=' '.join(stems)
article_stemmed[:100]

'ome into our ata cienc program you will need to know some math and stat owev mani of our applic actu'

In [51]:
# count how many was stemmed
pd.Series(stems).value_counts().head(100)

to          9
need        5
you         4
learn       3
and         3
           ..
veri        1
access      1
field       1
anyon       1
program'    1
Length: 63, dtype: int64

In [53]:
# lemmatization is slower but produces english words
wnl = nltk.stem.WordNetLemmatizer()

sentence = "He was running and eating at same time. He has bad habit of swimming after playing long hours in the Sun."

for word in sentence.split():
    print('stem:', ps.stem(word), '-- lemma:', wnl.lemmatize(word))


stem: he -- lemma: He
stem: wa -- lemma: wa
stem: run -- lemma: running
stem: and -- lemma: and
stem: eat -- lemma: eating
stem: at -- lemma: at
stem: same -- lemma: same
stem: time. -- lemma: time.
stem: he -- lemma: He
stem: ha -- lemma: ha
stem: bad -- lemma: bad
stem: habit -- lemma: habit
stem: of -- lemma: of
stem: swim -- lemma: swimming
stem: after -- lemma: after
stem: play -- lemma: playing
stem: long -- lemma: long
stem: hour -- lemma: hour
stem: in -- lemma: in
stem: the -- lemma: the
stem: sun. -- lemma: Sun.


In [59]:
# apply lemmatization to the entire documents 
lemmas = [wnl.lemmatize(word) for word in article.split()]
article_lemmatized = ' '.join(lemmas)

article_lemmatized[0:100]

'oming into our ata cience program you will need to know some math and stats owever many of our appli'

In [60]:
pd.Series(lemmas).value_counts()[:10]

to         9
need       5
you        4
ata        3
the        3
and        3
what       3
math       3
know       3
exactly    2
dtype: int64

In [61]:
# removing stopwords 


# Data Exploration

# Modeling