# Data Preparation

In [1]:
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

import acquire

## Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

* Lowercase everything
* Normalize unicode characters
* Remove anything that is not a letter, number, whitespace, or a single quote.

In [2]:
# Load the blog articles through the project's acquire module and display them.
blogs = acquire.get_all_blog_articles()
blogs

Unnamed: 0,title,content
0,From Bootcamp to Bootcamp: Two Military Vetera...,Are you a veteran or active-duty military memb...
1,How to Get Started On Any Programming Exercise,Programming is hard. Whether you’re just begin...
2,The Best Path to a Career in Data Science,"In our blog, “The Best Path To A Career In Sof..."
3,Getting Hired in a Remote Environment,As a career accelerator with a tuition refund ...
4,The Remote Codeup Student Experience,Communities across Texas have now lived in a r...
...,...,...
94,Press Release: Free Learn to Code Bootcamp for...,Press Release: Free Learn to Code Bootcamp for...
95,What The SA Tech Job Fair Says About San Antonio,What The SA Tech Job Fair Says About San Anton...
96,Why Choose Codeup?,Why Choose Codeup?Prospective students sometim...
97,Use Your Texas Unemployment Benefits at Codeup,Use Your Texas Unemployment Benefits at Codeup...


In [3]:
def basic_clean(string):
    """
    Lowercase a string, reduce it to plain ASCII, and strip punctuation.

    The text is NFKD-normalized and then encoded to ASCII with errors
    ignored, so any character that has no ASCII form after normalization is
    dropped. Finally, every character that is not a lowercase letter, a
    digit, a single quote, or whitespace is removed.
    """
    lowered = string.lower()
    # Encode with 'ignore' so non-ASCII leftovers are discarded, not raised on.
    ascii_bytes = unicodedata.normalize('NFKD', lowered).encode('ascii', 'ignore')
    ascii_text = ascii_bytes.decode('utf-8', 'ignore')
    return re.sub(r"[^a-z0-9'\s]", '', ascii_text)

In [4]:
# Use the first article's content as a sample string for trying out each function.
test = blogs.content[0]
test

'Are you a veteran or active-duty military member considering your next steps? Our alumni have been in your boots. In a recent virtual panel, two vets discussed their transition into technology careers with Codeup: Benny Fields III, a retired Air Force Master Sergeant turned Full Stack Web Developer, and Jeffery Roeder, a Navy Intelligence Analyst turned Data Scientist. Whether you’re interested in Data Science or Web Development, here are some key takeaways from the event.\xa0Why Codeup?“The GI Bill was a huge plus, but the icing on the cake was the placement program.” – Benny FieldsAfter retiring from the Air Force, Benny Fields took a job as a technical writer, but he quickly became more interested in the software he was writing about than the writing itself. His friend suggested looking into a coding bootcamp, which he did. He liked that Codeup accepts the GI Bill and the icing on the cake for him was learning about the work our student placement team does to get you hired.What doe

In [5]:
# Try basic_clean on the sample article.
basic_clean(test)

'are you a veteran or activeduty military member considering your next steps our alumni have been in your boots in a recent virtual panel two vets discussed their transition into technology careers with codeup benny fields iii a retired air force master sergeant turned full stack web developer and jeffery roeder a navy intelligence analyst turned data scientist whether youre interested in data science or web development here are some key takeaways from the event why codeupthe gi bill was a huge plus but the icing on the cake was the placement program  benny fieldsafter retiring from the air force benny fields took a job as a technical writer but he quickly became more interested in the software he was writing about than the writing itself his friend suggested looking into a coding bootcamp which he did he liked that codeup accepts the gi bill and the icing on the cake for him was learning about the work our student placement team does to get you hiredwhat does codeups student placement

## Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [6]:
def tokenize(string):
    """
    Tokenize a string with NLTK's ToktokTokenizer.

    return_str=True is passed to the tokenizer, so the result comes back as
    one string rather than a list of tokens.
    """
    toktok = ToktokTokenizer()
    tokenized = toktok.tokenize(string, return_str=True)
    return tokenized

In [7]:
# Tokenize the cleaned sample article.
tokenize(basic_clean(test))

'are you a veteran or activeduty military member considering your next steps our alumni have been in your boots in a recent virtual panel two vets discussed their transition into technology careers with codeup benny fields iii a retired air force master sergeant turned full stack web developer and jeffery roeder a navy intelligence analyst turned data scientist whether youre interested in data science or web development here are some key takeaways from the event why codeupthe gi bill was a huge plus but the icing on the cake was the placement program benny fieldsafter retiring from the air force benny fields took a job as a technical writer but he quickly became more interested in the software he was writing about than the writing itself his friend suggested looking into a coding bootcamp which he did he liked that codeup accepts the gi bill and the icing on the cake for him was learning about the work our student placement team does to get you hiredwhat does codeups student placement 

## Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [8]:
def stem(string):
    """
    Apply Porter stemming to every whitespace-separated word in a string.

    Parameters
    ----------
    string : str
        Text to stem; it is split on whitespace.

    Returns
    -------
    str
        The stemmed words joined back together with single spaces.
    """
    # Reach the stemmer through nltk.stem, the same package lemmatize uses,
    # instead of the nltk.porter alias that only exists via star-imports.
    stemmer = nltk.stem.PorterStemmer()
    return ' '.join(stemmer.stem(word) for word in string.split())

In [9]:
# Stem the cleaned, tokenized sample article.
stem(tokenize(basic_clean(test)))

'are you a veteran or activeduti militari member consid your next step our alumni have been in your boot in a recent virtual panel two vet discuss their transit into technolog career with codeup benni field iii a retir air forc master sergeant turn full stack web develop and jefferi roeder a navi intellig analyst turn data scientist whether your interest in data scienc or web develop here are some key takeaway from the event whi codeupth gi bill wa a huge plu but the ice on the cake wa the placement program benni fieldsaft retir from the air forc benni field took a job as a technic writer but he quickli becam more interest in the softwar he wa write about than the write itself hi friend suggest look into a code bootcamp which he did he like that codeup accept the gi bill and the ice on the cake for him wa learn about the work our student placement team doe to get you hiredwhat doe codeup student placement team dotheyl give you everi imagin tool to get place they have ton of connect it 

## Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [10]:
def lemmatize(string):
    """
    Lemmatize every whitespace-separated word in a string with WordNet.

    Each word is passed to the lemmatizer on its own, with no part-of-speech
    argument, and the results are joined back together with single spaces.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return ' '.join(map(lemmatizer.lemmatize, string.split()))

In [11]:
# Lemmatize the cleaned, tokenized sample article.
lemmatize(tokenize(basic_clean(test)))

'are you a veteran or activeduty military member considering your next step our alumnus have been in your boot in a recent virtual panel two vet discussed their transition into technology career with codeup benny field iii a retired air force master sergeant turned full stack web developer and jeffery roeder a navy intelligence analyst turned data scientist whether youre interested in data science or web development here are some key takeaway from the event why codeupthe gi bill wa a huge plus but the icing on the cake wa the placement program benny fieldsafter retiring from the air force benny field took a job a a technical writer but he quickly became more interested in the software he wa writing about than the writing itself his friend suggested looking into a coding bootcamp which he did he liked that codeup accepts the gi bill and the icing on the cake for him wa learning about the work our student placement team doe to get you hiredwhat doe codeups student placement team dotheyll

## Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.

## This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [12]:
def remove_stopwords(string, extra_words=None, exclude_words=None):
    """
    Remove English stopwords from a whitespace-separated string.

    Parameters
    ----------
    string : str
        Text to filter; it is split on whitespace.
    extra_words : iterable of str, optional
        Additional words to treat as stopwords.
    exclude_words : iterable of str, optional
        Words that must be kept even if they are stopwords. Entries that are
        not stopwords are ignored rather than raising an error.

    Returns
    -------
    str
        The remaining words, in their original order, joined by single spaces.
    """
    # A set gives constant-time membership checks for every token.
    stopword_set = set(stopwords.words('english'))
    stopword_set.update(extra_words or ())
    # difference_update tolerates words that are absent, unlike list.remove.
    stopword_set.difference_update(exclude_words or ())

    return ' '.join(word for word in string.split() if word not in stopword_set)

In [13]:
# Remove stopwords from the cleaned, tokenized, lemmatized sample article.
remove_stopwords(lemmatize(tokenize(basic_clean(test))))

'veteran activeduty military member considering next step alumnus boot recent virtual panel two vet discussed transition technology career codeup benny field iii retired air force master sergeant turned full stack web developer jeffery roeder navy intelligence analyst turned data scientist whether youre interested data science web development key takeaway event codeupthe gi bill wa huge plus icing cake wa placement program benny fieldsafter retiring air force benny field took job technical writer quickly became interested software wa writing writing friend suggested looking coding bootcamp liked codeup accepts gi bill icing cake wa learning work student placement team doe get hiredwhat doe codeups student placement team dotheyll give every imaginable tool get placed ton connection crazy college arent gonna jeff roederwell buff resume set mock interview give knowhow nail interview get job offer dress say thank letter jeff said best well give every imaginable tool get placed new career b

## Define a function named prepare_article_data that takes in the articles (here, a DataFrame and the name of its text column), applies the preparation functions above to each one, and returns the transformed data.

In [14]:
# Show the articles frame again before the prepared columns are added.
blogs

Unnamed: 0,title,content
0,From Bootcamp to Bootcamp: Two Military Vetera...,Are you a veteran or active-duty military memb...
1,How to Get Started On Any Programming Exercise,Programming is hard. Whether you’re just begin...
2,The Best Path to a Career in Data Science,"In our blog, “The Best Path To A Career In Sof..."
3,Getting Hired in a Remote Environment,As a career accelerator with a tuition refund ...
4,The Remote Codeup Student Experience,Communities across Texas have now lived in a r...
...,...,...
94,Press Release: Free Learn to Code Bootcamp for...,Press Release: Free Learn to Code Bootcamp for...
95,What The SA Tech Job Fair Says About San Antonio,What The SA Tech Job Fair Says About San Anton...
96,Why Choose Codeup?,Why Choose Codeup?Prospective students sometim...
97,Use Your Texas Unemployment Benefits at Codeup,Use Your Texas Unemployment Benefits at Codeup...


In [22]:
def prepare_article_data(df, column):
    """
    Add 'stemmed', 'lemmatized' and 'clean' columns derived from df[column].

    The source column is run through basic_clean and then tokenize once, and
    each new column is produced from that shared result: stem for 'stemmed',
    lemmatize for 'lemmatized', and remove_stopwords for 'clean'. The columns
    are assigned onto the frame that was passed in, which is also returned.
    """
    cleaned = df[column].apply(basic_clean)
    tokenized = cleaned.apply(tokenize)

    # Tuple order fixes the order in which the new columns are added.
    for new_column, transform in (
        ('stemmed', stem),
        ('lemmatized', lemmatize),
        ('clean', remove_stopwords),
    ):
        df[new_column] = tokenized.apply(transform)

    return df

In [23]:
# Prepare the 'content' column; the new columns are assigned onto blogs itself.
prepare_article_data(blogs, 'content')

Unnamed: 0,title,content,stemmed,lemmatized,clean
0,From Bootcamp to Bootcamp: Two Military Vetera...,Are you a veteran or active-duty military memb...,are you a veteran or activeduti militari membe...,are you a veteran or activeduty military membe...,veteran activeduty military member considering...
1,How to Get Started On Any Programming Exercise,Programming is hard. Whether you’re just begin...,program is hard whether your just begin to lea...,programming is hard whether youre just beginni...,programming hard whether youre beginning learn...
2,The Best Path to a Career in Data Science,"In our blog, “The Best Path To A Career In Sof...",in our blog the best path to a career in softw...,in our blog the best path to a career in softw...,blog best path career software development loo...
3,Getting Hired in a Remote Environment,As a career accelerator with a tuition refund ...,as a career acceler with a tuition refund guar...,a a career accelerator with a tuition refund g...,career accelerator tuition refund guarantee al...
4,The Remote Codeup Student Experience,Communities across Texas have now lived in a r...,commun across texa have now live in a remot en...,community across texas have now lived in a rem...,communities across texas lived remote environm...
...,...,...,...,...,...
94,Press Release: Free Learn to Code Bootcamp for...,Press Release: Free Learn to Code Bootcamp for...,press releas free learn to code bootcamp for v...,press release free learn to code bootcamp for ...,press release free learn code bootcamp veteran...
95,What The SA Tech Job Fair Says About San Antonio,What The SA Tech Job Fair Says About San Anton...,what the sa tech job fair say about san antoni...,what the sa tech job fair say about san antoni...,sa tech job fair says san antonio last night c...
96,Why Choose Codeup?,Why Choose Codeup?Prospective students sometim...,whi choos codeupprospect student sometim ask a...,why choose codeupprospective student sometimes...,choose codeupprospective students sometimes as...
97,Use Your Texas Unemployment Benefits at Codeup,Use Your Texas Unemployment Benefits at Codeup...,use your texa unemploy benefit at codeupelig a...,use your texas unemployment benefit at codeupe...,use texas unemployment benefits codeupeligible...
