# Text Cleaning

In [1]:
# In this challenge we are keeping things relatively simple, so you only need to clean up special characters, numbers,
# and URLs. Let's say you have the following messy string to clean up:

# @Ironhack's-#Q website 776-is http://ironhack.com [(2018)]")
# You will write a function, which will be part of your NLP analysis pipeline in the next challenge,
# to clean up strings like the one above and output:

# ironhack s  q website  is
# In the cell below, write a function called clean_up. Test your function with the above string and make sure you receive
# the expected output.

In [2]:
import re 
import nltk
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import word_tokenize 
from nltk.stem import PorterStemmer 

In [3]:
# Fetch the NLTK data packages used below; packages that are already installed are reported as up-to-date.
# NOTE(review): newer NLTK releases may also need 'punkt_tab' for tokenization — verify against the installed version.
nltk.download('stopwords')  # stop-word lists read via stopwords.words('english')
nltk.download('wordnet')  # presumably the data behind WordNetLemmatizer — confirm in the NLTK docs
nltk.download('punkt')  # presumably the tokenizer data behind word_tokenize — confirm in the NLTK docs

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pbsil\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\pbsil\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pbsil\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Sample messy string from the challenge description; the apostrophe is escaped because the literal is single-quoted.
text = '@Ironhack\'s-#Q website 776-is http://ironhack.com [(2018)]")'

In [5]:

def clean_up(c):
    """Reduce a raw string to lowercase a-z words separated by single spaces.

    The text is lowercased, http/https URLs are removed, and every remaining
    run of the letters a-z is kept. Digits, punctuation and any other
    characters only act as separators between words.

    Args:
        c: The raw string to clean.

    Returns:
        The surviving words joined by single spaces ('' when nothing survives).
    """
    lowered = c.lower()
    # Raw string: "\S" is not a valid Python string escape, so the non-raw
    # form triggers an invalid-escape warning. Requiring "://" limits the
    # removal to real URLs, so plain words that merely start with "http"
    # (such as "https") are no longer deleted.
    without_urls = re.sub(r'https?://\S+', '', lowered)
    return ' '.join(re.findall(r'[a-z]+', without_urls))

In [6]:
# Clean the sample string and keep the result so the next cell can display it.
cleaned_up = clean_up(text)

In [7]:
cleaned_up

'ironhack s q website is'

# Tokenization

In [8]:
#We have actually discussed the concept of tokenization in the Bag of Words lab before. In that lab, we did both tokenization
#and calculated the matrix of document-term frequency. In this lab, we only need tokenization.

#In the cell below, write a function called tokenize to convert a string to a list of words. We'll use the string we received
# in the previous step ("ironhack s  q website  is") to test your function. Your function should return:

#['ironhack', 's', 'q', 'website', 'is']

In [9]:
def tokenize(t):
    """Split a string into word tokens with NLTK's word tokenizer.

    Args:
        t: The text to tokenize.

    Returns:
        The list of tokens produced by ``word_tokenize`` for ``t``.
    """
    tokens = word_tokenize(t)
    return tokens
    
# Clean the sample string, then tokenize it; the final expression is what the cell displays.
sentence = clean_up(text)

tokenize(sentence)

['ironhack', 's', 'q', 'website', 'is']

# Stemming and Lemmatization

In [10]:
# In the cell below, import the necessary libraries and define a function called stem_and_lemmatize that performs both 
# stemming and lemmatization on a list of words. Don't worry about the POS part of lemmatization for now.

In [11]:
def stem_and_lemmatize(l):
    """Lemmatize and then stem every word in a list.

    Each word is passed to ``WordNetLemmatizer.lemmatize`` (no POS tag is
    supplied) and the lemma is then passed to ``PorterStemmer.stem``.

    Args:
        l: Iterable of word strings.

    Returns:
        A new list holding the processed words in input order.
    """
    wordnet = WordNetLemmatizer()
    porter = PorterStemmer()

    # Lemmatize first, then stem the lemma.
    return [porter.stem(wordnet.lemmatize(token)) for token in l]

# Tokenize the cleaned sentence, then stem and lemmatize the tokens; the last line displays the result.
token_list = tokenize(sentence)
stem_lem = stem_and_lemmatize(token_list)
stem_lem

['ironhack', 's', 'q', 'websit', 'is']

# Stop Words Removal

In [12]:
# Now in the cell below, create a function called remove_stopwords that loops through a list of words that have been stemmed
# and lemmatized to check for and remove stop words. Return a new list where stop words have been removed.


In [13]:
def remove_stopwords(l, stop_words=None):
    """Return the words of ``l`` that are not stop words.

    Args:
        l: Iterable of word strings.
        stop_words: Optional collection of words to drop. When omitted, the
            English list from ``stopwords.words('english')`` is loaded, which
            is the original behavior. Passing a prepared collection lets a
            caller avoid reloading that list on every call.

    Returns:
        A new list containing the words that are not in the stop-word
        collection, in their original order.

    NOTE(review): matching is by exact string equality, so a stop word whose
    spelling was changed by an earlier stemming step will not be removed —
    confirm whether that matters for the pipeline.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')

    # Convert once to a set so each membership test is a hash lookup.
    stop_set = set(stop_words)

    return [w for w in l if w not in stop_set]

# Filter the stemmed and lemmatized sample tokens; the returned list is the cell's output.
remove_stopwords(stem_lem)



['ironhack', 'q', 'websit']