## Cleaning Text

Strip leading and trailing whitespace from each string in `text_data`.

In [1]:
# Raw titles with inconsistent leading/trailing padding.
text_data = [
    "  Interrobang. By aishwarya Henriette    ",
    "Parking And Going. By Karl Gautier",
    "    Today Is The night. By Jarek Prakash",
]

# Trim the padding from both ends of every entry; text_data itself is left untouched.
list(map(str.strip, text_data))


['Interrobang. By aishwarya Henriette',
 'Parking And Going. By Karl Gautier',
 'Today Is The night. By Jarek Prakash']

In [2]:
# Drop every period from each title and keep the result under its own name.
text_data_nopoints = [
    title.replace('.', '')
    for title in text_data
]


In [3]:
# Display the period-free titles already stored in text_data_nopoints
# rather than rebuilding the identical list a second time.
text_data_nopoints

['  Interrobang By aishwarya Henriette    ',
 'Parking And Going By Karl Gautier',
 '    Today Is The night By Jarek Prakash']

Convert the text to uppercase

In [4]:
# Upper-case every title; surrounding whitespace and punctuation are left as they are.
list(map(str.upper, text_data))

['  INTERROBANG. BY AISHWARYA HENRIETTE    ',
 'PARKING AND GOING. BY KARL GAUTIER',
 '    TODAY IS THE NIGHT. BY JAREK PRAKASH']

### See Also
* Beginners Tutorial for Regular Expressions in Python (https://www.analyticsvidhya.com/blog/2015/06/regular-expression-python/)

## Parsing and Cleaning HTML
Use Beautiful Soup to extract the full name from the provided HTML

In [5]:
# Beautiful Soup is a Python library that parses HTML. It is useful for web scraping.

from bs4 import BeautifulSoup

# Minimal HTML snippet: a <div> holding a bold first name followed by the surname.
html = (
    "\n"
    "    <div class='full_name'><span style='font-weight:bold'>Yan</span> Chin</div>\n"
)

In [6]:
# Parse the snippet using the "html.parser" backend.
soup = BeautifulSoup(markup=html, features="html.parser")

# Print the <div> whose class is "full_name", then let the cell display the first <span>.
full_name_div = soup.find("div", attrs={"class": "full_name"})
print(full_name_div)
soup.find("span")

<div class="full_name"><span style="font-weight:bold">Yan</span> Chin</div>


<span style="font-weight:bold">Yan</span>

In [7]:
# All text contained in the parsed document, with the markup stripped.
soup.get_text()

'\nYan Chin\n'

### See Also
* Beautiful Soup documentation (https://www.crummy.com/software/BeautifulSoup/bs4/doc/)

## Removing Punctuation
Remove punctuation from the provided text

In [8]:
import unicodedata
import sys
import re

text_data = ['Hi! I. Love. This. Song.....', '10000% Agree!!!! #LoveIT', 'Right?!?!']

# Matches anything that is neither an ASCII letter nor whitespace (\s).
# The upper-case range must be A-Z: an A-z range also spans the characters
# [ \ ] ^ _ ` that sit between 'Z' and 'a', so those would never be removed.
non_letter_pattern = re.compile(r"[^a-zA-Z\s]")

# Join the documents into one string, drop digits and punctuation, then lower-case.
# (Use r"[^\w\s]" instead to keep digits and underscores: \w matches word characters.)
text_sin = non_letter_pattern.sub("", " ".join(text_data)).lower()
text_sin

'hi i love this song  agree loveit right'

In [9]:

# Strip punctuation from each document separately: \w keeps letters, digits and
# underscores, \s keeps whitespace. The loop variable is named `document` so the
# built-in `str` type is not shadowed inside the comprehension.
text_sin_pts = [re.sub(r"[^\w\s]", "", document) for document in text_data]
text_sin_pts

['Hi I Love This Song', '10000 Agree LoveIT', 'Right']

## Tokenizing Text

In [10]:
from nltk.tokenize import word_tokenize
string = "The science of today is the technology of tomorrow"

# Split the sentence into word-level tokens and display them.
l = word_tokenize(text=string)
l

['The', 'science', 'of', 'today', 'is', 'the', 'technology', 'of', 'tomorrow']

In [11]:
from nltk.tokenize import sent_tokenize
string = "The science of today is the technology of tomorrow. Tomorrow is today"

# Split the text into sentences and display them.
sent_tokenize(text=string)

['The science of today is the technology of tomorrow.', 'Tomorrow is today']

## Removing Stop Words

In [12]:
from nltk.corpus import stopwords
import nltk
# Fetch the stop-word corpus (a no-op when it is already up to date).
nltk.download('stopwords')

tokenized_words = ['i', 'am', 'going', 'to', 'go', 'to', 'the', 'store', 'and', 'park']

stop_words = stopwords.words('english')
# Build a set once so each membership test does not scan the whole stop-word list.
stop_word_lookup = set(stop_words)

# Keep only the tokens that are not English stop words.
[word for word in tokenized_words if word not in stop_word_lookup]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pilar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['going', 'go', 'store', 'park']

## Stemming Words

In [13]:
from nltk.stem.porter import PorterStemmer

tokenized_words = ['i', 'am', 'humbled', 'by', 'this', 'traditional', 'meeting']

# Pair every token with the stem the Porter stemmer produces for it.
stemmer = PorterStemmer()
stems = map(stemmer.stem, tokenized_words)
stem_words = list(zip(tokenized_words, stems))
stem_words

[('i', 'i'),
 ('am', 'am'),
 ('humbled', 'humbl'),
 ('by', 'by'),
 ('this', 'thi'),
 ('traditional', 'tradit'),
 ('meeting', 'meet')]

In [14]:
 
# importing modules 
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize 
   
ps = PorterStemmer()

sentence = "Programers program with programing languages"
words = word_tokenize(sentence)

# Print each token next to its Porter stem, one pair per line.
for w in words:
    print(f"{w}  :  {ps.stem(w)}")

Programers  :  program
program  :  program
with  :  with
programing  :  program
languages  :  languag


### See Also
* Porter Stemming Algorithm (https://tartarus.org/martin/PorterStemmer/)





## Encoding Text as a Bag of Words

In [16]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Three toy documents. NOTE(review): 'Gremany' looks like a typo for 'Germany';
# it is kept verbatim because the recorded outputs of the following cells contain 'gremany'.
documents = ['I love Brazil. Brazil!', 'Sweden is best', 'Gremany beats both']
text_data = np.array(documents)

In [19]:
# Learn the vocabulary and count term occurrences per document, then
# convert the result to a dense array so it can be displayed.
cv = CountVectorizer(min_df=0., max_df=1.)
cv_matrix = cv.fit_transform(text_data).toarray()
cv_matrix

array([[0, 0, 0, 2, 0, 0, 1, 0],
       [0, 1, 0, 0, 0, 1, 0, 1],
       [1, 0, 1, 0, 1, 0, 0, 0]], dtype=int64)

In [20]:
# Display the input documents for comparison with the rows of the count matrix.
text_data

array(['I love Brazil. Brazil!', 'Sweden is best', 'Gremany beats both'],
      dtype='<U22')

In [24]:
# Show the learned vocabulary: a dict mapping each term to its column index.
cv.vocabulary_

{'love': 6,
 'brazil': 3,
 'sweden': 7,
 'is': 5,
 'best': 1,
 'gremany': 4,
 'beats': 0,
 'both': 2}

In [26]:
# Feature names in column order. get_feature_names() was removed in scikit-learn 1.2,
# so prefer get_feature_names_out() (available since 1.0) and fall back on older releases.
# .tolist() turns the returned array back into a plain list of strings.
(
    cv.get_feature_names_out().tolist()
    if hasattr(cv, "get_feature_names_out")
    else cv.get_feature_names()
)

['beats', 'best', 'both', 'brazil', 'gremany', 'is', 'love', 'sweden']



## Weighting Word Importance

In [33]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

text_data = np.array(['I love Brazil. Brazil!', 'Sweden is best', 'Germany beats both'])

# Create the tf-idf feature matrix; fit_transform returns a SPARSE matrix.
v = TfidfVectorizer()
v_matrix = v.fit_transform(text_data)

# Feature names in column order. get_feature_names() was removed in scikit-learn 1.2,
# so prefer get_feature_names_out() (available since 1.0) and fall back on older releases.
(
    v.get_feature_names_out().tolist()
    if hasattr(v, "get_feature_names_out")
    else v.get_feature_names()
)

['beats', 'best', 'both', 'brazil', 'germany', 'is', 'love', 'sweden']

In [34]:
# Convert to a dense array and show the result. The guard keeps this cell re-runnable:
# once converted, v_matrix is an ndarray, which has no .toarray() method.
if hasattr(v_matrix, "toarray"):
    v_matrix = v_matrix.toarray()
v_matrix

array([[0.        , 0.        , 0.        , 0.89442719, 0.        ,
        0.        , 0.4472136 , 0.        ],
       [0.        , 0.57735027, 0.        , 0.        , 0.        ,
        0.57735027, 0.        , 0.57735027],
       [0.57735027, 0.        , 0.57735027, 0.        , 0.57735027,
        0.        , 0.        , 0.        ]])

In [44]:
# Show the vocabulary: a dict mapping each term to its column index.
# Only the last expression of a cell is displayed, so nothing else is evaluated here.
v.vocabulary_


{'love': 6,
 'brazil': 3,
 'sweden': 7,
 'is': 5,
 'best': 1,
 'germany': 4,
 'beats': 0,
 'both': 2}

$$
tfidf(t, d) = tf(t, d) \times idf(t)
$$

where $t$ is a word

$d$ is a document

$$
idf(t) = \log\left(\frac{1 + n_d}{1 + df(d, t)}\right) + 1
$$

where $n_d$ is the number of documents and 

$df(d,t)$ is the document frequency of term $t$ (i.e. the number of documents in which the term appears)

### See Also
* scikit-learn documentation: tf-idf term weighting (http://scikit-learn.org/stable/modules/feature_extraction.html#tfidf-term-weighting)