##### **T1.** Pick an article from wiki (using requests package or any other) in XML or HTML format and write a simple parser using regular expressions that does following:
1. Replace all boldface and italic tags <b></b> and  <i></i> with the content between them. E.g. "<b>machine learning</b>" should become "machine learning".
2. Replace all hyperref tags with the content inside: e.g. "<a href=...>content</a>" should become "content".
3. Do same as in 2 for <span ...><\span> and <sup></sup> tags.

In [2]:
import re
import requests

In [2]:
def clean_text(paragraph, n):
    p = paragraph[n]

    href_pattern = r"<a href=[^>]+>([^<]+)</a>"
    p = re.sub(href_pattern, r"\1", p)

    boldface_pattern = r"<b>([^<]+)</b>"
    p = re.sub(boldface_pattern, r"\1", p)

    italic_pattern = r"<i>([^<]+)</i>"
    p = re.sub(italic_pattern, r"\1", p)

    span_pattern = r"<span\s[^<]+</span>"
    p = re.sub(span_pattern, "", p)

    sup_pattern = r"<sup\s[^<]+</sup>"
    p = re.sub(sup_pattern, "", p)

    return p

In [3]:
r = requests.get(url="https://en.wikipedia.org/wiki/Machine_learning")
wiki_text = r.text

In [4]:
paragraph = re.findall(r"<p>(.*)\n", wiki_text)

In [12]:
paragraphs_num = range(50)

with open('text.txt', 'w') as f:
    for n in paragraphs_num:
        p = clean_text(paragraph, n)

        #print('BEFORE: ', paragraph[n])
        #print('AFTER: ', p)

        f.write(p)
        #f.write('\n')

##### **T2.** Use regular expressions to split text into sentences and sentences into tokens. Compare your results with the results of tokenizer from NLTK or SpaCy.

In [4]:
from nltk.tokenize import sent_tokenize, word_tokenize 
import spacy

In [9]:
p = clean_text(paragraph, 0)
p

'Machine learning (ML) is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can learn from data and generalize to unseen data, and thus perform tasks without explicit instructions. Recently, artificial neural networks have been able to surpass many previous approaches in performance.'

In [3]:
sent_my = "I don't know about New York"
p_word_regex = re.findall(r"([\w]+|\(|\)|\.|,)", sent_my)
print(p_word_regex) 

['I', 'don', 't', 'know', 'about', 'New', 'York']


In [5]:
p_word_nltk = word_tokenize(sent_my)
print(p_word_nltk)

['I', 'do', "n't", 'know', 'about', 'New', 'York']


In [None]:
p_sent_regex = re.split(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s", sent_my)
print(p_sent_regex) 

In [70]:
# regex
# sentences
p_sent_regex = re.split(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s", p)
print(p_sent_regex) 

# words
p_word_regex = re.findall(r"([\w]+|\(|\)|\.|,)", p_sent_regex[0])
print(p_word_regex) 
print(len(p_word_regex))

['Machine learning (ML) is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can learn from data and generalize to unseen data, and thus perform tasks without explicit instructions.', 'Recently, artificial neural networks have been able to surpass many previous approaches in performance.']
['Machine', 'learning', '(', 'ML', ')', 'is', 'a', 'field', 'of', 'study', 'in', 'artificial', 'intelligence', 'concerned', 'with', 'the', 'development', 'and', 'study', 'of', 'statistical', 'algorithms', 'that', 'can', 'learn', 'from', 'data', 'and', 'generalize', 'to', 'unseen', 'data', ',', 'and', 'thus', 'perform', 'tasks', 'without', 'explicit', 'instructions', '.']
41


In [45]:
# NLTK
# sentences
p_sent_nltk = sent_tokenize(p)
print(p_sent_nltk) 

# words
p_word_nltk = word_tokenize(p_sent_nltk[0])
print(p_word_nltk) 
print(len(p_word_nltk))

['Machine learning (ML) is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can learn from data and generalize to unseen data, and thus perform tasks without explicit instructions.', 'Recently, artificial neural networks have been able to surpass many previous approaches in performance.']
['Machine', 'learning', '(', 'ML', ')', 'is', 'a', 'field', 'of', 'study', 'in', 'artificial', 'intelligence', 'concerned', 'with', 'the', 'development', 'and', 'study', 'of', 'statistical', 'algorithms', 'that', 'can', 'learn', 'from', 'data', 'and', 'generalize', 'to', 'unseen', 'data', ',', 'and', 'thus', 'perform', 'tasks', 'without', 'explicit', 'instructions', '.']
41


In [43]:
# SpaCy
nlp_spacy = spacy.load('en_core_web_sm')

# sentences
p_spicy = nlp_spacy(p)
p_sent_spicy = [sent for sent in p_spicy.sents]
print(p_sent_spicy) 

# words
p_word_spicy = [token.text for token in p_sent_spicy[0]]
print(p_word_spicy) 
print(len(p_word_spicy))

[Machine learning (ML) is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can learn from data and generalize to unseen data, and thus perform tasks without explicit instructions., Recently, artificial neural networks have been able to surpass many previous approaches in performance.]
['Machine', 'learning', '(', 'ML', ')', 'is', 'a', 'field', 'of', 'study', 'in', 'artificial', 'intelligence', 'concerned', 'with', 'the', 'development', 'and', 'study', 'of', 'statistical', 'algorithms', 'that', 'can', 'learn', 'from', 'data', 'and', 'generalize', 'to', 'unseen', 'data', ',', 'and', 'thus', 'perform', 'tasks', 'without', 'explicit', 'instructions', '.']
41


##### **T3.** Normalize your text using NLTK or SpaCy. 

In [86]:
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [None]:
p = clean_text(paragraph, 0)

In [83]:
# NLTK
# sentences
p_sent_nltk = sent_tokenize(p)
print(p_sent_nltk) 

# words
p_word_nltk = []
for sentence in p_sent_nltk:
    p_word_nltk += [word for word in word_tokenize(sentence)]

print(p_word_nltk) 
print(len(p_word_nltk))

['Machine learning (ML) is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can learn from data and generalize to unseen data, and thus perform tasks without explicit instructions.', 'Recently, artificial neural networks have been able to surpass many previous approaches in performance.']
['Machine', 'learning', '(', 'ML', ')', 'is', 'a', 'field', 'of', 'study', 'in', 'artificial', 'intelligence', 'concerned', 'with', 'the', 'development', 'and', 'study', 'of', 'statistical', 'algorithms', 'that', 'can', 'learn', 'from', 'data', 'and', 'generalize', 'to', 'unseen', 'data', ',', 'and', 'thus', 'perform', 'tasks', 'without', 'explicit', 'instructions', '.', 'Recently', ',', 'artificial', 'neural', 'networks', 'have', 'been', 'able', 'to', 'surpass', 'many', 'previous', 'approaches', 'in', 'performance', '.']
57


In [89]:
# NLTK
# stemming
stemmer = PorterStemmer()
stemmed_words = [stemmer.stem(word) for word in p_word_nltk]
print(stemmed_words)
print(len(stemmed_words))
#stemmed_paragraphs = [' '.join(stemmed_words[i:i+10])for i in range(0, len(stemmed_words), 10)]
#print(stemmed_paragraphs)

# lemmatization 
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word) for word in stemmed_words if word not in set (stopwords.words('english'))]
print(lemmatized_words)
print(len(lemmatized_words))


['machin', 'learn', '(', 'ml', ')', 'is', 'a', 'field', 'of', 'studi', 'in', 'artifici', 'intellig', 'concern', 'with', 'the', 'develop', 'and', 'studi', 'of', 'statist', 'algorithm', 'that', 'can', 'learn', 'from', 'data', 'and', 'gener', 'to', 'unseen', 'data', ',', 'and', 'thu', 'perform', 'task', 'without', 'explicit', 'instruct', '.', 'recent', ',', 'artifici', 'neural', 'network', 'have', 'been', 'abl', 'to', 'surpass', 'mani', 'previou', 'approach', 'in', 'perform', '.']
57
['machin', 'learn', '(', 'ml', ')', 'field', 'studi', 'artifici', 'intellig', 'concern', 'develop', 'studi', 'statist', 'algorithm', 'learn', 'data', 'gener', 'unseen', 'data', ',', 'thu', 'perform', 'task', 'without', 'explicit', 'instruct', '.', 'recent', ',', 'artifici', 'neural', 'network', 'abl', 'surpass', 'mani', 'previou', 'approach', 'perform', '.']
39


In [95]:
# NLTK
p_sent_nltk_normalized = []

for i in range(len(p_sent_nltk)):
    p_word_nltk = [word for word in word_tokenize(p_sent_nltk[i])]
    # stemming
    stemmer = PorterStemmer()
    stemmed_words = [stemmer.stem(word) for word in p_word_nltk]

    # lemmatization 
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in stemmed_words if word not in set (stopwords.words('english'))]

    p_sent_nltk_normalized.append(' '.join(lemmatized_words))

print(p_sent_nltk_normalized)

['machin learn ( ml ) field studi artifici intellig concern develop studi statist algorithm learn data gener unseen data , thu perform task without explicit instruct .', 'recent , artifici neural network abl surpass mani previou approach perform .']
