# Noise Removal

 * remove digits from a string.
 * remove hyperlinks from a string.
 * expand the contractions of a string.
 * strip punctuation marks and special characters of a string.
 * remove emoticons from a string.
 * remove whitespaces from a string.
 * correct spelling errors in a string.

In [None]:
!pip install emoji-data-python

In [None]:
!pip install autocorrect

In [3]:
import nltk
import string
import re
import emoji_data_python
from autocorrect import Speller
spell = Speller(lang='en')

In [None]:
# Look up emoji whose name contains 'flag'.
print(emoji_data_python.find_by_name('flag'))
# Look up emoji by short name; for 'flag' this lists the country flags.
print(emoji_data_python.find_by_shortname('flag'))
# Convert a unified code point string to its character
# (presumably the U+1F600 emoji -- its output is not shown below, confirm by running).
print(emoji_data_python.unified_to_char('1F600'))

[EmojiChar("CROSSED FLAGS"), EmojiChar("CHEQUERED FLAG"), EmojiChar("RAINBOW FLAG"), EmojiChar("TRANSGENDER FLAG"), EmojiChar("WHITE FLAG"), EmojiChar("PIRATE FLAG"), EmojiChar("WAVING BLACK FLAG"), EmojiChar("CLOSED MAILBOX WITH LOWERED FLAG"), EmojiChar("CLOSED MAILBOX WITH RAISED FLAG"), EmojiChar("OPEN MAILBOX WITH RAISED FLAG"), EmojiChar("OPEN MAILBOX WITH LOWERED FLAG"), EmojiChar("TRIANGULAR FLAG ON POST"), EmojiChar("FLAG IN HOLE")]
[EmojiChar("Lesotho Flag"), EmojiChar("Seychelles Flag"), EmojiChar("Lithuania Flag"), EmojiChar("Sudan Flag"), EmojiChar("Luxembourg Flag"), EmojiChar("Sweden Flag"), EmojiChar("Latvia Flag"), EmojiChar("Singapore Flag"), EmojiChar("Libya Flag"), EmojiChar("St. Helena Flag"), EmojiChar("Morocco Flag"), EmojiChar("Slovenia Flag"), EmojiChar("Monaco Flag"), EmojiChar("Svalbard & Jan Mayen Flag"), EmojiChar("Moldova Flag"), EmojiChar("Slovakia Flag"), EmojiChar("Montenegro Flag"), EmojiChar("Sierra Leone Flag"), EmojiChar("St. Martin Flag"), EmojiCha

### Remove noise in ONE GO

In [4]:
def clean_text(text):
  """Strip noise from ``text`` in one call and return the cleaned string.

  Steps, applied in order: drop digits, trim leading/trailing whitespace,
  drop URLs, expand the "'re" and "'m" contractions, drop punctuation and
  special characters, drop emoji, collapse runs of whitespace, and finally
  correct spelling with the module-level ``spell`` object.
  """
  # (pattern, replacement) pairs. The order matters: contractions have to be
  # expanded before the apostrophe is stripped as punctuation.
  substitutions = (
    (r'\d+', ''),           # digits
    (r"^\s+|\s+$", ""),     # whitespace at the start and end of the string
    (r"http\S+", ""),       # hyperlinks
    (r"\'re", " are"),      # contraction: you're -> you are
    (r"\'m", " am"),        # contraction: I'm -> I am
    (r'[^\w\s]', ''),       # punctuation and special characters
  )
  cleaned = text
  for pattern, replacement in substitutions:
    cleaned = re.sub(pattern, replacement, cleaned)

  # remove emoji
  cleaned = emoji_data_python.get_emoji_regex().sub("", cleaned)
  # collapse any run of whitespace into a single space
  words = cleaned.split()
  # correct spelling
  return spell(" ".join(words))

In [5]:
text = "@ I'm making 200 coffee <for> everione at #work & you're only //   making coffee for yourself 😊.?https://edition.cnn.com/"

In [6]:
clean_text(text)

' I am making  coffee for everyone at work  you are only    making coffee for yourself '

### Alternative methods: filtering manually (without re)

In [None]:
text = "@ I'm making 200 coffee <for> everione at #work & you're only // making coffee for yourself 😊.?https://edition.cnn.com/"

In [None]:
# Remove numeric characters without `re`: keep every character for which
# str.isnumeric() is False, then join the survivors back into a string.

text= ''.join(c for c in text if not c.isnumeric())
text  # last expression of the cell, so the notebook displays the result

"@ I'm making  coffee <for> everione at #work & you're only // making coffee for yourself 😊.?https://edition.cnn.com/"

In [None]:
# REMOVE PUNCTUATION & SPECIAL CHARACTERS (without re)
import string # string.punctuation is a str holding the ASCII punctuation characters

# Build a translation table that maps every punctuation character to "nothing"
# and apply it, which deletes all of those characters from text in one call.
text = text.translate(str.maketrans("", "", string.punctuation))

text # display the new string

' Im making  coffee for everione at work  youre only  making coffee for yourself 😊httpseditioncnncom'

# Text normalization 

* **stemming** and **lemmatization** techniques
  * `stemming` is concerned with removing affixes (prefixes and suffixes)
from a word and reducing the word to its stem or root form.
  * `lemmatization` is concerned with bringing words down to their dictionary
form, which is known as the lemma. Lemmatization usually refers to the
morphological analysis of words.
* remove **stop words** from a string.
  * `stop words` are words which occur very often and do not add much
value to the meaning of the text. They take up space in our dataset
or valuable processing time, thus we filter them out of our text.

### NORMALISE A TEXT IN ONE GO

In [39]:
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer

def norm_text(text):
  """Tokenize, lowercase, remove English stop words from, and lemmatize ``text``.

  Args:
    text: the raw string to normalise.

  Returns:
    A list of lowercased, lemmatized tokens with the English stop words removed.
  """
  # Build the stop-word set once up front so it is not rebuilt for every
  # token, and so each membership test is a constant-time set lookup.
  english_stopwords = set(stopwords.words("english"))
  lemma = WordNetLemmatizer()

  # tokenize the text into words and convert upper case into lower case
  tokens = [word.lower() for word in word_tokenize(text)]
  # remove the stopwords
  tokens = [word for word in tokens if word not in english_stopwords]
  # lemmatize the words
  return [lemma.lemmatize(word) for word in tokens]

In [40]:
text = "she reads MANY BOOKS at home and i think she is the quickest reader i know"
norm_text(text)

['read', 'many', 'book', 'home', 'think', 'quickest', 'reader', 'know']

### Stemming

In [7]:
# PORTER STEMMER
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer() #create an instance of the class PorterStemmer
stemmer.stem('active') # call the .stem() method on the object stemmer to stem the word active

'activ'

In [10]:
# Sample paragraph that the stemming and lemmatization examples tokenize.
text = 'What Gabriela was experiencing was a cultural clash in expectations. She was used to a more hierarchical framework where the team leader and manager took control and gave specific instructions on how things were to be done. This more directive management style worked well for her and her team in Brazil but did not transfer well to her new team in Sweden, who were more used to a flatter hierarchy where decision making was more democratic. When Gabriela took the issue to her Swedish manager, rather than stepping in with directions about what to do, her manager took on the role of coach and focused on getting her to come up with her own solutions instead.'



In [13]:
from nltk.tokenize import word_tokenize

# Stem the first ten tokens of the sample text and show each (token, stem) pair.
stemmer = PorterStemmer()
words = word_tokenize(text)

for token in words[:10]:
  stemmed_pair = (token, stemmer.stem(token))
  print(stemmed_pair)

('What', 'what')
('Gabriela', 'gabriela')
('was', 'wa')
('experiencing', 'experienc')
('was', 'wa')
('a', 'a')
('cultural', 'cultur')
('clash', 'clash')
('in', 'in')
('expectations', 'expect')


### Lemmatization

In [17]:
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [18]:
lemma = WordNetLemmatizer() #create an instance of the class WordNetLemmatizer
lemma.lemmatize('runs') # call the .lemmatize () method on the object lemma to lemmatize the word runs

'run'

In [19]:
lemma.lemmatize('better', pos = 'a') # add the argument pos = 'a'(adjective)

'good'

In [27]:
# Lemmatize tokens 30-34 of the sample text and show each (token, lemma) pair.
# No pos argument is passed here, and some forms (e.g. 'were') come back unchanged.
for word in words[30:35]:
  print((word, lemma.lemmatize(word)))

('instructions', 'instruction')
('on', 'on')
('how', 'how')
('things', 'thing')
('were', 'were')


### Stop words

In [28]:
nltk.download('stopwords')
from nltk.corpus import stopwords


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [36]:
text = "she reads many books at home and i think she is the quickest reader i know"
tokens = word_tokenize(text)

# Load the English stop words once. Calling stopwords.words() with no language
# returns the stop words of every language in the corpus, not just English.
english_stopwords = stopwords.words('english')

print("original text : ",tokens, end='\n\n')
print("stop words : ", english_stopwords, end='\n\n') # the list of English stopwords

# Keep only the tokens that are not English stop words; the set makes each
# membership test a constant-time lookup.
stopword_set = set(english_stopwords)
remove_stopwords = [word for word in tokens if word not in stopword_set]
print("result : ", remove_stopwords)

original text :  ['she', 'reads', 'many', 'books', 'at', 'home', 'and', 'i', 'think', 'she', 'is', 'the', 'quickest', 'reader', 'i', 'know']

stop words :  ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'wh

**REMOVE A WORD FROM THE NLTK LIST OF STOP WORDS**

In [38]:
text = "she reads many books at home and i think she is the quickest reader i know"
tokens = word_tokenize(text)

# Start from NLTK's English stop words and take "she" out of the list,
# so that "she" is kept when the tokens are filtered below.
new_stopwords = stopwords.words("english")
new_stopwords.remove("she")

filtered_text = [token for token in tokens if token not in new_stopwords]

print(" ".join(filtered_text))

she reads many books home think she quickest reader know
