### Convert to Lower

In [2]:
import nltk
from nltk.tokenize import word_tokenize

def to_lower(text):
    """
    Converting text to lower case as in, converting "Hello" to  "hello" or "HELLO" to "hello".
    """
    return ' '.join([w.lower() for w in word_tokenize(text)])


text = """Harry Potter is the most miserable, lonely boy you can imagine. He's shunned by his relatives, \
          the Dursley's, that have raised him since he was an infant. He's forced to live in the cupboard under the 
          stairs, forced to wear his cousin Dudley's hand-me-down clothes, and forced to go to his neighbour's house when 
          the rest of the family is doing something fun. Yes, he's just about as miserable as you can get."""

print(to_lower(text))

harry potter is the most miserable , lonely boy you can imagine . he 's shunned by his relatives , the dursley 's , that have raised him since he was an infant . he 's forced to live in the cupboard under the stairs , forced to wear his cousin dudley 's hand-me-down clothes , and forced to go to his neighbour 's house when the rest of the family is doing something fun . yes , he 's just about as miserable as you can get .


### Filter Stop Words

In [7]:
from nltk.corpus import stopwords

text="Today is a great day. It is even better than yesterday. And yesterday was the best day ever!"
stopwords=set(stopwords.words('english'))

from nltk.tokenize import word_tokenize

words=word_tokenize(text)

wordsFiltered=[]

for w in words:
    if w not in stopwords:
        wordsFiltered.append(w)
print(wordsFiltered)


['Today', 'great', 'day', '.', 'It', 'even', 'better', 'yesterday', '.', 'And', 'yesterday', 'best', 'day', 'ever', '!']


### Stop word Removal using List Comprehension

In [3]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

example_sent = "This is a sample sentence, showing off the stop words filtration."

word_tokenize(example_sent)

stopwords = stopwords.words('english')

[x for x in word_tokenize(example_sent) if x not in stopwords]

['This',
 'sample',
 'sentence',
 ',',
 'showing',
 'stop',
 'words',
 'filtration',
 '.']

### Remove Stopward from List

In [4]:
import nltk

vocab = ['Joe', 'waited', 'for', 'the', 'train', 'The', 'train', 'was', 'late', 'Mary', 
         'and', 'Samantha', 'took', 'the', 'bus', 'I', 'looked', 'for', 'Mary', 'and', 
         'Samantha', 'at', 'the', 'bus', 'station', 'Mary', 'and', 'Samantha', 'arrived', 
         'at', 'the', 'bus', 'station', 'early', 'but', 'waited', 'until', 'noon', 'for', 
         'the', 'bus']
         
[v for v in (v.lower() for v in vocab) if v not in nltk.corpus.stopwords.words('english') ]

['joe',
 'waited',
 'train',
 'train',
 'late',
 'mary',
 'samantha',
 'took',
 'bus',
 'looked',
 'mary',
 'samantha',
 'bus',
 'station',
 'mary',
 'samantha',
 'arrived',
 'bus',
 'station',
 'early',
 'waited',
 'noon',
 'bus']

### Stopword_Remove_Pandas DataFrame

In [6]:
import pandas as pd

pos_tweets = [('I love this car', 'positive'),
              ('This view is amazing', 'positive'),
              ('I feel great this morning', 'positive'),
              ('I am so excited about the concert', 'positive'),
              ('He is my best friend', 'positive')]

test = pd.DataFrame(pos_tweets)


test = pd.DataFrame(pos_tweets)
test.columns = ["tweet","class"]

test["tweet"] = test["tweet"].str.lower().str.split()



from nltk.corpus import stopwords
stop = stopwords.words('english')

test['tweet'] = test['tweet'].apply(lambda x: [item for item in x if item not in stop])

test['tweet'] = test['tweet'].apply(lambda y: ' '.join([x for x in y]))

test

Unnamed: 0,tweet,class
0,love car,positive
1,view amazing,positive
2,feel great morning,positive
3,excited concert,positive
4,best friend,positive


### Remove Digits

In [8]:
text = "There was 200 people standing right next to me at 2pm."
output = ''.join(c for c in text if not c.isdigit())

print(output)


There was  people standing right next to me at pm.


### Remove Special Characters

In [9]:
import re

def remove_special_characters(text,remove_digits=False):
    pattern = r'[^a-zA-Z0-9\s]' if not remove_digits else r'[^a-zA-Z\s]'
    text = re.sub(pattern,'',text)
    return text

remove_special_characters("Well this was fun! What do you think? 123#@",True)

'Well this was fun What do you think '

### Remove punctuation

In [10]:
from string import punctuation

def strip_punctuation(s):
    return ''.join(c for c in s if c not in punctuation)

text = "Hello! how are you doing?"

print (strip_punctuation(text))


Hello how are you doing


### Remove Accented Charaters

In [None]:
import unicodedata

def Remove_accented_chars(text):
    text = unicodedata.normalize('NFKD',text).encode('ascii','ignore').decode('utf-8','ignore')
    return text

Remove_accented_chars("Sèv asdaç")

### Remove Tags

In [11]:
import re
text = """<head><body>hello world!</body></head>"""
cleaned_text = re.sub('<[^<]+?>','', text)
print (cleaned_text)

hello world!


### Remove Duplicate Words

In [2]:
import nltk
import numpy as np
corpus = ["John likes to watch movies. Mary likes movies too.",
          "John also likes to watch football games.",]

vocab = []

# Generate  Vocabuary 
for sentence in corpus:
    w = nltk.word_tokenize(sentence)
    vocab.extend(w)

# Remove the Punctuatin
from string import punctuation
vocab = [v for v in vocab if v not in punctuation]

# lowercase conversion
vocab = [v.lower() for v in vocab]

# Remove Duplicates
vocab = list(dict.fromkeys(vocab))

vocab

['john',
 'likes',
 'to',
 'watch',
 'movies',
 'mary',
 'too',
 'also',
 'football',
 'games']

### Correct Repeating Character

In [None]:
import nltk
from nltk.corpus import wordnet
import re

def remove_repeated_characters(token):
    repeat_pattern = re.compile(r'(\w*)(\w)\2(\w*)')
    match_substitution = r'\1\2\3'
    
    def replace(old_word):
        if wordnet.synsets(old_word):
            return old_word
        new_word = repeat_pattern.sub(match_substitution,old_word)
        return replace(new_word) if new_word != old_word else new_word
    
    correct_tokens = [replace(word) for word in token]
    return correct_tokens


sample_sentence = "My Schoool is realllllyyyy amaaaaazinggg"
correct_tokens = remove_repeated_characters(nltk.word_tokenize(sample_sentence))
' '.join(correct_tokens)

### Expanding Contractions

In [None]:
# coding: utf-8
import re
import nltk
from contractions import contractions_dict

def expand_contractions(text, contractions_dict):
    contractions_pattern = re.compile('({})'.format('|'.join(contractions_dict.keys())),flags=re.IGNORECASE | re.DOTALL)
    
def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contractions_dict.get(match) \
        if contractions_dict.get(match) \
            else contractions_dict.get(match.lower())
        expanded_contraction = expanded_contraction
        return expanded_contraction
    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

def main():
    text = """I ain't going there. You'll have to go alone."""
    
    text=expand_contractions(text,contractions_dict)
    sentences = nltk.sent_tokenize(text)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    
    print (tokenized_sentences)
if __name__ == '__main__':
    main()

### Expanding Words Contractions

In [13]:
contractions = { 
"ain't": "am not / are not / is not / has not / have not",
"aren't": "are not / am not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he had / he would",
"he'd've": "he would have",
"he'll": "he shall / he will",
"he'll've": "he shall have / he will have",
"he's": "he has / he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how has / how is / how does",
"I'd": "I had / I would",
"I'd've": "I would have",
"I'll": "I shall / I will",
"I'll've": "I shall have / I will have",
"I'm": "I am",
"I've": "I have",
"isn't": "is not",
"it'd": "it had / it would",
"it'd've": "it would have",
"it'll": "it shall / it will",
"it'll've": "it shall have / it will have",
"it's": "it has / it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she had / she would",
"she'd've": "she would have",
"she'll": "she shall / she will",
"she'll've": "she shall have / she will have",
"she's": "she has / she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as / so is",
"that'd": "that would / that had",
"that'd've": "that would have",
"that's": "that has / that is",
"there'd": "there had / there would",
"there'd've": "there would have",
"there's": "there has / there is",
"they'd": "they had / they would",
"they'd've": "they would have",
"they'll": "they shall / they will",
"they'll've": "they shall have / they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we had / we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what shall / what will",
"what'll've": "what shall have / what will have",
"what're": "what are",
"what's": "what has / what is",
"what've": "what have",
"when's": "when has / when is",
"when've": "when have",
"where'd": "where did",
"where's": "where has / where is",
"where've": "where have",
"who'll": "who shall / who will",
"who'll've": "who shall have / who will have",
"who's": "who has / who is",
"who've": "who have",
"why's": "why has / why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you had / you would",
"you'd've": "you would have",
"you'll": "you shall / you will",
"you'll've": "you shall have / you will have",
"you're": "you are",
"you've": "you have",
"It's": "It is",
"It'll" : "It will"
}


# Contractions conversions
def expand_text(text):
    words = text.split()
    
    new_text = []
    
    for word in words:
        if word in contractions:
            new_text.append(contractions[word])
        else:
            new_text.append(word)
    
    return ' '.join(new_text)

print(expand_text("I'd like to know how I'd done that!"))

print(expand_text("We're going to the zoo and I don't think I'll be home for dinner."))

print(expand_text("Theyre going to the zoo and she'll be home for dinner."))


words =  ["I won't let you get away with that",
    "I'm a bad person",
    "It's his cat anyway",
    "It's not what you think",
    "It's a man's world",
    "Catherine's been thinking about it",
    "It'll be done",
    "Who'd've thought!",
    "She said she'd go.",
    "She said she'd gone.",
    "Y'all'd've a great time",
    " My name is Jack.",
    "'Tis questionable whether Ma'am should be going.",
    "As history tells, 'twas the night before Christmas.",
    "Martha, Peter and Christine've been indulging in a menage-à-trois.",] 


print("\n")

newwords = []
for word in words:
    newwords.append(expand_text(word))

newwords

I had / I would like to know how I had / I would done that!
We're going to the zoo and I do not think I shall / I will be home for dinner.
Theyre going to the zoo and she shall / she will be home for dinner.




['I will not let you get away with that',
 'I am a bad person',
 'It is his cat anyway',
 'It is not what you think',
 "It is a man's world",
 "Catherine's been thinking about it",
 'It will be done',
 "Who'd've thought!",
 'She said she had / she would go.',
 'She said she had / she would gone.',
 "Y'all'd've a great time",
 'My name is Jack.',
 "'Tis questionable whether Ma'am should be going.",
 "As history tells, 'twas the night before Christmas.",
 "Martha, Peter and Christine've been indulging in a menage-à-trois."]

### Normalize Docs

In [3]:

doc1 = "Sugar is bad to consume. My sister likes to have sugar, but not my father."
doc2 = "My father spends a lot of time driving my sister around to dance practice."
doc3 = "Doctors suggest that driving may cause increased stress and blood pressure."
doc4 = "Sometimes I feel pressure to perform well at school, but my father never seems to drive my sister to do better."
doc5 = "Health experts say that Sugar is not good for your lifestyle."

# compile documents
doc_complete = [doc1, doc2, doc3, doc4, doc5]


# Cleaning anf Preprocessing

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from string import punctuation
from nltk.tokenize import word_tokenize


#Stop words
stop = set(stopwords.words('english'))
exclude = set(punctuation)
lemma = WordNetLemmatizer()

def Clean(doc):
    
    tokens = word_tokenize(doc.lower())
    
    stop_free = [v for v in tokens if v not in stop]
    
    punc_free = [v for v in stop_free if v not in exclude]
    
    normalized= [WordNetLemmatizer().lemmatize(v) for v in punc_free]
    return ' '.join(v for v in normalized)
    
doc_clean = [Clean(doc).split() for doc in doc_complete]
    
doc_clean

[['sugar', 'bad', 'consume', 'sister', 'like', 'sugar', 'father'],
 ['father',
  'spends',
  'lot',
  'time',
  'driving',
  'sister',
  'around',
  'dance',
  'practice'],
 ['doctor',
  'suggest',
  'driving',
  'may',
  'cause',
  'increased',
  'stress',
  'blood',
  'pressure'],
 ['sometimes',
  'feel',
  'pressure',
  'perform',
  'well',
  'school',
  'father',
  'never',
  'seems',
  'drive',
  'sister',
  'better'],
 ['health', 'expert', 'say', 'sugar', 'good', 'lifestyle']]

In [3]:
# Text Normalization : Replace the selected words with new words
sentence = "I Visited US from UK on 22-10-18"

sentence.replace("US","United States").replace('UK',"United Kingdonm").replace("-18","-2018")

'I Visited United States from United Kingdonm on 22-10-2018'

### Spell Check

In [6]:
from nltk import word_tokenize

from autocorrect import spell

text = "This is a wrld of hope"
spells = [spell(w) for w in (word_tokenize(text))]
print (spells)

autocorrect.spell is deprecated,             use autocorrect.Speller instead
autocorrect.spell is deprecated,             use autocorrect.Speller instead
autocorrect.spell is deprecated,             use autocorrect.Speller instead
autocorrect.spell is deprecated,             use autocorrect.Speller instead
autocorrect.spell is deprecated,             use autocorrect.Speller instead
autocorrect.spell is deprecated,             use autocorrect.Speller instead
['This', 'is', 'a', 'world', 'of', 'hope']


### Search for Keyword

In [2]:
# Search for the keyword using NLTK
import nltk
text = "This is a great day,It is even better than yesterday.And yesterday was the best day ever."

text = nltk.Text(nltk.word_tokenize(text))
match = text.concordance('is')

Displaying 2 of 2 matches:
This is a great day , It is even better than 
This is a great day , It is even better than yesterday.And yester


### RegEx

In [None]:
import nltk
import os
import re
import math
import operator
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize,word_tokenize
nltk.download('averaged_perceptron_tagger')
Stopwords = set(stopwords.words('english'))
wordlemmatizer = WordNetLemmatizer()

file = 'C:\\MyWork\\MyLearning\\ML\\Files\\DataSet\\SampleText.txt'
file = open(file , 'r')
text = file.read()
tokenized_sentence = sent_tokenize(text)
text = re.sub(r'[^a-zA-Z0-9\s]','',text)
text = re.sub(r'\d+', '', text)

tokenized_words_with_stopwords = word_tokenize(text)

tokenized_words = [word for word in tokenized_words_with_stopwords if word not in Stopwords]

tokenized_words = [word for word in tokenized_words if len(word) > 1]

tokenized_words = [word.lower() for word in tokenized_words]

tokenized_words = [wordlemmatizer.lemmatize(x) for x in tokenized_words]

### Object Standardization

In [4]:
# Object Standardization
# Text data often contains words or phrases which are not present in any standard lexical dictionaries. 
# These pieces are not recognized by search engines and models.

lookup_dict = {'rt':'Retweet', 'dm':'direct message', 'awsm' : 'awesome', 'luv' :'love'}

def _lookup_words(input_text):
    words = input_text.split()
    new_words = []
    for word in words:
        if word.lower() in lookup_dict:
             word = lookup_dict[word.lower()]
        new_words.append(word) 
        new_text = " ".join(new_words) 
    return new_text
    
print(_lookup_words("RT this is a retweeted tweet by Shivam Bansal"))


Retweet this is a retweeted tweet by Shivam Bansal
