# Convert column text to lower case

In [None]:
# Lower-case every value of the column through the vectorised .str accessor.
# NOTE(review): `df` is not created anywhere in this notebook — it is assumed
# to be an existing DataFrame with a text column named 'column_name'.
df['column_name'] = df['column_name'].str.lower()

# Re : Regular Expression

In [1]:
import re

## Remove Html Tags 

In [2]:
def remove_html_tags(text):
    """Return ``text`` with every ``<...>`` tag removed.

    A tag is anything from ``<`` up to the nearest following ``>``. Tags
    whose attributes wrap onto several lines are removed as well.

    Parameters
    ----------
    text : str
        Markup to clean.

    Returns
    -------
    str
        ``text`` without the tags; the characters between tags are kept
        unchanged (including their whitespace).
    """
    # re.DOTALL lets '.' match newlines, so a tag split across lines
    # (e.g. "<a\n href='...'>") is matched too instead of being left behind.
    pattern = re.compile('<.*?>', re.DOTALL)
    return pattern.sub('', text)

In [3]:
# Example: strip the tags from a small HTML snippet; the text between the
# tags (including its leading spaces) is what remains.
text = "<html><body><p> Movie 1</p><p> Actor - Aamir Khan</p><p> Click here to <a href='http://google.com'>download</a></p></body></html>"
remove_html_tags(text)

' Movie 1 Actor - Aamir Khan Click here to download'

In [None]:
# Example: clean a whole DataFrame column, one value at a time.
# NOTE(review): overwrites the column in place; `df` is assumed to exist.
df['column_name'] = df['column_name'].apply(remove_html_tags)

## Remove URL

In [4]:
def remove_url(text):
    """Delete web addresses from ``text``.

    Every run of non-whitespace characters that starts with ``http://``,
    ``https://`` or ``www.`` is replaced by the empty string. Whitespace
    around the address is left as it was.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)

In [5]:
# Example: the URL is removed, the space that preceded it is kept.
text1 = 'Check out my notebook https://www.kaggle.com/campusx/notebook8223fc1abb'
remove_url(text1)

'Check out my notebook '

## Remove Punctuations

#### method 1

In [6]:
import string
# Characters to strip: the ASCII punctuation set from the standard library.
# Both remove_punc and remove_punc1 below read this module-level name.
exclude = string.punctuation

In [7]:
def remove_punc(text):
    """Return ``text`` with every character listed in ``exclude`` removed.

    Walks the module-level ``exclude`` collection and deletes each entry
    from the string in turn (one ``str.replace`` pass per entry).
    """
    cleaned = text
    for symbol in exclude:
        cleaned = cleaned.replace(symbol, '')
    return cleaned
        

In [8]:
# Example input for both punctuation-removal methods.
# Note: this rebinds `text`, which earlier held the HTML sample.
text = 'string. With. Punctuation?'
remove_punc(text)

'string With Punctuation'

#### method 2

This method is about 5 times faster than the previous one.

In [9]:
def remove_punc1(text):
    """Return ``text`` with every character in ``exclude`` removed.

    Builds a translation table that maps each character of the
    module-level ``exclude`` string to "delete" and applies it with a
    single ``str.translate`` call.
    """
    deletion_table = str.maketrans('', '', exclude)
    return text.translate(deletion_table)

In [10]:
# Same input as method 1; the result should be identical.
remove_punc1(text)

'string With Punctuation'

## Chat Conversion

github repository: 'https://github.com/rishabhverma17/sms_slang_translator/blob/master/slang.txt'

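Store it in the form of a dictionary in a variable named `chat_words` (the keys must be the upper-case abbreviations, since the lookup below upper-cases each word).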

In [None]:
def chat_conversion(text):
    """Expand chat abbreviations found in ``text``.

    The text is split on whitespace. Each token is upper-cased and looked
    up in the module-level ``chat_words`` mapping; when present, the token
    is replaced by the mapped value, otherwise it is kept unchanged. The
    tokens are then re-joined with single spaces.

    NOTE(review): ``chat_words`` is not defined in this notebook — it has
    to be created (with upper-case keys) before this function is called.
    """
    expanded = (chat_words.get(token.upper(), token) for token in text.split())
    return " ".join(expanded)

## Spelling Checker

In [None]:
from textblob import TextBlob

In [None]:
# Deliberately misspelled sentence used to demonstrate spelling correction.
incorrect_text = 'ceertain conditionas duriing seveal ggenerations aree moodified in the saame maner.'

# Wrap the raw string in a TextBlob so its correction API can be used.
textBlb = TextBlob(incorrect_text)

# .correct() yields a corrected blob; .string gives it back as plain text.
textBlb.correct().string

## Removing Stop Words

Usually used in sentiment analysis and POS, where we don't want words like (a, an, the, is, am, are, ...).

In [None]:
from nltk.corpus import stopwords
# Display NLTK's English stop-word list.
# NOTE(review): presumably needs the 'stopwords' corpus to be installed
# first (e.g. nltk.download('stopwords')) — confirm for your environment.
stopwords.words('english')

In [None]:
def remove_stopwords(text, stop_words=None):
    """Return ``text`` with stop words dropped.

    Parameters
    ----------
    text : str
        Input text; it is split on whitespace.
    stop_words : iterable of str, optional
        Words to drop. When omitted, NLTK's English list
        (``stopwords.words('english')``) is used. Pass a pre-built
        collection when calling this for many rows (for example through
        ``Series.apply``) so the list is not reloaded on every call.

    Returns
    -------
    str
        The remaining words joined by single spaces. Dropped words leave
        no empty placeholder behind, so the result never contains doubled
        spaces.

    Notes
    -----
    Matching is exact and case-sensitive: a word is dropped only when it
    is equal to an entry of ``stop_words``.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Build the lookup once per call; a set makes each membership test O(1)
    # instead of reloading and scanning the whole list for every word.
    blocked = frozenset(stop_words)
    return " ".join(word for word in text.split() if word not in blocked)

In [None]:
# Apply the stop-word filter to every value of the column; the result is
# only displayed here, it is not assigned back to `df`.
df['column_name'].apply(remove_stopwords)

## Remove Emoji

#### method1

In [12]:
import re

# Character class covering the emoji-bearing Unicode blocks. The ranges are
# deliberately narrow: one wide range from U+24C2 to U+1F251 would also match
# CJK ideographs, Hangul, kana and most other scripts of the Basic
# Multilingual Plane, deleting ordinary text along with the emoji.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\U0001F000-\U0001F251"  # game tiles/cards, enclosed alphanumerics & ideographs
    "\u2600-\u26FF"          # miscellaneous symbols
    "\u2702-\u27B0"          # dingbats
    "\u2B05-\u2B07\u2B1B\u2B1C\u2B50\u2B55"  # arrows, squares, star, circle
    "\u24C2\u3030\u303D\u3297\u3299"         # isolated emoji code points
    "\uFE0F"                 # variation selector-16 (emoji presentation)
    "]+"
)


def remove_emoji(text):
    """Return ``text`` with emoji characters removed.

    Parameters
    ----------
    text : str
        Text that may contain emoji.

    Returns
    -------
    str
        ``text`` with every run of characters matched by the emoji
        character class deleted. Non-emoji text, including CJK and other
        non-Latin scripts, is left untouched.
    """
    return _EMOJI_PATTERN.sub('', text)

In [13]:
# Example: the two trailing emoji are removed, the space before them stays.
remove_emoji("Loved the movie. It was 😘😘")

'Loved the movie. It was '

#### method2

In [None]:
import emoji
# Alternative to deleting emoji: demojize() replaces each emoji with a
# textual name, so the information it carried is kept in the text.
print(emoji.demojize('Python is 🔥'))

## Tokenization

#### 1. Using the split function

In [15]:
# Word tokenization: str.split() without arguments splits on whitespace.
sent1 = 'I am going to delhi'
sent1.split()

['I', 'am', 'going', 'to', 'delhi']

In [16]:
# Sentence tokenization: split on the literal '.' character. The pieces keep
# the space that followed each full stop.
sent2 = 'I am going to delhi. I will stay there for 3 days. Let\'s hope the trip to be great'
sent2.split('.')

['I am going to delhi',
 ' I will stay there for 3 days',
 " Let's hope the trip to be great"]

In [17]:
# Problem with split(): punctuation stays glued to the neighbouring word,
# so the last token comes out as 'delhi!' rather than 'delhi'.
sent3 = 'I am going to delhi!'
sent3.split()

['I', 'am', 'going', 'to', 'delhi!']

comment: here it is not able to split ! from delhi

In [18]:
# Problem with split('.'): only one delimiter can be given, so a sentence
# ending in '?' is not separated from the one after it.
sent4 = 'Where do think I should go? I have 3 day holiday'
sent4.split('.')

['Where do think I should go? I have 3 day holiday']

comment: it is only able to split on basis of one punctuation!

#### 2. Regular Expression

In [19]:
import re
sent3 = 'I am going to delhi!'
# Raw string literal: "\w" inside a normal string is an invalid escape
# sequence and makes Python emit a warning when the cell is compiled.
# The pattern keeps runs of word characters and apostrophes, so trailing
# punctuation such as '!' is dropped from the tokens.
tokens = re.findall(r"[\w']+", sent3)
tokens

  tokens = re.findall("[\w']+", sent3)


['I', 'am', 'going', 'to', 'delhi']

In [20]:

# Three-line sample; the first two lines end with a space before the line
# break, which is written out explicitly here.
text = (
    "Lorem Ipsum is simply dummy text of the printing and typesetting industry? \n"
    "Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, \n"
    "when an unknown printer took a galley of type and scrambled it to make a type specimen book."
)
# Split wherever '.', '!' or '?' is immediately followed by a space.
sentences = re.split('[.!?] ', text)
sentences

['Lorem Ipsum is simply dummy text of the printing and typesetting industry',
 "\nLorem Ipsum has been the industry's standard dummy text ever since the 1500s, \nwhen an unknown printer took a galley of type and scrambled it to make a type specimen book."]

#### 3. NLTK

In [None]:
from nltk.tokenize import word_tokenize,sent_tokenize

In [None]:
# Word tokenization with NLTK. Note: this rebinds `sent1` from the
# split() example above.
sent1 = 'I am going to visit delhi!'
word_tokenize(sent1)

comment: the problem here is that it is not able to separate symbols like $, km, etc.

#### 4. Spacy

In [None]:
import spacy
# Load spaCy's small English pipeline.
# NOTE(review): presumably the 'en_core_web_sm' model has to be installed
# separately before this call succeeds — confirm for your environment.
nlp = spacy.load('en_core_web_sm')

In [None]:
# spaCy works on a document object, so the sentence is first passed through
# the pipeline. `sent1` (from the NLTK cell above) is used because no
# variable named `sentence` exists yet at this point of the notebook.
doc1 = nlp(sent1)

comment: It is the best method among all of these!

## Stemming

It refers to reducing the different forms of a word (noun, pronoun, verb and other forms) to one common root form, so that the text is easier for a machine to process.

In [None]:
from nltk.stem.porter import PorterStemmer

In [None]:
# One shared stemmer instance, reused by every call below.
ps = PorterStemmer()
def stem_words(text):
    """Stem each whitespace-separated word of ``text`` with ``ps``.

    The stems are returned as one string, joined by single spaces.
    """
    return " ".join(map(ps.stem, text.split()))

In [None]:
# Four inflected forms of the same verb, to show what stemming does to them.
sample = "walk walks walking walked"
stem_words(sample)

## Lemmatization

Similar to stemming, but it finds an actual English word (the lemma) for each word rather than conveniently producing some other transformation that may not be a real word.

It is slower than stemming and should only be used if the output is to be shown to the user.

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

sentence = "He was running and eating at same time. He has bad habit of swimming after playing long hours in the Sun."
punctuations="?:!.,;"
# Filter into a new list. Calling .remove() on a list while looping over it
# shifts the remaining items, so the token right after a removed one is
# never examined and consecutive punctuation marks would slip through.
sentence_words = [
    word for word in nltk.word_tokenize(sentence) if word not in punctuations
]

# Two left-aligned columns, 20 characters wide each.
print("{0:20}{1:20}".format("Word","Lemma"))
for word in sentence_words:
    # pos='v' asks the lemmatizer to treat every token as a verb.
    print("{0:20}{1:20}".format(word, wordnet_lemmatizer.lemmatize(word, pos='v')))