<a href="https://colab.research.google.com/github/ShareAndShine/Lemalabs/blob/main/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [51]:
# Raw sample sentences, deliberately littered with punctuation noise.
s1, s2, s3 = (
    'My name is ------------------ Pawan & *#Raj',
    'Today is very ****** good day',
    'How can I _ help you ?',
)

In [52]:
# Step 1 - normalise case
# Lower-casing comes first in this pipeline; tokenizing and cleaning follow.
s1, s2, s3 = (sentence.lower() for sentence in (s1, s2, s3))

In [53]:
# Step 2 - Tokenize: split each sentence into individual word tokens.

import nltk
# Fetch NLTK's 'punkt' package into the local nltk_data directory.
# NOTE(review): the tokenizer built in the next cell is a RegexpTokenizer, which
# presumably does not need this package - confirm before removing the download.
nltk.download('punkt')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [54]:
from nltk.tokenize import RegexpTokenizer
# Raw string: '\w' in a normal string literal is an invalid escape sequence
# (a warning today, an error in a future Python). r'\w+' hands the pattern to
# the regex engine verbatim.
tokenizer = RegexpTokenizer(r'\w+')

def tok(x): # helper that splits a sentence into a list of word tokens
  """Split a sentence into word tokens.

  Each token is a maximal run of regex "word" characters (letters, digits and
  underscore); everything else, such as spaces and punctuation, acts as a
  separator and is dropped.

  Args:
    x: The sentence to tokenize, as a string.

  Returns:
    The list of tokens in their original order.
  """
  return tokenizer.tokenize(x)

In [55]:
# Show the tokens produced for each of the three lower-cased sentences.
for sentence in (s1, s2, s3):
  print(tok(sentence))


['my', 'name', 'is', 'pawan', 'raj']
['today', 'is', 'very', 'good', 'day']
['how', 'can', 'i', '_', 'help', 'you']


In [56]:
# Replace each sentence string with its list of word tokens.
s1, s2, s3 = (tok(sentence) for sentence in (s1, s2, s3))
s1

['my', 'name', 'is', 'pawan', 'raj']

In [57]:
# Step 3 - Cleaning

# Underscores survive tokenization but carry no meaning, so blank them out.
# A token that was only an underscore is left behind as an empty string.

import re # standard-library regular expressions

pattern = "_"

s3 = list(map(lambda token: re.sub(pattern, '', token), s3))
s3

['how', 'can', 'i', '', 'help', 'you']

**Sample**

In [58]:
# Lower-case and tokenize the sample sentence in one go.
ex = tok('I am _ Rajesh 15'.lower())

# Strip special characters first, then digits. Tokens made up only of such
# characters remain in the list as empty strings.
for pattern in ("_", "[0-9]"):
  ex = [re.sub(pattern, '', token) for token in ex]
ex


['i', 'am', '', 'rajesh', '']

In [59]:
from nltk.corpus import stopwords
# Fetch NLTK's 'stopwords' package into the local nltk_data directory.
nltk.download('stopwords')



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [60]:
# English stop words from the NLTK stopwords corpus; used below to filter tokens.
sw = stopwords.words('english')
sw # display the complete stop-word list

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [61]:
# Drop every token that appears in the stop-word list, sentence by sentence.
s1, s2, s3 = (
    [word for word in tokens if word not in sw]
    for tokens in (s1, s2, s3)
)
s3

['', 'help']

# LET'S DO ALL THE ABOVE STEPS IN ONE GO WITH A SKLEARN ALGORITHM

In [62]:


# All sample sentences, collected in one list for the vectorizer.
x = [
    'Send us your password',
    'Send us your review',
    'Review your password',
    'Review us',
    'Send your password',
    'Send us your account',
]

# Keep the individual sentences reachable under their own names as well.
s1, s2, s3, s4, s5, s6 = x
x

['Send us your password',
 'Send us your review',
 'Review your password',
 'Review us',
 'Send your password',
 'Send us your account']

In [63]:
def process_text(text): # helper function to remove special chars and numbers
  """Strip underscores and digits from every string in a collection.

  Args:
    text: An iterable of strings (for example a list of sentences or tokens).

  Returns:
    A new list with one cleaned string per input element, in the same order.
    Elements made up only of removed characters come back as empty strings.
  """
  special_chars = "_"
  digits = "[0-9]"

  # Remove special characters
  text = [re.sub(special_chars, '', i) for i in text]

  # Remove numbers. The previous version reused the underscore pattern here,
  # so digits were never actually stripped.
  text = [re.sub(digits, '', i) for i in text]
  return text
  

In [64]:
# Clean every sentence with process_text; the cleaned list replaces the original x.
x = process_text(x)
x

['Send us your password',
 'Send us your review',
 'Review your password',
 'Review us',
 'Send your password',
 'Send us your account']

In [65]:
from sklearn.feature_extraction.text import CountVectorizer

# Plug the custom tokenizer and the NLTK stop-word list into the vectorizer
# and count single words only (unigrams).
Vectorizer = CountVectorizer(
    tokenizer=tok,
    stop_words=sw,
    ngram_range=(1, 1),
)

# Learn the vocabulary from the sentences and build the bag-of-words matrix:
# one row per sentence, one column per vocabulary word.
Vector_x = Vectorizer.fit_transform(x)
print(Vector_x.toarray())

[[0 1 0 1 1]
 [0 0 1 1 1]
 [0 1 1 0 0]
 [0 0 1 0 1]
 [0 1 0 1 0]
 [1 0 0 1 1]]


In [66]:
# Vocabulary words the bag-of-words matrix was built on, in column order.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() replaces it. It returns a NumPy array, so convert
# to a plain list for display.
Vectorizer.get_feature_names_out().tolist()

['account', 'password', 'review', 'send', 'us']

In [69]:
from sklearn.feature_extraction.text import TfidfTransformer
Tfid = TfidfTransformer()

# Re-weight the raw counts with TF-IDF. A weight closer to 1 marks a word that is
# distinctive for that sentence; words shared by many sentences are down-weighted
# (in the output, 'account' occurs in a single sentence and gets the largest weight).
# NOTE: Vector_x is overwritten, so re-running only this cell would transform the
# already-weighted matrix a second time.
Vector_x = Tfid.fit_transform(Vector_x)
print(Vector_x.toarray())

[[0.         0.63646032 0.         0.54539814 0.54539814]
 [0.         0.         0.63646032 0.54539814 0.54539814]
 [0.         0.70710678 0.70710678 0.         0.        ]
 [0.         0.         0.7593387  0.         0.65069558]
 [0.         0.7593387  0.         0.65069558 0.        ]
 [0.76608386 0.         0.         0.45448626 0.45448626]]
