<a href="https://colab.research.google.com/github/Sandeep-ML-DL-NLP/NLP-Projects/blob/main/2_NLP_Basics_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# building vectors and matrices based on text data.
from sklearn.feature_extraction.text import CountVectorizer
X = ("Computers can analyze text",
 "They do it using vectors and matrices",
 "Computers can process massive amounts of text data")
vectorizer = CountVectorizer(stop_words='english')
X_vec = vectorizer.fit_transform(X)
print(vectorizer.vocabulary_) 
print(X_vec.todense())

{'computers': 2, 'analyze': 1, 'text': 7, 'using': 8, 'vectors': 9, 'matrices': 5, 'process': 6, 'massive': 4, 'amounts': 0, 'data': 3}
[[0 1 1 0 0 0 0 1 0 0]
 [0 0 0 0 0 1 0 0 1 1]
 [1 0 1 1 1 0 1 1 0 0]]


In [14]:
# Bag of words 
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import numpy as np 
import pandas as pd 
import re 
from nltk.tokenize import TreebankWordDetokenizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [15]:
sentences = ["We are reading about Natural Language Processing Here",
"Natural Language Processing making computers comprehend language \
data",
"The field of Natural Language Processing is evolving everyday"]
corpus = pd.Series(sentences)
corpus

0    We are reading about Natural Language Processi...
1    Natural Language Processing making computers c...
2    The field of Natural Language Processing is ev...
dtype: object

In [41]:
def text_clean(corpus, keep_list):
    '''
    Purpose : Function to keep only alphabets, digits and certain words (punctuations, qmarks, tabs etc. removed)
    
    Input : Takes a text corpus, 'corpus' to be cleaned along with a list of words, 'keep_list', which have to be retained
            even after the cleaning process
    
    Output : Returns the cleaned text corpus
    
    '''
    cleaned_corpus = pd.Series()
    for row in corpus:
        qs = []
        for word in row.split():
            if word not in keep_list:
                p1 = re.sub(pattern='[^a-zA-Z0-9]',repl=' ',string=word)
                p1 = p1.lower()
                qs.append(p1)
            else : qs.append(word)
        cleaned_corpus = cleaned_corpus.append(pd.Series(' '.join(qs)))
    return cleaned_corpus

In [58]:
def stopwords_removal(corpus):
    wh_words = ['who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom']
    stop = stopwords

    corpus = [[x for x in x.split() if x not in stop] for x in corpus]
    return corpus

In [50]:
def lemmatize(corpus):
    lem = WordNetLemmatizer()
    corpus = [[lem.lemmatize(x, pos = 'v') for x in x] for x in corpus]
    return corpus

In [51]:
def stem(corpus, stem_type = None):
    if stem_type == 'snowball':
        stemmer = SnowballStemmer(language = 'english')
        corpus = [[stemmer.stem(x) for x in x] for x in corpus]
    else :
        stemmer = PorterStemmer()
        corpus = [[stemmer.stem(x) for x in x] for x in corpus]
    return corpus

In [52]:
def preprocess(corpus, keep_list, cleaning = True, stemming = False, stem_type = None, lemmatization = False, remove_stopwords = True):
    '''
    Purpose : Function to perform all pre-processing tasks (cleaning, stemming, lemmatization, stopwords removal etc.)
    
    Input : 
    'corpus' - Text corpus on which pre-processing tasks will be performed
    'keep_list' - List of words to be retained during cleaning process
    'cleaning', 'stemming', 'lemmatization', 'remove_stopwords' - Boolean variables indicating whether a particular task should 
                                                                  be performed or not
    'stem_type' - Choose between Porter stemmer or Snowball(Porter2) stemmer. Default is "None", which corresponds to Porter
                  Stemmer. 'snowball' corresponds to Snowball Stemmer
    
    Note : Either stemming or lemmatization should be used. There's no benefit of using both of them together
    
    Output : Returns the processed text corpus
    
    '''
    
    if cleaning == True:
        corpus = text_clean(corpus, keep_list)
    
    if remove_stopwords == True:
        corpus = stopwords_removal(corpus)
    else :
        corpus = [[x for x in x.split()] for x in corpus]
    
    if lemmatization == True:
        corpus = lemmatize(corpus)
        
        
    if stemming == True:
        corpus = stem(corpus, stem_type)
    
    corpus = [' '.join(x) for x in corpus]        

    return corpus
common_dot_words = ['U.S.A', 'Mr.', 'Mrs.', 'D.C.']

In [59]:
preprocessed_corpus = preprocess(corpus, \
 keep_list = common_dot_words, stemming = False, \
 stem_type = None, lemmatization = True, \
 remove_stopwords = True)
preprocessed_corpus


  cleaned_corpus = pd.Series()
  cleaned_corpus = cleaned_corpus.append(pd.Series(' '.join(qs)))
  cleaned_corpus = cleaned_corpus.append(pd.Series(' '.join(qs)))
  cleaned_corpus = cleaned_corpus.append(pd.Series(' '.join(qs)))


['read natural language process',
 'natural language process make computers comprehend language data',
 'field natural language process evolve everyday']

In [60]:
#Build your vocabulary
set_of_words = set()
for sentences in preprocessed_corpus:
  for word in sentences.split():
    set_of_words.add(word)
vocab = list(set_of_words)
vocab

['make',
 'natural',
 'field',
 'evolve',
 'data',
 'process',
 'everyday',
 'language',
 'read',
 'comprehend',
 'computers']

In [61]:
position = {}
for i,token in enumerate(vocab):
  position[token] = i

position

{'make': 0,
 'natural': 1,
 'field': 2,
 'evolve': 3,
 'data': 4,
 'process': 5,
 'everyday': 6,
 'language': 7,
 'read': 8,
 'comprehend': 9,
 'computers': 10}

In [66]:
bow_matrix = np.zeros((len(preprocessed_corpus),len(vocab)))
bow_matrix.shape

(3, 11)

In [68]:
for i,preprocessed_sentence in enumerate(preprocessed_corpus):
  for token in preprocessed_sentence.split():
     bow_matrix[i][position[token]] = bow_matrix[i][position[token]]+1
bow_matrix 

array([[0., 1., 0., 0., 0., 1., 0., 1., 1., 0., 0.],
       [1., 1., 0., 0., 1., 1., 0., 2., 0., 1., 1.],
       [0., 1., 1., 1., 0., 1., 1., 1., 0., 0., 0.]])

In [70]:
# Hey! Do I need to code all this up? Doesn't any Python library provide all this as an inbuilt functionality ?
bow_matrix = vectorizer.fit_transform(preprocessed_corpus)

In [72]:
bow_matrix.toarray()

array([[0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1],
       [1, 1, 1, 0, 0, 0, 2, 1, 1, 1, 0],
       [0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0]])

In [79]:
# CountVectorizer with ngrams
vectorizer_ngram_range = CountVectorizer(analyzer='word',
ngram_range=(1,3))
bow_matrix_ngram = \
vectorizer_ngram_range.fit_transform(preprocessed_corpus)
print(vectorizer_ngram_range.vocabulary_)
print(bow_matrix_ngram.toarray())

{'read': 29, 'natural': 21, 'language': 13, 'process': 24, 'read natural': 30, 'natural language': 22, 'language process': 15, 'read natural language': 31, 'natural language process': 23, 'make': 18, 'computers': 3, 'comprehend': 0, 'data': 6, 'process make': 27, 'make computers': 19, 'computers comprehend': 4, 'comprehend language': 1, 'language data': 14, 'language process make': 17, 'process make computers': 28, 'make computers comprehend': 20, 'computers comprehend language': 5, 'comprehend language data': 2, 'field': 10, 'evolve': 8, 'everyday': 7, 'field natural': 11, 'process evolve': 25, 'evolve everyday': 9, 'field natural language': 12, 'language process evolve': 16, 'process evolve everyday': 26}
[[0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1]
 [1 1 1 1 1 1 1 0 0 0 0 0 0 2 1 1 0 1 1 1 1 1 1 1 1 0 0 1 1 0 0 0]
 [0 0 0 0 0 0 0 1 1 1 1 1 1 1 0 1 1 0 0 0 0 1 1 1 1 1 1 0 0 0 0 0]]


In [83]:
# TF-IDF 
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(norm="l1")
tf_matrix = tf.fit_transform(preprocessed_corpus)
tf_matrix.toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.21307663, 0.        , 0.21307663, 0.21307663,
        0.3607701 ],
       [0.1571718 , 0.1571718 , 0.1571718 , 0.        , 0.        ,
        0.        , 0.1856564 , 0.1571718 , 0.0928282 , 0.0928282 ,
        0.        ],
       [0.        , 0.        , 0.        , 0.2095624 , 0.2095624 ,
        0.2095624 , 0.12377093, 0.        , 0.12377093, 0.12377093,
        0.        ]])

In [81]:
tf_matrix.shape

(3, 11)

In [120]:
# One-hot vectorization
from nltk.corpus import stopwords
stem = PorterStemmer()
stem2 = SnowballStemmer(language='english')
sentence = ["We are reading about Natural Language Processing \
Here"]
corpus = pd.Series(sentence)
for rows in corpus:
  tokens = [token.lower() for token in rows.split() if token.lower() not in stopwords.words('english')]
  preprocess_word = [stem.stem(token) for token in tokens]
  preprocessed_word = [stem2.stem(token) for token in tokens]
  print(preprocess_word)
  print(preprocessed_word)

['read', 'natur', 'languag', 'process']
['read', 'natur', 'languag', 'process']


In [121]:
position2 = {}
for i,token in enumerate(preprocess_word):
  position2[token] = i
position2

{'read': 0, 'natur': 1, 'languag': 2, 'process': 3}

In [122]:
one_hot_matrix = np.zeros((4,4))
for i,token in enumerate(preprocess_word):
  one_hot_matrix[i][position2[token]] = 1
one_hot_matrix

array([[1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.]])