<a href="https://colab.research.google.com/github/RaviGprec/NLP/blob/master/N_Grams_Intro.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

 **N-Grams**

*   An n-gram is a contiguous sequence of n items from a given sample of text or speech. The items can be phonemes, syllables, letters, words or base pairs according to the application. The n-grams typically are collected from a text or speech corpus.
*   An n-gram of size 1 is referred to as a "unigram"; size 2 is a "bigram" (or, less commonly, a "digram"); size 3 is a "trigram". English cardinal numbers are sometimes used, e.g., "four-gram", "five-gram", and so on.
*   An n-gram model is a type of probabilistic **language model** **[LM]** for predicting the next item in a sequence.


In [0]:
# Example sentence used throughout the notebook; adjacent string literals
# inside the parentheses are concatenated into one string.
s = (
    "Natural-language processing (NLP) is an area of computer science "
    "and artificial intelligence concerned with the interactions "
    "between computers and human (natural) languages."
)

In [7]:
# Show the assembled sentence.
s

'Natural-language processing (NLP) is an area of computer science and artificial intelligence concerned with the interactions between computers and human (natural) languages.'

In [8]:
import re

# Swap every character that is not an ASCII letter, a digit or whitespace
# for a single space, then show the cleaned sentence.
s = re.sub(
    r'[^a-zA-Z0-9\s]',
    ' ',
    s,
)
s

'Natural language processing  NLP  is an area of computer science and artificial intelligence concerned with the interactions between computers and human  natural  languages '

In [9]:
# Split on single spaces; runs of spaces yield empty strings, which the
# truthiness filter drops.
tokens = [word for word in s.split(" ") if word]
tokens

['Natural',
 'language',
 'processing',
 'NLP',
 'is',
 'an',
 'area',
 'of',
 'computer',
 'science',
 'and',
 'artificial',
 'intelligence',
 'concerned',
 'with',
 'the',
 'interactions',
 'between',
 'computers',
 'and',
 'human',
 'natural',
 'languages']

In [10]:
# Print the token list with its first 0, 1, 2, 3 and 4 tokens removed.
# Reading these five lists column-wise gives the 5-grams.
for offset in range(5):
  print(tokens[offset:])


['Natural', 'language', 'processing', 'NLP', 'is', 'an', 'area', 'of', 'computer', 'science', 'and', 'artificial', 'intelligence', 'concerned', 'with', 'the', 'interactions', 'between', 'computers', 'and', 'human', 'natural', 'languages']
['language', 'processing', 'NLP', 'is', 'an', 'area', 'of', 'computer', 'science', 'and', 'artificial', 'intelligence', 'concerned', 'with', 'the', 'interactions', 'between', 'computers', 'and', 'human', 'natural', 'languages']
['processing', 'NLP', 'is', 'an', 'area', 'of', 'computer', 'science', 'and', 'artificial', 'intelligence', 'concerned', 'with', 'the', 'interactions', 'between', 'computers', 'and', 'human', 'natural', 'languages']
['NLP', 'is', 'an', 'area', 'of', 'computer', 'science', 'and', 'artificial', 'intelligence', 'concerned', 'with', 'the', 'interactions', 'between', 'computers', 'and', 'human', 'natural', 'languages']
['is', 'an', 'area', 'of', 'computer', 'science', 'and', 'artificial', 'intelligence', 'concerned', 'with', 'the', 

In [11]:
# zip stops at the shortest of the five shifted token lists, so every tuple
# it yields holds exactly five consecutive tokens.
ngrams = zip(*(tokens[start:] for start in range(5)))
print(tuple(ngrams))

(('Natural', 'language', 'processing', 'NLP', 'is'), ('language', 'processing', 'NLP', 'is', 'an'), ('processing', 'NLP', 'is', 'an', 'area'), ('NLP', 'is', 'an', 'area', 'of'), ('is', 'an', 'area', 'of', 'computer'), ('an', 'area', 'of', 'computer', 'science'), ('area', 'of', 'computer', 'science', 'and'), ('of', 'computer', 'science', 'and', 'artificial'), ('computer', 'science', 'and', 'artificial', 'intelligence'), ('science', 'and', 'artificial', 'intelligence', 'concerned'), ('and', 'artificial', 'intelligence', 'concerned', 'with'), ('artificial', 'intelligence', 'concerned', 'with', 'the'), ('intelligence', 'concerned', 'with', 'the', 'interactions'), ('concerned', 'with', 'the', 'interactions', 'between'), ('with', 'the', 'interactions', 'between', 'computers'), ('the', 'interactions', 'between', 'computers', 'and'), ('interactions', 'between', 'computers', 'and', 'human'), ('between', 'computers', 'and', 'human', 'natural'), ('computers', 'and', 'human', 'natural', 'languages

In [0]:


def ngrams_generator(s,n):
  """Return the word-level n-grams of ``s`` as space-joined strings.

  The text is lower-cased, every character that is not an ASCII letter, a
  digit or whitespace is replaced by a space, and the result is split into
  tokens on any run of whitespace.

  Args:
    s: Input text.
    n: Number of tokens in each n-gram.

  Returns:
    A list of strings, one per n-gram, in order of appearance. The list is
    empty when ``n`` is less than 1 or when ``s`` has fewer than ``n`` tokens.
  """
  s = s.lower() # convert all the text to lowercase
  s = re.sub(r'[^a-zA-Z0-9\s]',' ',s) # replace all non-alphanumeric characters with spaces
  # The regex above keeps every whitespace character, so split on any run of
  # whitespace (not just " "); otherwise tabs and newlines would stay inside
  # tokens. split() with no argument also never yields empty tokens.
  tokens = s.split()
  # zip over n progressively shifted copies of the token list; it stops at
  # the shortest copy, so each tuple holds n consecutive tokens.
  ngrams = zip(*[tokens[i:] for i in range(n)])
  return [" ".join(ngram) for ngram in ngrams]


In [13]:
# Word 5-grams of the sentence, each returned as one space-joined string.
ngrams_generator(s,n=5)

['natural language processing nlp is',
 'language processing nlp is an',
 'processing nlp is an area',
 'nlp is an area of',
 'is an area of computer',
 'an area of computer science',
 'area of computer science and',
 'of computer science and artificial',
 'computer science and artificial intelligence',
 'science and artificial intelligence concerned',
 'and artificial intelligence concerned with',
 'artificial intelligence concerned with the',
 'intelligence concerned with the interactions',
 'concerned with the interactions between',
 'with the interactions between computers',
 'the interactions between computers and',
 'interactions between computers and human',
 'between computers and human natural',
 'computers and human natural languages']

**Using NLTK Library**

In [0]:
# Start again from the raw sentence; adjacent string literals inside the
# parentheses are concatenated into one string.
s = (
    "Natural-language processing (NLP) is an area of computer science "
    "and artificial intelligence concerned with the interactions "
    "between computers and human (natural) languages."
)
    
from nltk.util import ngrams

# Lower-case the sentence and replace punctuation with spaces in one step.
s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s.lower())
# Split on single spaces and drop the empty strings left by runs of spaces.
tokens = [word for word in s.split(" ") if word]
# Materialise the result of ngrams(tokens, 5) into a list of 5-token tuples.
output = [*ngrams(tokens, 5)]

In [15]:
# Show the 5-gram tuples produced by nltk.util.ngrams.
output

[('natural', 'language', 'processing', 'nlp', 'is'),
 ('language', 'processing', 'nlp', 'is', 'an'),
 ('processing', 'nlp', 'is', 'an', 'area'),
 ('nlp', 'is', 'an', 'area', 'of'),
 ('is', 'an', 'area', 'of', 'computer'),
 ('an', 'area', 'of', 'computer', 'science'),
 ('area', 'of', 'computer', 'science', 'and'),
 ('of', 'computer', 'science', 'and', 'artificial'),
 ('computer', 'science', 'and', 'artificial', 'intelligence'),
 ('science', 'and', 'artificial', 'intelligence', 'concerned'),
 ('and', 'artificial', 'intelligence', 'concerned', 'with'),
 ('artificial', 'intelligence', 'concerned', 'with', 'the'),
 ('intelligence', 'concerned', 'with', 'the', 'interactions'),
 ('concerned', 'with', 'the', 'interactions', 'between'),
 ('with', 'the', 'interactions', 'between', 'computers'),
 ('the', 'interactions', 'between', 'computers', 'and'),
 ('interactions', 'between', 'computers', 'and', 'human'),
 ('between', 'computers', 'and', 'human', 'natural'),
 ('computers', 'and', 'human', 'na