## Text Preprocessing 

In [1]:
import nltk

In [2]:
p="""Global warming is a phenomenon where the earth’s average temperature rises up due to increased amounts of greenhouse gases. Greenhouse gases such as carbon dioxide, methane and ozone trap the incoming radiation from the sun. This effect creates a natural “blanket” which prevents the heat from escaping back into the atmosphere. This effect is called a greenhouse effect.Contrary to popular belief, greenhouse gases are not inherently bad.  In fact, the greenhouse effect is quite important for life on earth. Without this effect, the sun’s radiation would be reflected back into the atmosphere, freezing the surface and making life impossible. However, when greenhouse gasses in excess amounts get trapped, serious repercussions begin to appear. The polar ice caps begin to melt, leading to the rise in sea levels. Furthermore, the greenhouse effect is accelerated when polar ice caps and sea ice melts. This is due to the fact the ice reflects 50% to 70% of the sun’s rays back into space; but without ice, the solar radiation gets absorbed. Seawater reflects only 6% of the sun’s radiation back into space. What’s more frightening is the fact that the poles contain large amounts of carbon dioxide trapped within the ice. If this ice melts, it will significantly contribute to global warming."""

## Tokenization 

### sentence tokenization

In [3]:
sentence1=nltk.sent_tokenize(p)

In [4]:
sentence1

['Global warming is a phenomenon where the earth’s average temperature rises up due to increased amounts of greenhouse gases.',
 'Greenhouse gases such as carbon dioxide, methane and ozone trap the incoming radiation from the sun.',
 'This effect creates a natural “blanket” which prevents the heat from escaping back into the atmosphere.',
 'This effect is called a greenhouse effect.Contrary to popular belief, greenhouse gases are not inherently bad.',
 'In fact, the greenhouse effect is quite important for life on earth.',
 'Without this effect, the sun’s radiation would be reflected back into the atmosphere, freezing the surface and making life impossible.',
 'However, when greenhouse gasses in excess amounts get trapped, serious repercussions begin to appear.',
 'The polar ice caps begin to melt, leading to the rise in sea levels.',
 'Furthermore, the greenhouse effect is accelerated when polar ice caps and sea ice melts.',
 'This is due to the fact the ice reflects 50% to 70% of the

In [5]:
len(sentence1)

13

### word tokenization

In [6]:
word=nltk.word_tokenize(p)
word

['Global',
 'warming',
 'is',
 'a',
 'phenomenon',
 'where',
 'the',
 'earth',
 '’',
 's',
 'average',
 'temperature',
 'rises',
 'up',
 'due',
 'to',
 'increased',
 'amounts',
 'of',
 'greenhouse',
 'gases',
 '.',
 'Greenhouse',
 'gases',
 'such',
 'as',
 'carbon',
 'dioxide',
 ',',
 'methane',
 'and',
 'ozone',
 'trap',
 'the',
 'incoming',
 'radiation',
 'from',
 'the',
 'sun',
 '.',
 'This',
 'effect',
 'creates',
 'a',
 'natural',
 '“',
 'blanket',
 '”',
 'which',
 'prevents',
 'the',
 'heat',
 'from',
 'escaping',
 'back',
 'into',
 'the',
 'atmosphere',
 '.',
 'This',
 'effect',
 'is',
 'called',
 'a',
 'greenhouse',
 'effect.Contrary',
 'to',
 'popular',
 'belief',
 ',',
 'greenhouse',
 'gases',
 'are',
 'not',
 'inherently',
 'bad',
 '.',
 'In',
 'fact',
 ',',
 'the',
 'greenhouse',
 'effect',
 'is',
 'quite',
 'important',
 'for',
 'life',
 'on',
 'earth',
 '.',
 'Without',
 'this',
 'effect',
 ',',
 'the',
 'sun',
 '’',
 's',
 'radiation',
 'would',
 'be',
 'reflected',
 'ba

In [7]:
len(word)

249

### using regular expression

In [8]:
import re

In [9]:
s=r"[?.]"
re.split(s,p)

['Global warming is a phenomenon where the earth’s average temperature rises up due to increased amounts of greenhouse gases',
 ' Greenhouse gases such as carbon dioxide, methane and ozone trap the incoming radiation from the sun',
 ' This effect creates a natural “blanket” which prevents the heat from escaping back into the atmosphere',
 ' This effect is called a greenhouse effect',
 'Contrary to popular belief, greenhouse gases are not inherently bad',
 '  In fact, the greenhouse effect is quite important for life on earth',
 ' Without this effect, the sun’s radiation would be reflected back into the atmosphere, freezing the surface and making life impossible',
 ' However, when greenhouse gasses in excess amounts get trapped, serious repercussions begin to appear',
 ' The polar ice caps begin to melt, leading to the rise in sea levels',
 ' Furthermore, the greenhouse effect is accelerated when polar ice caps and sea ice melts',
 ' This is due to the fact the ice reflects 50% to 70% o

In [10]:
pattern=r"[A-Z]\w+"
re.findall(pattern,p)

['Global',
 'Greenhouse',
 'This',
 'This',
 'Contrary',
 'In',
 'Without',
 'However',
 'The',
 'Furthermore',
 'This',
 'Seawater',
 'What',
 'If']

In [11]:
r="Hello! How are you? Have a Great Day."
re.sub("Hello","Hi",r)

'Hi! How are you? Have a Great Day.'

## Stemming

In [12]:
from nltk.stem import PorterStemmer

In [13]:
stemmer=PorterStemmer()

In [14]:
stemmer.stem("playing")

'play'

In [15]:
stemmer.stem("working")

'work'

In [16]:
stemmer.stem("freezing")

'freez'

In [17]:
for i in range(len(sentence1)):
    words=nltk.word_tokenize(sentence1[i])
    words=[stemmer.stem(word) for word in words]
    sentence1[i]=" ".join(words)
sentence1

['global warm is a phenomenon where the earth ’ s averag temperatur rise up due to increas amount of greenhous gase .',
 'greenhous gase such as carbon dioxid , methan and ozon trap the incom radiat from the sun .',
 'thi effect creat a natur “ blanket ” which prevent the heat from escap back into the atmospher .',
 'thi effect is call a greenhous effect.contrari to popular belief , greenhous gase are not inher bad .',
 'In fact , the greenhous effect is quit import for life on earth .',
 'without thi effect , the sun ’ s radiat would be reflect back into the atmospher , freez the surfac and make life imposs .',
 'howev , when greenhous gass in excess amount get trap , seriou repercuss begin to appear .',
 'the polar ice cap begin to melt , lead to the rise in sea level .',
 'furthermor , the greenhous effect is acceler when polar ice cap and sea ice melt .',
 'thi is due to the fact the ice reflect 50 % to 70 % of the sun ’ s ray back into space ; but without ice , the solar radiat ge

### Stopwords

In [18]:
from nltk.corpus import stopwords

In [19]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [20]:
Sent1="Hello! What are you doing? Nice to meet you."
words=nltk.word_tokenize(Sent1)
for i in words:
    if i not in stopwords.words("english"):
        print(i)

Hello
!
What
?
Nice
meet
.


### Part of Speech tag  

In [21]:
from nltk import pos_tag

In [22]:
pos_tag(words)

[('Hello', 'NN'),
 ('!', '.'),
 ('What', 'WP'),
 ('are', 'VBP'),
 ('you', 'PRP'),
 ('doing', 'VBG'),
 ('?', '.'),
 ('Nice', 'NNP'),
 ('to', 'TO'),
 ('meet', 'VB'),
 ('you', 'PRP'),
 ('.', '.')]

# Lemmatization

In [23]:
from nltk.stem import WordNetLemmatizer

In [24]:
lemm=WordNetLemmatizer()

In [25]:
lemm.lemmatize("running",pos="v")

'run'

## N-grams

In [92]:
from nltk import ngrams

In [93]:
sentence="This is an example"
words=nltk.word_tokenize(sentence)

In [28]:
for i in ngrams(words,3):    # tri grams example   n=3 
    print(i)

('This', 'is', 'an')
('is', 'an', 'example')


### Creating Vectors from Text

## Bag of Words
<br>
creates a set of vectors containing the count of word occurrences in the document

In [73]:
p1="Hello, This is line one one. This is second line. This is example of Bag of words."

In [74]:
sentence1=nltk.sent_tokenize(p1)

In [75]:
data=[]
for i in range(len(sentence1)):
    new=re.sub('[^a-zA-Z]'," ",sentence1[i])
    new=new.lower()
    new=new.split()
    new=[i for i in new if i not in set(stopwords.words('english'))]
    new=" ".join(new)
    data.append(new)
data

['hello line one one', 'second line', 'example bag words']

In [76]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer()

In [77]:
r=cv.fit_transform(data).toarray()

In [78]:
import pandas as pd

In [79]:
r=pd.DataFrame(r)

In [80]:
r

Unnamed: 0,0,1,2,3,4,5,6
0,0,0,1,1,2,0,0
1,0,0,0,1,0,1,0
2,1,1,0,0,0,0,1


### Drawback of BoW is that it convert to data where all features have equal importance and TF-IDF is used to overcome this problem

## TF-IDF  Term Frequency-Inverse Document Frequency

tf=no. of particular word in sentence / Total number of words in sentence <br>
idf=no. of sentences / no. of sentences containing particular word <br>
resultant tfidf = tf * idf

In [87]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [88]:
tfidf=TfidfVectorizer()

In [89]:
z=tfidf.fit_transform(data).toarray()

In [90]:
z=pd.DataFrame(z)
z

Unnamed: 0,0,1,2,3,4,5,6
0,0.0,0.0,0.423394,0.322002,0.846789,0.0,0.0
1,0.0,0.0,0.0,0.605349,0.0,0.795961,0.0
2,0.57735,0.57735,0.0,0.0,0.0,0.0,0.57735
