**Stemming**

In [1]:
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [2]:
# Download the NLTK data packages named below ('punkt', 'stopwords', 'wordnet').
# The value displayed under this cell is the return value of the last call.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' rather than
# 'punkt' when sentence-tokenizing — confirm against the installed version.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [3]:
# Sample English text that the following cells split into sentences and stem.
# NOTE(review): the literal contains what look like transcription artifacts
# ("Academy.Thank", "t ranscendent", "lastly,I"); they are left untouched
# because they are part of the input data the recorded outputs were built from.
paragraph = """Thank you all so very much. Thank you to the Academy.Thank you to all of you in this room. I have to congratulate the other incredible 
nominees this year. The Revenant was the product of the tireless efforts of an unbelievable cast and crew. First off, to my brother in this endeavor, 
Mr. Tom Hardy. Tom, your talent on screen can only be surpassed by your friendship off screen … thank you for creating a t ranscendent cinematic experience. 
Thank you to everybody at Fox and New Regency … my entire team. I have to thank everyone from the very onset of my career … To my parents; none of this 
would be possible without you. And to my friends, I love you dearly; you know who you are. And lastly,I just want to say this: Making The Revenant was 
about man's relationship to the natural world. A world that we collectively felt in 2015 as the hottest year in recorded history. Our production needed 
to move to the southern tip of this planet just to be able to find snow. Climate change is real, it is happening right now. It is the most urgent threat 
facing our entire species, and we need to work collectively together and stop procrastinating. We need to support leaders around the world who do not 
speak for the big polluters, but who speak for all of humanity, for the indigenous people of the world, for the billions and billions of underprivileged 
people out there who would be most affected by this. For our children’s children, and for those people out there whose voices have been drowned out by 
the politics of greed. I thank you all for this amazing award tonight. Let us not take this planet for granted. I do not take tonight for granted. 
Thank you so very much."""

In [4]:
# Break the raw text into a list of sentence strings.
sentences = nltk.sent_tokenize(paragraph)

In [5]:
# Single Porter stemmer instance reused for every token below.
stemmer = PorterStemmer()

In [6]:
# Replace each sentence with its stop-word-free, stemmed form.
#
# The stop-word set is built once up front: stopwords.words('english') does
# not change between tokens, so rebuilding it inside the comprehension only
# repeated the same work for every word.
stop_words = set(stopwords.words('english'))

for i, sentence in enumerate(sentences):
  words = nltk.word_tokenize(sentence)
  # The NLTK stop-word list is lowercase, so compare on the lowercased token;
  # otherwise sentence-initial words such as "I", "The" or "And" slip through.
  words = [stemmer.stem(word) for word in words if word.lower() not in stop_words]
  # Overwrite the sentence in place with the space-joined stems.
  sentences[i] = ' '.join(words)

In [7]:
# Show the first sentence after the stop-word filtering and stemming loop above.
sentences[0]

'thank much .'

**Lemmatization**

In [8]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [9]:
# Sample English text for the lemmatization cells; the literal is identical to
# the one assigned in the Stemming section.
# NOTE(review): the literal contains what look like transcription artifacts
# ("Academy.Thank", "t ranscendent", "lastly,I"); they are left untouched
# because they are part of the input data.
paragraph = """Thank you all so very much. Thank you to the Academy.Thank you to all of you in this room. I have to congratulate the other incredible 
nominees this year. The Revenant was the product of the tireless efforts of an unbelievable cast and crew. First off, to my brother in this endeavor, 
Mr. Tom Hardy. Tom, your talent on screen can only be surpassed by your friendship off screen … thank you for creating a t ranscendent cinematic experience. 
Thank you to everybody at Fox and New Regency … my entire team. I have to thank everyone from the very onset of my career … To my parents; none of this 
would be possible without you. And to my friends, I love you dearly; you know who you are. And lastly,I just want to say this: Making The Revenant was 
about man's relationship to the natural world. A world that we collectively felt in 2015 as the hottest year in recorded history. Our production needed 
to move to the southern tip of this planet just to be able to find snow. Climate change is real, it is happening right now. It is the most urgent threat 
facing our entire species, and we need to work collectively together and stop procrastinating. We need to support leaders around the world who do not 
speak for the big polluters, but who speak for all of humanity, for the indigenous people of the world, for the billions and billions of underprivileged 
people out there who would be most affected by this. For our children’s children, and for those people out there whose voices have been drowned out by 
the politics of greed. I thank you all for this amazing award tonight. Let us not take this planet for granted. I do not take tonight for granted. 
Thank you so very much."""

In [10]:
# Re-tokenize the raw text so lemmatization starts from unprocessed sentences.
sentences = nltk.sent_tokenize(paragraph)

In [11]:
# Single WordNet lemmatizer instance reused for every token below.
lemmatizer = WordNetLemmatizer()

In [12]:
# Replace each sentence with its stop-word-free, lemmatized form.
#
# The stop-word set is built once up front: stopwords.words('english') does
# not change between tokens, so rebuilding it inside the comprehension only
# repeated the same work for every word.
stop_words = set(stopwords.words('english'))

for i, sentence in enumerate(sentences):
  words = nltk.word_tokenize(sentence)
  # The NLTK stop-word list is lowercase, so compare on the lowercased token;
  # otherwise capitalised stop words ("I", "The", "And", ...) are kept.
  # The token itself is passed to the lemmatizer unchanged, as before.
  words = [lemmatizer.lemmatize(word) for word in words if word.lower() not in stop_words]
  # Overwrite the sentence in place with the space-joined lemmas.
  sentences[i] = ' '.join(words)

**Bag of Words**

In [13]:
import nltk

In [14]:
# Sample English text for the Bag of Words cells; the literal is identical to
# the one assigned in the Stemming section.
# NOTE(review): the literal contains what look like transcription artifacts
# ("Academy.Thank", "t ranscendent", "lastly,I"); they are left untouched
# because they are part of the input data.
paragraph = """Thank you all so very much. Thank you to the Academy.Thank you to all of you in this room. I have to congratulate the other incredible 
nominees this year. The Revenant was the product of the tireless efforts of an unbelievable cast and crew. First off, to my brother in this endeavor, 
Mr. Tom Hardy. Tom, your talent on screen can only be surpassed by your friendship off screen … thank you for creating a t ranscendent cinematic experience. 
Thank you to everybody at Fox and New Regency … my entire team. I have to thank everyone from the very onset of my career … To my parents; none of this 
would be possible without you. And to my friends, I love you dearly; you know who you are. And lastly,I just want to say this: Making The Revenant was 
about man's relationship to the natural world. A world that we collectively felt in 2015 as the hottest year in recorded history. Our production needed 
to move to the southern tip of this planet just to be able to find snow. Climate change is real, it is happening right now. It is the most urgent threat 
facing our entire species, and we need to work collectively together and stop procrastinating. We need to support leaders around the world who do not 
speak for the big polluters, but who speak for all of humanity, for the indigenous people of the world, for the billions and billions of underprivileged 
people out there who would be most affected by this. For our children’s children, and for those people out there whose voices have been drowned out by 
the politics of greed. I thank you all for this amazing award tonight. Let us not take this planet for granted. I do not take tonight for granted. 
Thank you so very much."""

In [15]:
import re
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [16]:
# Porter stemmer used by the Bag of Words preprocessing loop.
stemmer = PorterStemmer()

In [17]:
# WordNet lemmatizer instance.
# NOTE(review): the Bag of Words loop below only calls the stemmer — confirm
# whether this object is still needed.
lemmatizer = WordNetLemmatizer()

In [18]:
# Split the raw text into sentences; each one becomes a document in the corpus.
sentences = nltk.sent_tokenize(paragraph)

In [19]:
# Accumulator for the cleaned sentences appended by the loop below.
corpus = []

In [20]:
# Clean every sentence and append the result to `corpus`:
#   1. replace every non-letter character with a space,
#   2. lowercase,
#   3. split on whitespace,
#   4. drop English stop words and stem what remains,
#   5. re-join the stems into one space-separated string.
#
# Built once: the stop-word list does not change between tokens.
stop_words = set(stopwords.words('english'))

for i in range(len(sentences)):
  review = re.sub('[^a-zA-Z]', ' ', sentences[i])
  # str.lower() / str.split() return new objects; the results must be
  # assigned back or the calls have no effect.
  review = review.lower()
  review = review.split()
  # Iterate over this sentence's own tokens (`review`), not over a `words`
  # variable left behind by an earlier cell.
  review = [stemmer.stem(word) for word in review if word not in stop_words]
  review = ' '.join(review)
  corpus.append(review)

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
# Fit a default CountVectorizer on the cleaned corpus and keep the result as a
# dense array in X.
cv = CountVectorizer()
X = (
    cv.fit_transform(corpus)
    .toarray()
)

**TF-IDF**

In [22]:
import nltk

In [23]:
# Sample English text for the TF-IDF cells; the literal is identical to the
# one assigned in the Stemming section.
# NOTE(review): the literal contains what look like transcription artifacts
# ("Academy.Thank", "t ranscendent", "lastly,I"); they are left untouched
# because they are part of the input data the recorded output was built from.
paragraph = """Thank you all so very much. Thank you to the Academy.Thank you to all of you in this room. I have to congratulate the other incredible 
nominees this year. The Revenant was the product of the tireless efforts of an unbelievable cast and crew. First off, to my brother in this endeavor, 
Mr. Tom Hardy. Tom, your talent on screen can only be surpassed by your friendship off screen … thank you for creating a t ranscendent cinematic experience. 
Thank you to everybody at Fox and New Regency … my entire team. I have to thank everyone from the very onset of my career … To my parents; none of this 
would be possible without you. And to my friends, I love you dearly; you know who you are. And lastly,I just want to say this: Making The Revenant was 
about man's relationship to the natural world. A world that we collectively felt in 2015 as the hottest year in recorded history. Our production needed 
to move to the southern tip of this planet just to be able to find snow. Climate change is real, it is happening right now. It is the most urgent threat 
facing our entire species, and we need to work collectively together and stop procrastinating. We need to support leaders around the world who do not 
speak for the big polluters, but who speak for all of humanity, for the indigenous people of the world, for the billions and billions of underprivileged 
people out there who would be most affected by this. For our children’s children, and for those people out there whose voices have been drowned out by 
the politics of greed. I thank you all for this amazing award tonight. Let us not take this planet for granted. I do not take tonight for granted. 
Thank you so very much."""

In [24]:
import re
from nltk.stem import PorterStemmer,WordNetLemmatizer
from nltk.corpus import stopwords

In [25]:
# Split the raw text into sentences; each one becomes a TF-IDF document.
sentences = nltk.sent_tokenize(paragraph)

In [26]:
# Stemmer used by the TF-IDF preprocessing loop.
ps = PorterStemmer()
# NOTE(review): `wordnet` is not referenced by the TF-IDF cells shown here —
# confirm whether it is still needed.
wordnet = WordNetLemmatizer()

In [27]:
# Accumulator for the cleaned sentences appended by the loop below.
corpus = []

In [28]:
# Clean every sentence and append the result to `corpus`: strip non-letters,
# lowercase, split on whitespace, drop English stop words, stem, re-join.
#
# The stop-word set is hoisted out of the loop; the original rebuilt it for
# every single token even though it never changes. Results are unaffected.
stop_words = set(stopwords.words('english'))

for i in range(len(sentences)):
  review = re.sub('[^a-zA-Z]', ' ', sentences[i])
  review = review.lower()
  review = review.split()
  review = [ps.stem(word) for word in review if word not in stop_words]
  review = ' '.join(review)
  corpus.append(review)

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit a default TfidfVectorizer on the cleaned corpus and keep the result as a
# dense array in X.
cv = TfidfVectorizer()
X = (
    cv.fit_transform(corpus)
    .toarray()
)

In [30]:
# Display the dense array returned by TfidfVectorizer.fit_transform(corpus).toarray().
X

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.54431266, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.45255581],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])