# Assignment 1 
----

### Installing NLTK and downloading Datasets

In [10]:
!pip install nltk

#Downloading datasets
import nltk
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Tokenizing a paragraph


In [11]:
#Importing Sentence and Work Tokenizer library
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

In [12]:
text = 'With the ongoing pandemic and its consequence impact on all aspects of our lives, a major change in our daily functioning is in terms of our work routine. With the implementation of the lockdown, a lot of our working routine shifted to a mode of working from home. Depending on the nature of the work or profession, the home environment, the individual’s flexibility in adjusting working style, and many other factors, such changes are likely to have had a diverse range of ramifications, not just on the working professional, but on the work, employers as well as other family members involved in the situation.'

In [13]:
#Tokenizing to sentences
sentences = sent_tokenize(text)
sentences

['With the ongoing pandemic and its consequence impact on all aspects of our lives, a major change in our daily functioning is in terms of our work routine.',
 'With the implementation of the lockdown, a lot of our working routine shifted to a mode of working from home.',
 'Depending on the nature of the work or profession, the home environment, the individual’s flexibility in adjusting working style, and many other factors, such changes are likely to have had a diverse range of ramifications, not just on the working professional, but on the work, employers as well as other family members involved in the situation.']

In [14]:
#Tokening each sentence to word
words = []

for i in range(len(sentences)):
  words.append(word_tokenize(sentences[i]))
words

[['With',
  'the',
  'ongoing',
  'pandemic',
  'and',
  'its',
  'consequence',
  'impact',
  'on',
  'all',
  'aspects',
  'of',
  'our',
  'lives',
  ',',
  'a',
  'major',
  'change',
  'in',
  'our',
  'daily',
  'functioning',
  'is',
  'in',
  'terms',
  'of',
  'our',
  'work',
  'routine',
  '.'],
 ['With',
  'the',
  'implementation',
  'of',
  'the',
  'lockdown',
  ',',
  'a',
  'lot',
  'of',
  'our',
  'working',
  'routine',
  'shifted',
  'to',
  'a',
  'mode',
  'of',
  'working',
  'from',
  'home',
  '.'],
 ['Depending',
  'on',
  'the',
  'nature',
  'of',
  'the',
  'work',
  'or',
  'profession',
  ',',
  'the',
  'home',
  'environment',
  ',',
  'the',
  'individual',
  '’',
  's',
  'flexibility',
  'in',
  'adjusting',
  'working',
  'style',
  ',',
  'and',
  'many',
  'other',
  'factors',
  ',',
  'such',
  'changes',
  'are',
  'likely',
  'to',
  'have',
  'had',
  'a',
  'diverse',
  'range',
  'of',
  'ramifications',
  ',',
  'not',
  'just',
  'on',
 

Now, we have stored the tokenized words in a 2D array.

### Removing Punctations and Stopwords

In [15]:
#StopWord List
nltk.download('stopwords')

import string
from nltk.corpus import stopwords
eng_sw = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
filtered_words = []

#Looping word array
for i in range(len(words)):
  filtered_words.append([])
  for j in range(len(words[i])):

    word = words[i][j]

    #Check for punctuation and stopword
    if word not in string.punctuation:
      if word not in eng_sw:
        filtered_words[i].append(word)

filtered_words

[['With',
  'ongoing',
  'pandemic',
  'consequence',
  'impact',
  'aspects',
  'lives',
  'major',
  'change',
  'daily',
  'functioning',
  'terms',
  'work',
  'routine'],
 ['With',
  'implementation',
  'lockdown',
  'lot',
  'working',
  'routine',
  'shifted',
  'mode',
  'working',
  'home'],
 ['Depending',
  'nature',
  'work',
  'profession',
  'home',
  'environment',
  'individual',
  '’',
  'flexibility',
  'adjusting',
  'working',
  'style',
  'many',
  'factors',
  'changes',
  'likely',
  'diverse',
  'range',
  'ramifications',
  'working',
  'professional',
  'work',
  'employers',
  'well',
  'family',
  'members',
  'involved',
  'situation']]

## Lemmatizing and Stemming the words

-----

In [None]:
from tabulate import tabulate

In [None]:
# Lemmatization

from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

#Store the Original and Lemmatized Word Pair for each sentence
lem_res = []

#Lemmatization
for i in range(len(filtered_words)):
  lem_res.append([]);
  for j in range(len(filtered_words[i])):
    lem_res[i].append([filtered_words[i][j],wnl.lemmatize(filtered_words[i][j])])

#Compare the Original and Lemmatized Output
for i in range(len(lem_res)):
  print("Sentence {}".format(i+1))
  print(tabulate(lem_res[i],headers=['Original','Lemmatized'],tablefmt='orgtbl'))
  print("\n\n")


Sentence 1
| Original    | Lemmatized   |
|-------------+--------------|
| With        | With         |
| ongoing     | ongoing      |
| pandemic    | pandemic     |
| consequence | consequence  |
| impact      | impact       |
| aspects     | aspect       |
| lives       | life         |
| major       | major        |
| change      | change       |
| daily       | daily        |
| functioning | functioning  |
| terms       | term         |
| work        | work         |
| routine     | routine      |



Sentence 2
| Original       | Lemmatized     |
|----------------+----------------|
| With           | With           |
| implementation | implementation |
| lockdown       | lockdown       |
| lot            | lot            |
| working        | working        |
| routine        | routine        |
| shifted        | shifted        |
| mode           | mode           |
| working        | working        |
| home           | home           |



Sentence 3
| Original      | Lemmatized   |


In [None]:
# Stemming

from nltk.stem import PorterStemmer
ps = PorterStemmer()

#Store the Original and Stemmed Word Pair for each sentence
stem_res = []

#Stemming
for i in range(len(filtered_words)):
  stem_res.append([]);
  for j in range(len(filtered_words[i])):
    stem_res[i].append([filtered_words[i][j],ps.stem(filtered_words[i][j])])

#Compare the Original and Lemmatized Output
for i in range(len(stem_res)):
  print("Sentence {}".format(i+1))
  print(tabulate(stem_res[i],headers=['Original','Stemmed'],tablefmt='orgtbl'))
  print("\n\n")



Sentence 1
| Original    | Stemmed   |
|-------------+-----------|
| With        | with      |
| ongoing     | ongo      |
| pandemic    | pandem    |
| consequence | consequ   |
| impact      | impact    |
| aspects     | aspect    |
| lives       | live      |
| major       | major     |
| change      | chang     |
| daily       | daili     |
| functioning | function  |
| terms       | term      |
| work        | work      |
| routine     | routin    |



Sentence 2
| Original       | Stemmed   |
|----------------+-----------|
| With           | with      |
| implementation | implement |
| lockdown       | lockdown  |
| lot            | lot       |
| working        | work      |
| routine        | routin    |
| shifted        | shift     |
| mode           | mode      |
| working        | work      |
| home           | home      |



Sentence 3
| Original      | Stemmed    |
|---------------+------------|
| Depending     | depend     |
| nature        | natur      |
| work          |