# import Python Package

In [6]:
#!pip install nltk

In [7]:
import nltk

In [8]:
nltk.__version__

'3.8.1'

In [9]:
from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

#Punctiations removal

In [10]:
tweet= 'I have been feeling awful all days. I think I caught the flu from my brother'

In [11]:
import string

tweet_without_punc = tweet.translate(str.maketrans('', '', string.punctuation))
print(tweet_without_punc)

I have been feeling awful all days I think I caught the flu from my brother


# case Folding

In [12]:
tweet_case_fold=tweet_without_punc.lower()
print(tweet_case_fold)

i have been feeling awful all days i think i caught the flu from my brother


# Tokenization

In [13]:
tokens=word_tokenize(tweet_case_fold)
print(tokens)

['i', 'have', 'been', 'feeling', 'awful', 'all', 'days', 'i', 'think', 'i', 'caught', 'the', 'flu', 'from', 'my', 'brother']


# stop words reoval

In [14]:
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
stop_words = set(stopwords.words('english'))
print(stop_words)

{'its', 're', "you're", 'mustn', 'aren', 'weren', 'some', 'couldn', 'own', 'being', 'should', 'myself', "hadn't", 'than', 'most', 'don', 'there', 'same', "won't", 'yourself', 'will', 'her', 'very', 'how', 'i', 'me', 'what', 'that', 'are', 'a', 'on', 'doesn', 'won', 'further', 'only', 'each', 'he', "weren't", 'y', 'itself', 'the', "you'd", 'did', 'am', "mightn't", 'while', 'their', 'having', "should've", 'they', 'his', 'no', 'below', 'both', "isn't", 'and', 'off', 'by', 'in', 'over', 'yours', 'through', 'wasn', 'with', 'didn', 't', "aren't", 'hasn', 'about', 'herself', 'do', 'had', 'be', "wouldn't", "needn't", 'such', "you'll", 'which', 'before', 'from', 've', "she's", 'doing', 'under', "didn't", 'theirs', 'our', 'once', 's', 'himself', 'to', 'where', "don't", 'because', 'hers', 'll', 'does', 'as', 'we', 'themselves', "that'll", 'then', 'nor', 'my', 'whom', 'during', 'too', 'this', 'when', 'ourselves', 'more', 'an', 'were', 'can', 'yourselves', 'ain', 'if', 'again', 'not', 'all', 'up', 

In [16]:
tokens = [w for w in tokens if not w in stop_words]
print(tokens)

['feeling', 'awful', 'days', 'think', 'caught', 'flu', 'brother']


# Stemming

In [17]:
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
stems = []
for t in tokens:
    stems.append(porter.stem(t))
print(stems)

['feel', 'aw', 'day', 'think', 'caught', 'flu', 'brother']


# Lemmatization

In [18]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [19]:
from nltk.stem import WordNetLemmatizer
lemmatizer= WordNetLemmatizer()

lematized_tokens = []
for t in tokens:
    lematized_tokens.append(lemmatizer.lemmatize(t))
print(lematized_tokens)

['feeling', 'awful', 'day', 'think', 'caught', 'flu', 'brother']


# Part Of Speech

In [20]:
import nltk
nltk.download('averaged_perceptron_tagger')
print(nltk.pos_tag(tokens))

[('feeling', 'VBG'), ('awful', 'JJ'), ('days', 'NNS'), ('think', 'VBP'), ('caught', 'VBN'), ('flu', 'NNS'), ('brother', 'NN')]


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# Lemmatization using POS

In [21]:
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

punctuation = u",.?!()-_\"\'\\\n\r\t;:+*<>@#ยง^$%&|/"
stop_words_eng = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# A dictionary mapping part-of-speech tags to WordNet part-of-speech constants.
tag_dict = {"J": wn.ADJ,
            "N": wn.NOUN,
            "V": wn.VERB,
            "R": wn.ADV}

# Maps a part-of-speech tag (e.g., 'NN') to its WordNet equivalent.

def extract_wnpostag_from_postag(tag):
    #take the first letter of the tag
    #the second parameter is an "optional" in case of missing key in the dictionary

    return tag_dict.get(tag[0].upper(), None)



def lemmatize_tupla_word_postag(tupla):
    """
    giving a tupla of the form (wordString, posTagString) like ('guitar', 'NN'), return the lemmatized word
    """
    tag = extract_wnpostag_from_postag(tupla[1])
    return lemmatizer.lemmatize(tupla[0], tag) if tag is not None else tupla[0]
def bag_of_words(sentence, stop_words=None):
  # stop_words is optional
  # This checks if the stop_words parameter is provided. If not, it sets stop_words to the default set of English stop words (stop_words_eng).
    if stop_words is None:
        stop_words = stop_words_eng

    original_words = word_tokenize(sentence)

    tagged_words = nltk.pos_tag(original_words)

    # Memory Cleanup:
    original_words = None

    lemmatized_words = [ lemmatize_tupla_word_postag(ow) for ow in tagged_words ]

    # Memory Cleanup:
    tagged_words = None

    cleaned_words = [ w for w in lemmatized_words if (w not in punctuation) and (w not in stop_words) ]

    # Memory Cleanup:
    lemmatized_words = None
    return cleaned_words

In [22]:
bag_of_words(tweet)

['I', 'feel', 'awful', 'day', 'I', 'think', 'I', 'catch', 'flu', 'brother']