## Datasets

In [None]:
# pip install nltk
# Download the NLTK data packages that the examples below rely on.
import nltk

# The first five names are the packages this notebook was originally written
# against. NLTK 3.9+ loads the tokenizer, tagger and chunker from renamed
# packages (the last three names), so those are fetched as well; without them
# word_tokenize / pos_tag / ne_chunk raise LookupError on current releases.
for package in (
    'punkt',
    'averaged_perceptron_tagger',
    'maxent_ne_chunker',
    'words',
    'vader_lexicon',
    'punkt_tab',
    'averaged_perceptron_tagger_eng',
    'maxent_ne_chunker_tab',
):
    nltk.download(package)



# punkt
#
# Example: identify and separate the sentences (and words) of a text.
# Purpose: punkt is a tokenizer that splits text into sentences and words.
# Use case: sentence splitting and word tokenization, both essential steps in text preprocessing.
from nltk.tokenize import sent_tokenize, word_tokenize

text = "Hello! How are you? I am fine."

sentence_tokens = sent_tokenize(text)
print(sentence_tokens)  # Output: ['Hello!', 'How are you?', 'I am fine.']

word_tokens = word_tokenize(text)
print(word_tokens)  # Output: ['Hello', '!', 'How', 'are', 'you', '?', 'I', 'am', 'fine', '.']



# averaged_perceptron_tagger
#
# Purpose: a part-of-speech (POS) tagger.
# Use case: labels each word of a sentence with its part of speech (e.g., noun, verb, adjective).
from nltk import pos_tag, word_tokenize

words = word_tokenize("Natural language processing is fascinating.")
tagged_words = pos_tag(words)
print(tagged_words)  # Output: [('Natural', 'JJ'), ('language', 'NN'), ('processing', 'NN'), ('is', 'VBZ'), ('fascinating', 'VBG'), ('.', '.')]


# maxent_ne_chunker
#
# Purpose: Named Entity Recognition (NER).
# Use case: identifies named entities (e.g., person names, organizations, locations) in text.
from nltk import ne_chunk, pos_tag, word_tokenize

# ne_chunk works on POS-tagged tokens, so tokenize and tag first.
words = word_tokenize("Amit is looking Good.")
pos_tags = pos_tag(words)
entity_tree = ne_chunk(pos_tags)
print(entity_tree)  # Output: (S (GPE Amit/NNP) is/VBZ looking/VBG (PERSON Good/NNP) ./.)


# words
#
# Purpose: a corpus that contains a list of English words.
# Use case: often used for spell checking, word validation, or other linguistic tasks.
# NOTE: this import rebinds the name `words`, which the earlier examples used for token lists.
from nltk.corpus import words

first_ten_words = words.words()[:10]
print(first_ten_words)  # Output: ['A', 'a', 'aa', 'aal', 'aalii', 'aam', 'Aani', 'aardvark', 'aardwolf', 'Aaron']


# vader_lexicon
#
# Purpose: VADER (Valence Aware Dictionary and sEntiment Reasoner) is a sentiment analysis tool.
# Use case: sentiment analysis, particularly of social media text where emojis, slang, and punctuation matter.
from nltk.sentiment import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()
text = "NLTK is a fantastic library!"
scores = sid.polarity_scores(text)
print(scores)  # Output: {'neg': 0.0, 'neu': 0.435, 'pos': 0.565, 'compound': 0.5983}



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


['Hello!', 'How are you?', 'I am fine.']
['Hello', '!', 'How', 'are', 'you', '?', 'I', 'am', 'fine', '.']
[('Natural', 'JJ'), ('language', 'NN'), ('processing', 'NN'), ('is', 'VBZ'), ('fascinating', 'VBG'), ('.', '.')]
(S (GPE Amit/NNP) is/VBZ looking/VBG (PERSON Good/NNP) ./.)
['A', 'a', 'aa', 'aal', 'aalii', 'aam', 'Aani', 'aardvark', 'aardwolf', 'Aaron']
{'neg': 0.0, 'neu': 0.435, 'pos': 0.565, 'compound': 0.5983}


# Tokenization ........................................................

In [None]:
# Tokenization is the process of splitting text into individual sentences or words.
# The import below brings in sent_tokenize and word_tokenize, which split text
# into sentences and into words respectively.
from nltk import sent_tokenize, word_tokenize

text = "Hello, how are you? I hope you are doing well. NLP is fun!"

# Sentence tokenization
# sent_tokenize(text) splits the input into individual sentences, using punctuation
# such as periods, exclamation marks and question marks as cues.
# Expected result for the text above:
#   ['Hello, how are you?', 'I hope you are doing well.', 'NLP is fun!']
# Each sentence is a separate string in the list.
sentences = sent_tokenize(text)

# Word tokenization
# word_tokenize(text) splits the input into individual words and punctuation marks.
# Expected result for the text above:
#   ['Hello', ',', 'how', 'are', 'you', '?', 'I', 'hope', 'you', 'are', 'doing', 'well', '.', 'NLP', 'is', 'fun', '!']
# Each word and each punctuation mark is a separate string in the list.
words = word_tokenize(text)

print("Sentences:", sentences)
print("Words:", words)




# POS Tagging .........................................................................

In [None]:
# Part-of-speech (POS) tagging marks each word in a text with the part of speech it corresponds to.
# The import brings in word_tokenize (tokenization) and pos_tag (POS tagging) from
# NLTK (Natural Language Toolkit).
from nltk import pos_tag, word_tokenize

text = "Hello, how are you? I hope you are doing well. NLP is fun!"

# pos_tag is given the token list, so the text is tokenized first.
words = word_tokenize(text)

# Assign a part-of-speech tag to every token.
pos_tags = pos_tag(words)
print("POS Tags:", pos_tags)


# Named Entity Tagging

In [None]:

# Named Entity Recognition (NER) identifies and categorizes key information (entities) in text.
# Three NLTK functions are needed: word_tokenize to split the text into tokens,
# pos_tag to tag each token with its part of speech, and ne_chunk to perform NER.
from nltk import ne_chunk, pos_tag, word_tokenize

text = "Barack Obama was born in Hawaii. He was elected president in 2008."

# Step 1 - word_tokenize(text): split the text into words and punctuation marks.
#   e.g. ['Barack', 'Obama', 'was', 'born', ..., '.']
words = word_tokenize(text)

# Step 2 - pos_tag(words): pair every token with its part-of-speech tag.
#   e.g. [('Barack', 'NNP'), ('Obama', 'NNP'), ..., ('.', '.')]
pos_tags = pos_tag(words)

# Step 3 - ne_chunk(pos_tags): run Named Entity Recognition on the tagged tokens.
# The result is a nested Tree: recognized entities become labelled subtrees such as
# PERSON or GPE (geopolitical entity), while all other tokens stay as (word, tag) leaves.
# In the recorded output of the all-in-one cell, the same opening sentence is chunked as
#   (PERSON Barack/NNP) (PERSON Obama/NNP) was/VBD born/VBN in/IN (GPE Hawaii/NNP) ...
# NOTE(review): exact chunks depend on the installed model - confirm by running the cell.
named_entities = ne_chunk(pos_tags)
print("Named Entities:", named_entities)


# Sentiment Analysis

In [None]:
# Sentiment analysis determines the sentiment or emotion expressed in a text.
# SentimentIntensityAnalyzer is a pre-built NLTK tool that analyzes and quantifies
# the sentiment expressed in a piece of text.
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Create the analyzer object that will score the text.
sia = SentimentIntensityAnalyzer()

text = "I love this product! It's amazing."

# sia.polarity_scores(text) returns a dictionary of sentiment scores with the keys
# 'neg', 'neu', 'pos' and 'compound'.
# Example output given for this text:
#   {'neg': 0.0, 'neu': 0.297, 'pos': 0.703, 'compound': 0.7351}
sentiment = sia.polarity_scores(text)
print("Sentiment:", sentiment)


# NLP All in One

In [None]:
#Merged together
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag, ne_chunk
from nltk.sentiment import SentimentIntensityAnalyzer

# Download required NLTK data.
# The first five names are the packages this notebook was originally written
# against. NLTK 3.9+ loads the tokenizer, tagger and chunker from renamed
# packages (the last three names), so those are fetched as well; without them
# word_tokenize / pos_tag / ne_chunk raise LookupError on current releases.
for package in (
    'punkt',
    'averaged_perceptron_tagger',
    'maxent_ne_chunker',
    'words',
    'vader_lexicon',
    'punkt_tab',
    'averaged_perceptron_tagger_eng',
    'maxent_ne_chunker_tab',
):
    nltk.download(package)

# Sample text run through every step of the pipeline.
text = "Barack Obama was born in Hawaii. He was elected president in 2008. I love this product! It's amazing."

# Each step prints its result followed by one blank line (end="\n\n").

# Step 1: split the text into sentences.
sentences = sent_tokenize(text)
print("Sentences:", sentences, end="\n\n")

# Step 2: split the text into word and punctuation tokens.
words = word_tokenize(text)
print("Words:", words, end="\n\n")

# Step 3: tag every token with its part of speech.
pos_tags = pos_tag(words)
print("POS Tags:", pos_tags, end="\n\n")

# Step 4: chunk the tagged tokens into named entities.
named_entities = ne_chunk(pos_tags)
print("Named Entities:", named_entities, end="\n\n")

# Step 5: score the sentiment of the whole text.
sia = SentimentIntensityAnalyzer()
sentiment = sia.polarity_scores(text)
print("Sentiment:", sentiment, end="\n\n")


Sentences: ['Barack Obama was born in Hawaii.', 'He was elected president in 2008.', 'I love this product!', "It's amazing."]

Words: ['Barack', 'Obama', 'was', 'born', 'in', 'Hawaii', '.', 'He', 'was', 'elected', 'president', 'in', '2008', '.', 'I', 'love', 'this', 'product', '!', 'It', "'s", 'amazing', '.']

POS Tags: [('Barack', 'NNP'), ('Obama', 'NNP'), ('was', 'VBD'), ('born', 'VBN'), ('in', 'IN'), ('Hawaii', 'NNP'), ('.', '.'), ('He', 'PRP'), ('was', 'VBD'), ('elected', 'VBN'), ('president', 'NN'), ('in', 'IN'), ('2008', 'CD'), ('.', '.'), ('I', 'PRP'), ('love', 'VBP'), ('this', 'DT'), ('product', 'NN'), ('!', '.'), ('It', 'PRP'), ("'s", 'VBZ'), ('amazing', 'JJ'), ('.', '.')]

Named Entities: (S
  (PERSON Barack/NNP)
  (PERSON Obama/NNP)
  was/VBD
  born/VBN
  in/IN
  (GPE Hawaii/NNP)
  ./.
  He/PRP
  was/VBD
  elected/VBN
  president/NN
  in/IN
  2008/CD
  ./.
  I/PRP
  love/VBP
  this/DT
  product/NN
  !/.
  It/PRP
  's/VBZ
  amazing/JJ
  ./.)

Sentiment: {'neg': 0.0, 'neu': 0.

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
