<a href="https://colab.research.google.com/github/SaiMithunPunna/AIDS/blob/main/NLP_Intro.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk

In [None]:
# Fetch the 'punkt_tab' resource with NLTK's downloader. Presumably this is the
# tokenizer data that the sent_tokenize / word_tokenize calls rely on — confirm.
# As the last expression in the cell, the call's return value is displayed.
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
# Fetch the 'wordnet' corpus with NLTK's downloader. Presumably this is the
# data WordNetLemmatizer reads during the lemmatization step — confirm.
# As the last expression in the cell, the call's return value is displayed.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
import spacy
from nltk.stem import PorterStemmer , WordNetLemmatizer
from nltk.tokenize import sent_tokenize , word_tokenize

In [None]:
# Load the spaCy pipeline once; `nlp` is called on the sample text in the
# POS-tagging cell, and the resulting document is reused by later cells.
spacy_model_name = "en_core_web_sm"
nlp = spacy.load(spacy_model_name)

In [None]:
# Sample input for the walkthrough: it is passed to sent_tokenize and to the
# spaCy pipeline in later cells. Note the second sentence has no closing period.
text = (
    "John works at Google in California. "
    "He loves Programming and playing football"
)

In [None]:
# 1. Segmentation: split the raw text into a list of sentence strings.
sentences = sent_tokenize(text)
print(f"Segmentation: {sentences}")

Segmentation: ['John works at Google in California.', 'He loves Programming and playing football']


In [None]:
# 2. Tokenization: break every sentence into word/punctuation tokens,
# keeping one inner list per sentence.
tokens = list(map(word_tokenize, sentences))
print(f"Tokenization: {tokens}")

Tokenization: [['John', 'works', 'at', 'Google', 'in', 'California', '.'], ['He', 'loves', 'Programming', 'and', 'playing', 'football']]


In [None]:
# 3. Stemming: run each token through the Porter stemmer, preserving the
# per-sentence nesting of `tokens`. In the recorded output the stems come back
# lowercased and are not always dictionary words (e.g. 'googl', 'footbal').
stemmer = PorterStemmer()
stemmed_tokens = [list(map(stemmer.stem, words)) for words in tokens]
print(f"Stemming: {stemmed_tokens}")

Stemming: [['john', 'work', 'at', 'googl', 'in', 'california', '.'], ['he', 'love', 'program', 'and', 'play', 'footbal']]


In [None]:
# 4. Lemmatization: map each token to its WordNet lemma, preserving the
# per-sentence nesting of `tokens`.
# NOTE(review): lemmatize() is called with the token only, so NLTK's default
# part of speech applies (noun, per the NLTK docs — confirm). That is likely
# why 'playing' is left unchanged in the recorded output; pass an explicit
# POS if verb lemmas are wanted.
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [list(map(lemmatizer.lemmatize, words)) for words in tokens]
print(f"Lemmatization: {lemmatized_tokens}")

Lemmatization: [['John', 'work', 'at', 'Google', 'in', 'California', '.'], ['He', 'love', 'Programming', 'and', 'playing', 'football']]


In [None]:
# 5. POS tagging: run the full spaCy pipeline over the raw text, then pair
# each token's surface form with its coarse part-of-speech tag.
# `doc` is kept at module level because the NER and parsing cells reuse it.
doc = nlp(text)
pos_tags = [(tok.text, tok.pos_) for tok in doc]
print(f"POS Tagging: {pos_tags}")

POS Tagging: [('John', 'PROPN'), ('works', 'VERB'), ('at', 'ADP'), ('Google', 'PROPN'), ('in', 'ADP'), ('California', 'PROPN'), ('.', 'PUNCT'), ('He', 'PRON'), ('loves', 'VERB'), ('Programming', 'PROPN'), ('and', 'CCONJ'), ('playing', 'VERB'), ('football', 'NOUN')]


In [None]:
# 6. Named Entity Recognition (NER): list every entity span found in `doc`
# (produced by the POS-tagging cell) as a (text, label) pair.
entities = [(span.text, span.label_) for span in doc.ents]
print(f"Named Entities: {entities}")

Named Entities: [('John', 'PERSON'), ('Google', 'ORG'), ('California', 'GPE'), ('Programming', 'ORG')]


In [None]:
# 7. Parsing (dependency parsing)
# Print one row per token, sentence by sentence: token -> relation -> head.
# The first two columns are left-aligned and padded to 10 characters; longer
# values are not truncated, so an 11-character token such as 'Programming'
# pushes the rest of its row one column to the right.
row_template = "{:10} -> {:10} -> {}"
for sent in doc.sents:
    for token in sent:
        print(row_template.format(token.text, token.dep_, token.head.text))


John       -> nsubj      -> works
works      -> ROOT       -> works
at         -> prep       -> works
Google     -> pobj       -> at
in         -> prep       -> works
California -> pobj       -> in
.          -> punct      -> works
He         -> nsubj      -> loves
loves      -> ROOT       -> loves
Programming -> dobj       -> loves
and        -> cc         -> Programming
playing    -> conj       -> Programming
football   -> dobj       -> playing
