<a href="https://colab.research.google.com/github/Venushri/sam2/blob/main/NLP_Project1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## PROJECT ON BUILDING A LIBRARY USING PYTHON

Step 1: Install and Import Dependencies

In [None]:
import nltk
import re
import spacy
import heapq
import string
import sentencepiece as spm

In [None]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tokenize import WhitespaceTokenizer, TweetTokenizer
from nltk.tokenize import regexp_tokenize
from nltk.tokenize import SyllableTokenizer
from nltk.stem import WordNetLemmatizer, PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords

In [None]:
from collections import Counter
from transformers import AutoTokenizer
from tokenizers import ByteLevelBPETokenizer
from textblob import TextBlob

Step 2: Download necessary data for NLTK

In [None]:
# Fetch the NLTK data packages for this notebook. The recorded output below
# shows packages that are already present being reported as up to date.
# NOTE(review): 'punkt_tab' / 'punkt' are presumably the sentence-tokenizer
# models behind word_tokenize / sent_tokenize — confirm for the NLTK version in use.
nltk.download('punkt_tab')
nltk.download('punkt')
# Read by stopwords.words('english') in the stopword-removal cell.
nltk.download('stopwords')
# NOTE(review): presumably required by WordNetLemmatizer — confirm.
nltk.download('wordnet')
# The three packages below are not referenced by any cell visible in this
# notebook (no POS-tagging or NE-chunking call appears) — verify they are needed.
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

Step 3: Define Sample Text

In [None]:
# Sample paragraph (four sentences) shared by the tokenization and text-processing cells below.
text = "Tokenization is an important step in NLP! It helps process text efficiently. Without tokenization, handling text would be much more difficult. Different tokenization methods serve different purposes."


# **Tokenization**
Tokenization is the process of breaking down text into smaller components, typically words, phrases, or subwords, to facilitate analysis or processing by a machine.

a. Word Tokenization

In [None]:
# Word-level tokens from NLTK's word_tokenize; the recorded output shows
# punctuation marks ('!', ',', '.') emitted as separate tokens.
word_tokens = word_tokenize(text)
print("Word Tokenization:", word_tokens)


Word Tokenization: ['Tokenization', 'is', 'an', 'important', 'step', 'in', 'NLP', '!', 'It', 'helps', 'process', 'text', 'efficiently', '.', 'Without', 'tokenization', ',', 'handling', 'text', 'would', 'be', 'much', 'more', 'difficult', '.', 'Different', 'tokenization', 'methods', 'serve', 'different', 'purposes', '.']


 b. Sentence Tokenization

In [None]:
# Sentence-level split; the recorded output shows the sample yielding four sentences.
sentence_tokens = sent_tokenize(text)
print("Sentence Tokenization:", sentence_tokens)

Sentence Tokenization: ['Tokenization is an important step in NLP!', 'It helps process text efficiently.', 'Without tokenization, handling text would be much more difficult.', 'Different tokenization methods serve different purposes.']


c. Whitespace Tokenization

In [None]:
# Split on whitespace only, so punctuation stays attached to its word
# (e.g. 'NLP!' and 'efficiently.' in the recorded output).
whitespace_tokens = WhitespaceTokenizer().tokenize(text)
print("Whitespace Tokenization:", whitespace_tokens)

Whitespace Tokenization: ['Tokenization', 'is', 'an', 'important', 'step', 'in', 'NLP!', 'It', 'helps', 'process', 'text', 'efficiently.', 'Without', 'tokenization,', 'handling', 'text', 'would', 'be', 'much', 'more', 'difficult.', 'Different', 'tokenization', 'methods', 'serve', 'different', 'purposes.']


d. Character Tokenization

In [None]:
# list() over a str yields one element per character, so spaces and
# punctuation appear as tokens too.
char_tokens = list(text)
print("Character Tokenization:", char_tokens)

Character Tokenization: ['T', 'o', 'k', 'e', 'n', 'i', 'z', 'a', 't', 'i', 'o', 'n', ' ', 'i', 's', ' ', 'a', 'n', ' ', 'i', 'm', 'p', 'o', 'r', 't', 'a', 'n', 't', ' ', 's', 't', 'e', 'p', ' ', 'i', 'n', ' ', 'N', 'L', 'P', '!', ' ', 'I', 't', ' ', 'h', 'e', 'l', 'p', 's', ' ', 'p', 'r', 'o', 'c', 'e', 's', 's', ' ', 't', 'e', 'x', 't', ' ', 'e', 'f', 'f', 'i', 'c', 'i', 'e', 'n', 't', 'l', 'y', '.', ' ', 'W', 'i', 't', 'h', 'o', 'u', 't', ' ', 't', 'o', 'k', 'e', 'n', 'i', 'z', 'a', 't', 'i', 'o', 'n', ',', ' ', 'h', 'a', 'n', 'd', 'l', 'i', 'n', 'g', ' ', 't', 'e', 'x', 't', ' ', 'w', 'o', 'u', 'l', 'd', ' ', 'b', 'e', ' ', 'm', 'u', 'c', 'h', ' ', 'm', 'o', 'r', 'e', ' ', 'd', 'i', 'f', 'f', 'i', 'c', 'u', 'l', 't', '.', ' ', 'D', 'i', 'f', 'f', 'e', 'r', 'e', 'n', 't', ' ', 't', 'o', 'k', 'e', 'n', 'i', 'z', 'a', 't', 'i', 'o', 'n', ' ', 'm', 'e', 't', 'h', 'o', 'd', 's', ' ', 's', 'e', 'r', 'v', 'e', ' ', 'd', 'i', 'f', 'f', 'e', 'r', 'e', 'n', 't', ' ', 'p', 'u', 'r', 'p', 'o', 

e. Regex-Based Tokenization (a rule-based word/punctuation splitter; note that it does not produce true subword or Byte Pair Encoding tokens)

In [None]:
# Rule-based tokenizer driven by a single regular expression. Alternatives
# are tried left to right at each position:
#   \w+         a run of word characters
#   \$[\d\.]+   a dollar sign followed by digits/dots (e.g. "$3.50")
#   \S          any other single non-whitespace character (punctuation)
# The pattern is a raw string so the backslashes reach the regex engine
# unchanged; in a normal string literal "\w", "\$", "\d", "\." and "\S" are
# invalid escape sequences and trigger a warning on recent Python versions.
regex_tokens = regexp_tokenize(text, pattern=r'\w+|\$[\d\.]+|\S')
print("Regex-Based Tokenization:", regex_tokens)

Regex-Based Tokenization: ['Tokenization', 'is', 'an', 'important', 'step', 'in', 'NLP', '!', 'It', 'helps', 'process', 'text', 'efficiently', '.', 'Without', 'tokenization', ',', 'handling', 'text', 'would', 'be', 'much', 'more', 'difficult', '.', 'Different', 'tokenization', 'methods', 'serve', 'different', 'purposes', '.']


 f. Tweet Tokenization (Handling social media text better)

In [None]:
# Tweet-style sample containing a mention, a URL, an emoji and two hashtags.
text1 = "Hey @user! Check this out: https://example.com 😊 #NLP #AI"


In [None]:
# TweetTokenizer with default settings; per the recorded output the mention,
# URL, emoji and hashtags each come back as a single token.
tweet_tokenizer = TweetTokenizer()
tweet_tokens = tweet_tokenizer.tokenize(text1)
print("Tweet Tokenization:", tweet_tokens)

Tweet Tokenization: ['Hey', '@user', '!', 'Check', 'this', 'out', ':', 'https://example.com', '😊', '#NLP', '#AI']


# **Text Processing**
 Text processing refers to the manipulation, transformation, and analysis of textual data to prepare it for further use in tasks like natural language processing (NLP), machine learning, and data analysis.

a. Convert to lowercase

In [None]:
# Lowercased copy of the sample.
# NOTE(review): text_lower is not consumed by the later cells visible here;
# they start again from the original-case `text`.
text_lower = text.lower()
print("Lowercased Text:", text_lower)

Lowercased Text: tokenization is an important step in nlp! it helps process text efficiently. without tokenization, handling text would be much more difficult. different tokenization methods serve different purposes.


b. Remove punctuation

In [None]:
# Delete every character listed in string.punctuation. The third argument of
# str.maketrans names the characters to remove. Works on the original-case
# `text`, so capitalisation is preserved.
text_no_punct = text.translate(str.maketrans('', '', string.punctuation))
print("Text without Punctuation:", text_no_punct)

Text without Punctuation: Tokenization is an important step in NLP It helps process text efficiently Without tokenization handling text would be much more difficult Different tokenization methods serve different purposes


c. Remove stopwords

In [None]:
# Build the English stopword set once, then keep only the tokens whose
# lowercase form is not a stopword (the original casing is preserved).
stop_words = set(stopwords.words('english'))
filtered_words = list(filter(
    lambda token: token.lower() not in stop_words,
    word_tokenize(text_no_punct),
))
print("Text without Stopwords:", filtered_words)

Text without Stopwords: ['Tokenization', 'important', 'step', 'NLP', 'helps', 'process', 'text', 'efficiently', 'Without', 'tokenization', 'handling', 'text', 'would', 'much', 'difficult', 'Different', 'tokenization', 'methods', 'serve', 'different', 'purposes']


d. Stemming (reducing words to their root form)

In [None]:
# Map every stopword-filtered token to its Porter stem.
stemmer = PorterStemmer()
stemmed_words = list(map(stemmer.stem, filtered_words))
print("Stemmed Words:", stemmed_words)

Stemmed Words: ['token', 'import', 'step', 'nlp', 'help', 'process', 'text', 'effici', 'without', 'token', 'handl', 'text', 'would', 'much', 'difficult', 'differ', 'token', 'method', 'serv', 'differ', 'purpos']


e. Lemmatization (getting base words, better than stemming)

In [None]:
# Map every stopword-filtered token to its WordNet lemma. lemmatize() is
# called with its default arguments only.
lemmatizer = WordNetLemmatizer()
lemmatized_words = list(map(lemmatizer.lemmatize, filtered_words))
print("Lemmatized Words:", lemmatized_words)


Lemmatized Words: ['Tokenization', 'important', 'step', 'NLP', 'help', 'process', 'text', 'efficiently', 'Without', 'tokenization', 'handling', 'text', 'would', 'much', 'difficult', 'Different', 'tokenization', 'method', 'serve', 'different', 'purpose']


# Frequency Analysis
Frequency analysis is a fundamental step in Natural Language Processing (NLP) and is useful for understanding the distribution of words, characters, or n-grams in a text dataset.

In [None]:
# Same sample paragraph as defined in Step 3.
# NOTE(review): redefining it here presumably lets this section run on its own — confirm.
text = "Tokenization is an important step in NLP! It helps process text efficiently. Without tokenization, handling text would be much more difficult. Different tokenization methods serve different purposes."


In [None]:
def count_text_frequencies(text):
    """Count word and character frequencies in *text*.

    The text is lowercased and every character in ``string.punctuation``
    is removed before counting.

    Args:
        text: The input string.

    Returns:
        A ``(word_freq, char_freq)`` tuple of ``collections.Counter``
        objects. ``word_freq`` maps each whitespace-separated word to its
        number of occurrences; ``char_freq`` maps each remaining
        non-whitespace character to its number of occurrences.
    """
    # Normalise: lowercase, then strip ASCII punctuation in a single pass.
    cleaned = text.lower().translate(str.maketrans('', '', string.punctuation))

    # str.split() with no argument splits on any run of whitespace.
    words = cleaned.split()
    word_freq = Counter(words)

    # Count characters from the words themselves so that every kind of
    # whitespace (tabs, newlines, ...) is excluded, not only ' '.
    char_freq = Counter("".join(words))

    return word_freq, char_freq

# Compute both frequency tables for the sample, then print each one under
# its own heading as "<item>: <count>" lines.
word_counts, char_counts = count_text_frequencies(text)

for heading, counts in (
    ("\nWord Frequency:", word_counts),
    ("\nCharacter Frequency:", char_counts),
):
    print(heading)
    for item, freq in counts.items():
        print(f"{item}: {freq}")


Word Frequency:
tokenization: 3
is: 1
an: 1
important: 1
step: 1
in: 1
nlp: 1
it: 1
helps: 1
process: 1
text: 2
efficiently: 1
without: 1
handling: 1
would: 1
be: 1
much: 1
more: 1
difficult: 1
different: 2
methods: 1
serve: 1
purposes: 1

Character Frequency:
t: 21
o: 13
k: 3
e: 20
n: 15
i: 18
z: 3
a: 6
s: 9
m: 4
p: 7
r: 7
l: 6
h: 5
c: 4
x: 2
f: 8
y: 1
w: 2
u: 5
d: 6
g: 1
b: 1
v: 1


## Various NLP techniques that we can integrate into other projects.
This script implements various NLP techniques using nltk, spacy, and textblob. We can integrate these functions into other projects for text processing, sentiment analysis, and summarization.

1. Load the spaCy Model and Define the Helper Functions

In [None]:
# Load the "en_core_web_sm" spaCy pipeline once at module level;
# named_entity_recognition() below reuses this shared object.
nlp = spacy.load("en_core_web_sm")

def tokenize_text(text):
    """Tokenize *text* at both word and sentence level.

    Args:
        text: The input string.

    Returns:
        A ``(words, sentences)`` tuple: the output of ``word_tokenize``
        followed by the output of ``sent_tokenize``.
    """
    return word_tokenize(text), sent_tokenize(text)

def lemmatize_text(text):
    """Return the WordNet lemma of every word token in *text*.

    Args:
        text: The input string.

    Returns:
        A list with one lemma per token produced by ``word_tokenize``,
        in the original token order. ``lemmatize`` is called with its
        default arguments.
    """
    to_lemma = WordNetLemmatizer().lemmatize
    lemmas = []
    for token in word_tokenize(text):
        lemmas.append(to_lemma(token))
    return lemmas

def named_entity_recognition(text):
    """Extract named entities from *text* with the module-level spaCy pipeline.

    Args:
        text: The input string.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples, one per entry in
        the parsed document's ``ents``.
    """
    entities = []
    for entity in nlp(text).ents:
        entities.append((entity.text, entity.label_))
    return entities

def sentiment_analysis(text):
    """Return the TextBlob sentiment polarity of *text*.

    Args:
        text: The input string.

    Returns:
        The ``polarity`` component of ``TextBlob(text).sentiment``. The
        negative sample in this notebook scores ``-0.25625``.
    """
    return TextBlob(text).sentiment.polarity

def text_summarization(text, num_sentences=3):
    """Build a frequency-based extractive summary of *text*.

    Every token of the lowercased text is weighted by its count divided by
    the count of the most frequent token. A sentence's score is the sum of
    the weights of its (lowercased) tokens. The ``num_sentences``
    best-scoring sentences are joined with single spaces, highest score
    first.

    Args:
        text: The input string.
        num_sentences: Maximum number of sentences to keep (default 3).

    Returns:
        The summary string. Empty when *text* contains no sentences or
        tokens (e.g. an empty or whitespace-only string).
    """
    sentences = sent_tokenize(text)
    word_frequencies = Counter(word_tokenize(text.lower()))

    # Guard: max() on an empty Counter would raise ValueError.
    if not sentences or not word_frequencies:
        return ''

    # Normalise counts to the range (0, 1] relative to the top token.
    max_freq = max(word_frequencies.values())
    weights = {word: count / max_freq for word, count in word_frequencies.items()}

    # Keyed by sentence text, so identical sentences share a single entry.
    sentence_scores = {
        sent: sum(weights.get(word, 0) for word in word_tokenize(sent.lower()))
        for sent in sentences
    }
    summary_sentences = heapq.nlargest(num_sentences, sentence_scores, key=sentence_scores.get)
    return ' '.join(summary_sentences)

def keyword_extraction(text, num_keywords=5):
    """Return the most frequent non-stopword alphanumeric tokens of *text*.

    Args:
        text: The input string; it is lowercased before tokenizing.
        num_keywords: Maximum number of keywords to return (default 5).

    Returns:
        A list of up to ``num_keywords`` tokens ordered by descending
        frequency, as given by ``Counter.most_common``.
    """
    # Build the stopword set once: calling stopwords.words() inside the
    # filter would rebuild the list and scan it linearly for every token.
    stop_words = set(stopwords.words('english'))
    words = [
        word
        for word in word_tokenize(text.lower())
        if word.isalnum() and word not in stop_words
    ]
    return [word for word, _ in Counter(words).most_common(num_keywords)]


2. Example Usage

In [None]:
if __name__ == "__main__":
    # First demo input: a multi-sentence paragraph about AI.
    sample_text = ("In recent years, artificial intelligence has made significant advancements, "
                   "impacting various industries including healthcare, finance, and transportation. "
                   "Many companies are investing heavily in AI research to develop smarter systems. "
                   "For example, Tesla is working on self-driving cars, and OpenAI has been "
                   "developing language models like ChatGPT to enhance human-computer interactions. "
                   "Experts believe that AI will continue to evolve, bringing both opportunities and challenges.")
    # Second demo input: a single negative sentence. Adjacent string literals
    # are concatenated with no separator, so each fragment carries its own
    # trailing space (otherwise the text reads "disaster,filled").
    sample_text2 = ("The project was a complete disaster, "
                    "filled with endless delays, mismanagement, and poor execution, "
                    "leading to frustration and disappointment among everyone involved.")

a. Output for a positive statement.

In [None]:
    # Run each helper on the first sample, in order, and print its result
    # after the matching label.
    for label, helper in (
        ("Tokenization:", tokenize_text),
        ("Lemmatization:", lemmatize_text),
        ("NER:", named_entity_recognition),
        ("Sentiment:", sentiment_analysis),
        ("Summarization:", text_summarization),
        ("Keywords:", keyword_extraction),
    ):
        print(label, helper(sample_text))

Tokenization: (['In', 'recent', 'years', ',', 'artificial', 'intelligence', 'has', 'made', 'significant', 'advancements', ',', 'impacting', 'various', 'industries', 'including', 'healthcare', ',', 'finance', ',', 'and', 'transportation', '.', 'Many', 'companies', 'are', 'investing', 'heavily', 'in', 'AI', 'research', 'to', 'develop', 'smarter', 'systems', '.', 'For', 'example', ',', 'Tesla', 'is', 'working', 'on', 'self-driving', 'cars', ',', 'and', 'OpenAI', 'has', 'been', 'developing', 'language', 'models', 'like', 'ChatGPT', 'to', 'enhance', 'human-computer', 'interactions', '.', 'Experts', 'believe', 'that', 'AI', 'will', 'continue', 'to', 'evolve', ',', 'bringing', 'both', 'opportunities', 'and', 'challenges', '.'], ['In recent years, artificial intelligence has made significant advancements, impacting various industries including healthcare, finance, and transportation.', 'Many companies are investing heavily in AI research to develop smarter systems.', 'For example, Tesla is wor

b. Output for a negative statement.

In [None]:
    # Run each helper on the second sample, in order, and print its result
    # after the matching label.
    for label, helper in (
        ("Tokenization:", tokenize_text),
        ("Lemmatization:", lemmatize_text),
        ("NER:", named_entity_recognition),
        ("Sentiment:", sentiment_analysis),
        ("Summarization:", text_summarization),
        ("Keywords:", keyword_extraction),
    ):
        print(label, helper(sample_text2))

Tokenization: (['The', 'project', 'was', 'a', 'complete', 'disaster', ',', 'filled', 'with', 'endless', 'delays', ',', 'mismanagement', ',', 'and', 'poor', 'execution', ',', 'leading', 'to', 'frustration', 'and', 'disappointment', 'among', 'everyone', 'involved', '.'], ['The project was a complete disaster,filled with endless delays, mismanagement, and poor execution,leading to frustration and disappointment among everyone involved.'])
Lemmatization: ['The', 'project', 'wa', 'a', 'complete', 'disaster', ',', 'filled', 'with', 'endless', 'delay', ',', 'mismanagement', ',', 'and', 'poor', 'execution', ',', 'leading', 'to', 'frustration', 'and', 'disappointment', 'among', 'everyone', 'involved', '.']
NER: []
Sentiment: -0.25625
Summarization: The project was a complete disaster,filled with endless delays, mismanagement, and poor execution,leading to frustration and disappointment among everyone involved.
Keywords: ['project', 'complete', 'disaster', 'filled', 'endless']
