In [None]:
# Practical:-
# Perform tokenization (Whitespace, Punctuation-based, Treebank, Tweet, MWE) using NLTK library.
# Use porter stemmer and snowball stemmer for stemming. Use any technique for lemmatization.

import nltk
from nltk.tokenize import WhitespaceTokenizer, word_tokenize, TreebankWordTokenizer, TweetTokenizer
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
import string

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Sample text for tokenization
text = "NLTK is a powerful Python library for natural language processing."

# Tokenization using Whitespace Tokenizer
whitespace_tokens = WhitespaceTokenizer().tokenize(text)
print("Whitespace Tokenization:", whitespace_tokens)

# Tokenization using Punctuation-based Tokenizer
punct_tokens = word_tokenize(text)
print("Punctuation-based Tokenization:", punct_tokens)

# Tokenization using Treebank Tokenizer
treebank_tokens = TreebankWordTokenizer().tokenize(text)
print("Treebank Tokenization:", treebank_tokens)

# Tokenization using Tweet Tokenizer
tweet_tokens = TweetTokenizer().tokenize(text)
print("Tweet Tokenization:", tweet_tokens)

# Multi-Word Expression (MWE)
mwe = ["natural language processing", "Python library"]
mwe_tokenizer = nltk.tokenize.MWETokenizer(mwe)
mwe_tokens = mwe_tokenizer.tokenize(text.split())
print("MWE Tokenization:", mwe_tokens)

# Stemming using Porter Stemmer
porter_stemmer = PorterStemmer()
porter_stems = [porter_stemmer.stem(token) for token in punct_tokens]
print("Porter Stemming:", porter_stems)

# Stemming using Snowball Stemmer
snowball_stemmer = SnowballStemmer(language='english')
snowball_stems = [snowball_stemmer.stem(token) for token in punct_tokens]
print("Snowball Stemming:", snowball_stems)

# Lemmatization using WordNet Lemmatizer
lemmatizer = WordNetLemmatizer()
lemmas = [lemmatizer.lemmatize(token) for token in punct_tokens]
print("Lemmatization:", lemmas)


Whitespace Tokenization: ['NLTK', 'is', 'a', 'powerful', 'Python', 'library', 'for', 'natural', 'language', 'processing.']
Punctuation-based Tokenization: ['NLTK', 'is', 'a', 'powerful', 'Python', 'library', 'for', 'natural', 'language', 'processing', '.']
Treebank Tokenization: ['NLTK', 'is', 'a', 'powerful', 'Python', 'library', 'for', 'natural', 'language', 'processing', '.']
Tweet Tokenization: ['NLTK', 'is', 'a', 'powerful', 'Python', 'library', 'for', 'natural', 'language', 'processing', '.']
MWE Tokenization: ['NLTK', 'is', 'a', 'powerful', 'Python', 'library', 'for', 'natural', 'language', 'processing.']
Porter Stemming: ['nltk', 'is', 'a', 'power', 'python', 'librari', 'for', 'natur', 'languag', 'process', '.']
Snowball Stemming: ['nltk', 'is', 'a', 'power', 'python', 'librari', 'for', 'natur', 'languag', 'process', '.']
Lemmatization: ['NLTK', 'is', 'a', 'powerful', 'Python', 'library', 'for', 'natural', 'language', 'processing', '.']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

# Load the data
spam_data = pd.read_csv('/content/spam.csv')
spam_data.head()

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(spam_data['v2'], spam_data['v1'], test_size=0.2,random_state=42)

X_train

# Vectorize the text data
vectorizer = CountVectorizer()
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)


# Train the model
model = MultinomialNB()
model.fit(X_train_vect, y_train)


# Use the model to classify new messages
new_messages = ['you won Free money!!!', 'Hey, how are you doing today?']
new_messages_vect = vectorizer.transform(new_messages)
predictions = model.predict(new_messages_vect)
print('Predictions:', predictions)

# Evaluate the model on the test data
accuracy = model.score(X_test_vect, y_test)
print('Accuracy:', accuracy)

Predictions: ['spam' 'ham']
Accuracy: 0.9838565022421525


In [None]:
!pip install textblob


from textblob import TextBlob

def sentiment(polarity):
    if blob.sentiment.polarity < 0:
        print("Negative")
    elif blob.sentiment.polarity > 0:
        print("Positive")
    else:
        print("Neutral")

blob = TextBlob("The movie was excellent!")
print(blob.sentiment)
sentiment(blob.sentiment.polarity)

blob = TextBlob("The movie was not bad.")
print(blob.sentiment)
sentiment(blob.sentiment.polarity)

blob = TextBlob("The movie was ridiculous.")
print(blob.sentiment)
sentiment(blob.sentiment.polarity)


Sentiment(polarity=1.0, subjectivity=1.0)
Positive
Sentiment(polarity=0.3499999999999999, subjectivity=0.6666666666666666)
Positive
Sentiment(polarity=-0.3333333333333333, subjectivity=1.0)
Negative


In [None]:
# Practical:- Perform bag-of-words approach (count occurrence, normalized count occurrence), TF-IDF on data. Create embeddings using Word2Vec.

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
import string

# Sample data
data = [
    "Natural language processing is a field of study.",
    "It involves the interaction between computers and humans.",
    "NLP techniques are used for text analysis and understanding.",
    "Word embeddings are a key component of NLP models."
]

# Tokenization and preprocessing
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [token for token in tokens if token not in stop_words and token not in string.punctuation]
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return tokens

# Bag-of-Words (Count occurrence)
count_vectorizer = CountVectorizer(tokenizer=preprocess_text)
bow_matrix = count_vectorizer.fit_transform(data)

print("Bag-of-Words (Count occurrence):")
print(bow_matrix.toarray())

# Bag-of-Words (Normalized count occurrence)
norm_count_vectorizer = CountVectorizer(tokenizer=preprocess_text, binary=False)
norm_bow_matrix = norm_count_vectorizer.fit_transform(data)

print("\nBag-of-Words (Normalized count occurrence):")
print(norm_bow_matrix.toarray())


# TF-IDF
tfidf_vectorizer = TfidfVectorizer(tokenizer=preprocess_text)
tfidf_matrix = tfidf_vectorizer.fit_transform(data)

print("\nTF-IDF:")
print(tfidf_matrix.toarray())


# Word2Vec Embeddings
tokenized_data = [preprocess_text(text) for text in data]
word2vec_model = Word2Vec(tokenized_data, vector_size=100, window=5, min_count=1, sg=1)

print("\nWord2Vec Embeddings:")
for word in word2vec_model.wv.key_to_index:
    print(word, ": ", word2vec_model.wv[word])




Bag-of-Words (Count occurrence):
[[0 0 0 0 1 0 0 0 0 1 0 1 0 1 1 0 0 0 0 0]
 [0 0 1 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 1 1 0]
 [0 1 0 1 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 1]]

Bag-of-Words (Normalized count occurrence):
[[0 0 0 0 1 0 0 0 0 1 0 1 0 1 1 0 0 0 0 0]
 [0 0 1 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 1 1 0]
 [0 1 0 1 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 1]]

TF-IDF:
[[0.         0.         0.         0.         0.4472136  0.
  0.         0.         0.         0.4472136  0.         0.4472136
  0.         0.4472136  0.4472136  0.         0.         0.
  0.         0.        ]
 [0.         0.         0.5        0.         0.         0.5
  0.5        0.5        0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.        ]
 [0.42176478 0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.3325242  0.         0.   

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Practical:- Perform text cleaning, perform lemmatization (any method), remove stop words (any method), label encoding. Create representations using TF-IDF. Save outputs.

import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import string

# Sample data
data = {
    'text': ["Natural language processing is a field of study.",
             "It involves the interaction between computers and humans.",
             "NLP techniques are used for text analysis and understanding.",
             "Word embeddings are a key component of NLP models."],
    'label': ['NLP', 'NLP', 'NLP', 'NLP']
}

df = pd.DataFrame(data)

df

# Text cleaning, lemmatization, and stop words removal
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [token for token in tokens if token not in stop_words and token not in string.punctuation]
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(tokens)

df['cleaned_text'] = df['text'].apply(preprocess_text)

df

# Label Encoding
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df['label'])

df

# TF-IDF
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['cleaned_text'])

df


# Save outputs
df.to_csv('cleaned_data.csv', index=False)
pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out()).to_csv('tfidf_representation.csv', index=False)

print("Outputs saved successfully.")




  (0, 14)	0.4472135954999579
  (0, 4)	0.4472135954999579
  (0, 13)	0.4472135954999579
  (0, 9)	0.4472135954999579
  (0, 11)	0.4472135954999579
  (1, 5)	0.5
  (1, 2)	0.5
  (1, 6)	0.5
  (1, 7)	0.5
  (2, 17)	0.4217647821447532
  (2, 0)	0.4217647821447532
  (2, 16)	0.4217647821447532
  (2, 18)	0.4217647821447532
  (2, 15)	0.4217647821447532
  (2, 12)	0.3325241986862672
  (3, 10)	0.4217647821447532
  (3, 1)	0.4217647821447532
  (3, 8)	0.4217647821447532
  (3, 3)	0.4217647821447532
  (3, 19)	0.4217647821447532
  (3, 12)	0.3325241986862672
Outputs saved successfully.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Practical:- Morphology is the study of the way words are built up from smaller meaning bearing units. Study and understand the concepts of morphology by the use of add delete table

# Define a sample set of words
words = {
    "play": ["playing", "played"],
    "happy": ["happiness", "unhappy"],
    "connect": ["connection", "disconnect"]
}

# Function to generate add-delete table
def generate_add_delete_table(words):
    add_delete_table = {}

    # Iterate through each word in the dictionary
    for root_word, variations in words.items():
        add_delete_table[root_word] = {}

        # Iterate through each variation of the word
        for variation in variations:
            add_delete_table[root_word][variation] = {}

            # Add morphemes
            for add_morpheme in set(variation.split(root_word)):
                add_delete_table[root_word][variation][add_morpheme] = root_word + add_morpheme

            # Delete morphemes
            for i in range(len(root_word)):
                delete_morpheme = root_word[i]
                new_variation = variation.replace(delete_morpheme, "", 1)  # Delete one occurrence of the morpheme
                add_delete_table[root_word][variation][delete_morpheme] = new_variation

    return add_delete_table

# Generate and print the add-delete table
add_delete_table = generate_add_delete_table(words)

# Print the table
for root_word, variations in add_delete_table.items():
    print(f"Word: {root_word}")
    for variation, morphemes in variations.items():
        print(f"  Variation: {variation}")
        for operation, result in morphemes.items():
            print(f"    {operation}: {result}")



Word: play
  Variation: playing
    : play
    ing: playing
    p: laying
    l: paying
    a: plying
    y: plaing
  Variation: played
    : play
    ed: played
    p: layed
    l: payed
    a: plyed
    y: plaed
Word: happy
  Variation: happiness
    happiness: happyhappiness
    h: appiness
    a: hppiness
    p: hapiness
    y: happiness
  Variation: unhappy
    un: happyun
    : happy
    h: unappy
    a: unhppy
    p: unhapy
    y: unhapp
Word: connect
  Variation: connection
    : connect
    ion: connection
    c: onnection
    o: cnnection
    n: conection
    e: connction
    t: connecion
  Variation: disconnect
    : connect
    dis: connectdis
    c: disonnect
    o: discnnect
    n: disconect
    e: disconnct
    t: disconnec
