# NLP Assignment - Advanced Text Processing

## Q1. Text Cleaning and Tokenization

In [None]:
import nltk
import re
import string
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from collections import Counter

# Fetch the tokenizer models and the stopword list used below.
# NLTK 3.9+ loads the Punkt sentence/word tokenizer data from the
# 'punkt_tab' package rather than the older pickled 'punkt' package, so both
# are requested.  nltk.download() only reports an error and returns False for
# a package id the installed index does not know, so the extra request is
# harmless on older releases.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

text = """Food has always fascinated me, not just as a necessity but as an art form. The variety of flavors, textures, and ingredients used across cultures is amazing. I love exploring new cuisines, especially street food which tells a story of its region. Cooking at home has become a creative outlet and a form of relaxation. Food also brings people together, whether it’s a family dinner or a festive feast. I believe food is not just fuel but a universal language."""

# Lower-case the paragraph and drop every character that is neither a word
# character nor whitespace.  NOTE: this also removes the apostrophe inside
# "it’s", so that token becomes "its".
cleaned = re.sub(r'[^\w\s]', '', text.lower())

# Two tokenizations of the cleaned text: plain whitespace split vs. NLTK.
words_split = cleaned.split()
words_token = word_tokenize(cleaned)

# Sentence splitting runs on the original text because it relies on the
# punctuation that `cleaned` no longer has.
sentences = sent_tokenize(text)

# Set membership keeps the stopword filter O(1) per token.
stop_words = set(stopwords.words('english'))
filtered_words = [w for w in words_token if w not in stop_words]

# Frequency of each remaining (non-stopword) token.
word_freq = Counter(filtered_words)

print("Original Sentences:", sentences)
print("Split() Words:", words_split)
print("word_tokenize() Words:", words_token)
print("Stopword Removed:", filtered_words)
print("Word Frequencies:", word_freq)


## Q2. Regex, Stemming & Lemmatization

In [None]:
from nltk.stem import PorterStemmer, WordNetLemmatizer

# WordNet corpora requested before the WordNetLemmatizer is used.
for corpus_id in ('wordnet', 'omw-1.4'):
    nltk.download(corpus_id)

# Keep only purely alphabetic tokens from the cleaned Q1 text, then drop
# the English stopwords collected in Q1.
words_alpha = re.findall(r'\b[a-zA-Z]+\b', cleaned)
filtered_alpha = list(filter(lambda token: token not in stop_words, words_alpha))

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Stem and lemmatize the same filtered list so the two outputs line up
# position by position.
stems = list(map(stemmer.stem, filtered_alpha))
lemmas = list(map(lemmatizer.lemmatize, filtered_alpha))

print("Alphabet Words:", words_alpha)
print("Filtered Words:", filtered_alpha)
print("Stemming:", stems)
print("Lemmatization:", lemmas)


## Q3. Bag of Words and TF-IDF with Short Texts

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np

# Three short headlines used as the corpus for both vectorizers.
texts = [
    "Apple unveils new iPhone with advanced AI features.",
    "Users praise battery life in the latest Android phones.",
    "Camera quality in smartphones continues to improve yearly."
]

# Bag-of-words term counts (computed here; not printed by this cell).
count_vec = CountVectorizer()
count_matrix = count_vec.fit_transform(texts)

# TF-IDF weights over the same corpus.
tfidf_vec = TfidfVectorizer()
tfidf_matrix = tfidf_vec.fit_transform(texts)

# Column index -> term, for labelling the top-weighted features.
feature_names = tfidf_vec.get_feature_names_out()

# Walk the dense rows directly.  The document string itself is not needed,
# and binding it to `text` (as the loop previously did) overwrote the Q1
# paragraph stored under that module-level name.
for i, row in enumerate(tfidf_matrix.toarray()):
    print(f"Top 3 TF-IDF keywords for Text {i+1}:")
    # argsort is ascending: take the last three indices, then reverse so the
    # highest weight is printed first.
    top_indices = row.argsort()[-3:][::-1]
    for idx in top_indices:
        print(f"{feature_names[idx]}: {row[idx]:.3f}")
    print()


## Q4. Similarity Between Technologies

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

tech1 = """Artificial Intelligence is revolutionizing industries by automating tasks and making decisions."""
tech2 = """Blockchain ensures secure, decentralized data storage and is changing finance and supply chains."""

# Unique lower-cased words of each description.
tokens1 = {match.group() for match in re.finditer(r'\b\w+\b', tech1.lower())}
tokens2 = {match.group() for match in re.finditer(r'\b\w+\b', tech2.lower())}

# Jaccard similarity: size of the shared vocabulary over size of the
# combined vocabulary.
jaccard = len(tokens1.intersection(tokens2)) / len(tokens1.union(tokens2))
print("Jaccard Similarity:", round(jaccard, 3))

# Fit a single TF-IDF vocabulary over both descriptions so their vectors
# share the same columns, then compare row 0 against row 1.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform([tech1, tech2])
cos_sim = cosine_similarity(tfidf_matrix[:1], tfidf_matrix[1:2])
# cosine_similarity returns a 2-D array; the single score is at [0, 0].
print("Cosine Similarity:", round(cos_sim[0, 0], 3))


## Q5. Sentiment Analysis and WordCloud

In [None]:
from textblob import TextBlob
from wordcloud import WordCloud
import matplotlib.pyplot as plt

review = "The customer service was excellent and the product quality exceeded my expectations."

# Score the review with TextBlob.
blob = TextBlob(review)
polarity = blob.sentiment.polarity
subjectivity = blob.sentiment.subjectivity

# Label by the sign of the polarity score; exactly zero is neutral.
if polarity > 0:
    sentiment = "Positive"
elif polarity < 0:
    sentiment = "Negative"
else:
    sentiment = "Neutral"

print("Polarity:", polarity)
print("Subjectivity:", subjectivity)
print("Sentiment:", sentiment)

# A word cloud is drawn only for positive reviews.
if sentiment == "Positive":
    wc = WordCloud(width=400, height=200).generate(review)
    plt.imshow(wc, interpolation='bilinear')
    plt.axis("off")  # hide the axes around the image
    plt.show()


## Q6. Text Generation with LSTM

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
import numpy as np

train_text = """Technology continues to evolve rapidly, introducing new tools and platforms that reshape the world. 
Every innovation brings unique opportunities and challenges. Developers and engineers are at the forefront of change."""

# Build the vocabulary from the training text.  One extra slot is added on
# top of the number of distinct words for the output layer size.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([train_text])
total_words = 1 + len(tokenizer.word_index)

# For every '.'-separated chunk, emit each prefix of length >= 2 so that the
# last id of a prefix is the word to predict from the ids before it.
input_sequences = []
for line in train_text.split('.'):
    tokens = tokenizer.texts_to_sequences([line])[0]
    input_sequences.extend(tokens[:end] for end in range(2, len(tokens) + 1))

# Pad every prefix to the length of the longest one.
max_seq_len = max(map(len, input_sequences))
input_sequences = pad_sequences(input_sequences, maxlen=max_seq_len)

# Inputs are all columns but the last; the target is the final column.
X, y = input_sequences[:, :-1], input_sequences[:, -1]

# Next-word model: embedding -> LSTM -> softmax over the vocabulary.
model = Sequential([
    Embedding(total_words, 10, input_length=max_seq_len-1),
    LSTM(50),
    Dense(total_words, activation='softmax'),
])

# Targets are integer word ids, hence the sparse cross-entropy loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
model.fit(X, y, epochs=200, verbose=0)

# Extend the seed by up to three words, each time feeding the whole text
# generated so far back into the model.
seed_text = "Technology continues"
for _ in range(3):
    # Encode and pad the current text to the input width used for training.
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_seq_len-1)
    # Highest-probability word id for the single sequence in the batch.
    predicted = model.predict(token_list, verbose=0).argmax(axis=1)[0]

    # Look the id up in the tokenizer's own reverse map instead of scanning
    # word_index item by item on every step.
    next_word = tokenizer.index_word.get(int(predicted))
    if next_word is None:
        # The id has no vocabulary entry (e.g. the extra slot counted in
        # total_words), so nothing can be appended.  The seed would stay
        # unchanged and the following steps would feed the model the same
        # input again, so stop here.
        break
    seed_text += ' ' + next_word

print("Generated Text:", seed_text)
