<a href="https://colab.research.google.com/github/Ridhi004/UCS420/blob/main/CC_Assignment10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#q1
import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import Counter

# Fetch the tokenizer models and the stop-word list used below.
# 'punkt_tab' is the resource name recent NLTK releases load the Punkt models
# from; 'punkt' is kept so older releases keep working.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Original paragraph
text = """Technology is evolving rapidly and reshaping every aspect of human life. From artificial intelligence to robotics, modern innovations are streamlining tasks and improving efficiency. Smartphones and cloud computing have changed the way people communicate and work. As we move forward, ethical use of technology becomes increasingly important. Education, healthcare, and transportation are just a few fields being transformed by digital solutions. The pace of change demands that we adapt and learn continuously."""

# Lowercase and remove punctuation (anything that is not a word character or whitespace)
text_cleaned = re.sub(r'[^\w\s]', '', text.lower())

# Tokenize into sentences and words.
# Sentence splitting must see the raw text: text_cleaned has had its
# sentence-ending punctuation removed, so it would come back as one sentence.
sentences = sent_tokenize(text)
words = word_tokenize(text_cleaned)

# Split and compare: plain whitespace split vs. NLTK's word tokenizer
split_words = text_cleaned.split()
print("Using split():", split_words[:10])
print("Using word_tokenize():", words[:10])

# Remove stopwords (set membership keeps each lookup O(1))
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in words if word not in stop_words]

# Word frequency distribution over the remaining content words
word_freq = Counter(filtered_words)
print("Word Frequency (Excluding Stopwords):", word_freq.most_common(10))


In [None]:
#2
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet

# WordNet data (plus the Open Multilingual WordNet) is downloaded for the lemmatizer.
nltk.download('wordnet')
nltk.download('omw-1.4')

# Keep only purely alphabetic words from the cleaned text of the previous cell.
alpha_pattern = re.compile(r'\b[a-zA-Z]+\b')
alpha_words = alpha_pattern.findall(text_cleaned)

# Drop stopwords from the alphabetic words.
alpha_filtered = []
for candidate in alpha_words:
    if candidate in stop_words:
        continue
    alpha_filtered.append(candidate)

# Stemming (Porter algorithm).
stemmer = PorterStemmer()
stemmed_words = list(map(stemmer.stem, alpha_filtered))

# Lemmatization; called with the word only, so the lemmatizer's default
# part of speech applies.
lemmatizer = WordNetLemmatizer()
lemmatized_words = list(map(lemmatizer.lemmatize, alpha_filtered))

# Show the first ten entries of each list side by side.
for label, sample in (
    ("Original (filtered):", alpha_filtered),
    ("Stemmed:", stemmed_words),
    ("Lemmatized:", lemmatized_words),
):
    print(label, sample[:10])


In [None]:
#3

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np

# Three short product reviews used as the corpus for both vectorizers.
texts = [
    "The camera quality of this phone is excellent and the battery lasts long.",
    "I love the design of the laptop, but the performance could be better.",
    "The sound quality of these headphones is amazing and very clear."
]

# Bag of Words: raw term counts with English stop words removed.
cv = CountVectorizer(stop_words='english')
bow = cv.fit_transform(texts)

# TF-IDF weights over the same corpus.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(texts)

# For each text, report the three vocabulary terms with the highest TF-IDF weight.
feature_names = np.array(tfidf.get_feature_names_out())
dense_scores = tfidf_matrix.toarray()
for doc_number, row in enumerate(dense_scores, start=1):
    # argsort is ascending, so reverse it before taking the first three indices.
    top3 = row.argsort()[::-1][:3]
    print(f"Top 3 keywords for text {doc_number}:", feature_names[top3])


In [None]:
#q4

# cosine_similarity is not imported anywhere else in the notebook.
from sklearn.metrics.pairwise import cosine_similarity

text1 = "Artificial Intelligence enables machines to learn from data and make decisions."
text2 = "Blockchain is a decentralized technology for secure and transparent transactions."


def _content_tokens(text):
    """Return the set of lowercase tokens of *text* that are real words.

    Stopwords are removed, and so are tokens made only of punctuation:
    word_tokenize emits '.' as its own token, which would otherwise count
    as a word shared by both texts and inflate the Jaccard score.
    """
    return {
        token
        for token in word_tokenize(text.lower())
        if token not in stop_words and any(ch.isalnum() for ch in token)
    }


# Preprocessing
tokens1 = _content_tokens(text1)
tokens2 = _content_tokens(text2)

# Jaccard Similarity: |A ∩ B| / |A ∪ B|, defined as 0.0 when both sets are empty
intersection = tokens1 & tokens2
union = tokens1 | tokens2
jaccard_similarity = len(intersection) / len(union) if union else 0.0

# Cosine Similarity between the TF-IDF vectors of the two texts
tfidf_sim = TfidfVectorizer(stop_words='english')
tfidf_mat = tfidf_sim.fit_transform([text1, text2])
cos_sim = cosine_similarity(tfidf_mat[0], tfidf_mat[1])[0][0]

print("Jaccard Similarity:", jaccard_similarity)
print("Cosine Similarity:", cos_sim)


In [None]:
#q5

from textblob import TextBlob
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Sample reviews
reviews = [
    "This product exceeded my expectations. The build quality is superb!",
    "Terrible customer service. I'm very disappointed.",
    "It's okay. Does the job, but nothing exceptional."
]

# Analyze sentiment. Polarity above 0.1 counts as positive, below -0.1 as
# negative, anything in between as neutral. Positive reviews are collected
# here so the polarity does not have to be recomputed for the word cloud.
positive_texts = []
for review in reviews:
    blob = TextBlob(review)
    polarity = blob.sentiment.polarity
    if polarity > 0.1:
        sentiment = 'Positive'
        positive_texts.append(review)
    elif polarity < -0.1:
        sentiment = 'Negative'
    else:
        sentiment = 'Neutral'
    print(f"Review: {review}")
    print(f"Polarity: {polarity}, Subjectivity: {blob.sentiment.subjectivity}")
    print(f"Sentiment: {sentiment}\n")

# Word cloud for positive reviews
positive_reviews = " ".join(positive_texts)
if positive_reviews.strip():
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(positive_reviews)

    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title("Word Cloud for Positive Reviews")
    plt.show()
else:
    # WordCloud.generate raises ValueError when it is given no words to plot.
    print("No positive reviews found; skipping word cloud.")


In [None]:
#q6
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
import numpy as np

# The cell's imports never bind the name `keras`, so `keras.utils.to_categorical`
# would raise NameError; import the function directly instead.
from keras.utils import to_categorical

# Sample training paragraph
training_text = """Artificial intelligence is transforming the world. It is being used in healthcare, finance, transportation, and education. AI can analyze vast amounts of data quickly. It helps in making smarter decisions. The future with AI looks promising and full of opportunities."""

# Tokenize and prepare sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts([training_text])
total_words = len(tokenizer.word_index) + 1  # +1 because index 0 is reserved for padding

# Create input sequences: every prefix of the token stream with at least two
# tokens (context + the word to predict). The text is tokenized once and
# sliced, rather than re-tokenizing a growing prefix on every iteration.
token_ids = tokenizer.texts_to_sequences([training_text])[0]
input_sequences = [token_ids[:end] for end in range(2, len(token_ids) + 1)]

# Pad sequences on the left so the target word is always the last column
input_sequences = pad_sequences(input_sequences, padding='pre')
xs, labels = input_sequences[:, :-1], input_sequences[:, -1]
ys = to_categorical(labels, num_classes=total_words)

# Build LSTM model (the sequence length is inferred from the data at fit time)
model = Sequential([
    Embedding(total_words, 10),
    LSTM(50),
    Dense(total_words, activation='softmax')
])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(xs, ys, epochs=200, verbose=0)

# Text generation: repeatedly predict the most likely next word and append it
seed_text = "AI is"
next_words = 5
for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=xs.shape[1], padding='pre')
    predicted = int(np.argmax(model.predict(token_list, verbose=0), axis=-1)[0])
    # index_word is the tokenizer's reverse (id -> word) mapping; id 0 is the
    # padding slot and has no word, so stop instead of appending an empty token.
    output_word = tokenizer.index_word.get(predicted, "")
    if not output_word:
        break
    seed_text += " " + output_word
print("Generated Text:", seed_text)
