<a href="https://colab.research.google.com/github/Vishwas02mehta/Vishwas-UCS420-/blob/main/assignment10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
# Install necessary packages
!pip install nltk scikit-learn wordcloud textblob keras tensorflow > /dev/null

# Download required NLTK data
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Q1
paragraph = """Technology is evolving at an incredible pace, impacting every aspect of our lives.
From artificial intelligence to smart devices, innovations are improving healthcare, education, and communication.
One of my favorite areas is AI, which powers voice assistants, self-driving cars, and predictive analytics.
Books and blogs on deep learning and data science fascinate me.
Learning about new frameworks and tools keeps me excited and engaged."""

import re
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from collections import Counter

clean_text = re.sub(r'[^\w\s]', '', paragraph.lower())
words_tokenized = word_tokenize(clean_text)
sentences_tokenized = sent_tokenize(paragraph)
split_words = clean_text.split()
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in words_tokenized if word not in stop_words]
word_freq = Counter(filtered_words)

print("Original Text:\n", paragraph)
print("\nLowercase & Punctuation Removed:\n", clean_text)
print("\nWord Tokenized:\n", words_tokenized)
print("\nSentence Tokenized:\n", sentences_tokenized)
print("\nSplit Words (Python):\n", split_words)
print("\nFiltered Words (No Stopwords):\n", filtered_words)
print("\nWord Frequency (No Stopwords):\n", word_freq)

# Q2
alpha_words = re.findall(r'\b[a-zA-Z]+\b', clean_text)
alpha_filtered = [word for word in alpha_words if word not in stop_words]

from nltk.stem import PorterStemmer, WordNetLemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

stemmed = [stemmer.stem(w) for w in alpha_filtered]
lemmatized = [lemmatizer.lemmatize(w) for w in alpha_filtered]

print("\nAlphabet-only Words:\n", alpha_words)
print("\nStemmed Words:\n", stemmed)
print("\nLemmatized Words:\n", lemmatized)
print("\nStemming is faster but less accurate. Lemmatization gives actual root words, better for understanding.")

# Q3
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

texts = [
    "The phone battery life is amazing and lasts all day.",
    "The laptop crashed twice during my work.",
    "Great sound quality and comfortable headphones."
]

cv = CountVectorizer()
cv_matrix = cv.fit_transform(texts)

tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(texts)
feature_names = tfidf.get_feature_names_out()

import numpy as np

print("\nTop 3 Keywords Per Text using TF-IDF:")
for i, row in enumerate(tfidf_matrix):
    print(f"\nText {i+1}:")
    tfidf_scores = zip(feature_names, row.toarray()[0])
    sorted_scores = sorted(tfidf_scores, key=lambda x: x[1], reverse=True)[:3]
    for word, score in sorted_scores:
        print(f"{word}: {score:.4f}")

# Q4
tech1 = """Artificial Intelligence is transforming the world. It powers assistants, robots, and analytics systems."""
tech2 = """Blockchain ensures secure and transparent transactions. It is popular in finance and digital identity."""

def preprocess(text):
    return [word for word in word_tokenize(re.sub(r'[^\w\s]', '', text.lower())) if word not in stop_words]

tokens1 = preprocess(tech1)
tokens2 = preprocess(tech2)

set1, set2 = set(tokens1), set(tokens2)
jaccard = len(set1 & set2) / len(set1 | set2)
print("\nJaccard Similarity:", jaccard)

from sklearn.metrics.pairwise import cosine_similarity
vec = TfidfVectorizer().fit_transform([tech1, tech2])
cos_sim = cosine_similarity(vec[0:1], vec[1:2])[0][0]
print("Cosine Similarity:", cos_sim)
print("Cosine similarity gives more insight with weighted context than Jaccard’s pure set match.")

# Q5
from textblob import TextBlob
from wordcloud import WordCloud
import matplotlib.pyplot as plt

review = "The product exceeded my expectations. Very smooth, fast, and user-friendly."

blob = TextBlob(review)
polarity = blob.sentiment.polarity
subjectivity = blob.sentiment.subjectivity

sentiment = "Positive" if polarity > 0 else "Negative" if polarity < 0 else "Neutral"

print("\nReview:", review)
print("Polarity:", polarity)
print("Subjectivity:", subjectivity)
print("Sentiment:", sentiment)

if sentiment == "Positive":
    wordcloud = WordCloud(background_color='white').generate(review)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()

# Q6
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
import numpy as np

train_text = """Machine learning is a field of computer science that gives computers the ability to learn without being explicitly programmed. It is a subset of artificial intelligence. ML is used in various applications like recommendation systems, fraud detection, and self-driving cars."""

tokenizer = Tokenizer()
tokenizer.fit_on_texts([train_text])
total_words = len(tokenizer.word_index) + 1

input_sequences = []
words = train_text.lower().split()
for i in range(1, len(words)):
    ngram_seq = words[:i+1]
    token_list = tokenizer.texts_to_sequences([' '.join(ngram_seq)])[0]
    input_sequences.append(token_list)

max_len = max([len(x) for x in input_sequences])
input_sequences = pad_sequences(input_sequences, maxlen=max_len, padding='pre')
input_sequences = np.array(input_sequences)
X, y = input_sequences[:,:-1], input_sequences[:,-1]
y = np.array(y)

model = Sequential()
model.add(Embedding(total_words, 10, input_length=max_len-1))
model.add(LSTM(50))
model.add(Dense(total_words, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')
model.fit(X, y, epochs=200, verbose=0)

seed_text = "machine"
for _ in range(3):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_len-1, padding='pre')
    predicted = model.predict(token_list, verbose=0)
    output_word = ""
    for word, index in tokenizer.word_index.items():
        if index == np.argmax(predicted):
            output_word = word
            break
    seed_text += " " + output_word
print("\nGenerated Text:", seed_text)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************
