In [5]:
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import nltk

# Fetch the tokenizer model and the stop-word list used below.
# 'punkt_tab' is requested alongside 'punkt' because NLTK 3.9+ makes
# word_tokenize() load the 'punkt_tab' resource; with only 'punkt' present the
# tokenizer raises LookupError on a fresh kernel running a recent NLTK.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

text = """AI is changing industries. Machine learning and NLP are powerful. Python is great for AI."""

# Normalise: lower-case, then drop every character that is neither a word
# character nor whitespace (i.e. strip punctuation).
cleaned = re.sub(r'[^\w\s]', '', text.lower())

# Two tokenisations of the same cleaned text: a plain whitespace split and
# NLTK's word tokenizer. Only the NLTK tokens are used further down in this cell.
tokens_split = cleaned.split()
tokens_nltk = word_tokenize(cleaned)

# Remove English stop words; a set gives O(1) membership checks.
stop_words = set(stopwords.words('english'))
filtered = [word for word in tokens_nltk if word not in stop_words]

# Count how often each remaining token occurs.
freq = Counter(filtered)
print("Word Frequencies:", freq)



Word Frequencies: Counter({'ai': 2, 'changing': 1, 'industries': 1, 'machine': 1, 'learning': 1, 'nlp': 1, 'powerful': 1, 'python': 1, 'great': 1})


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/anshikaahuja/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/anshikaahuja/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Pull the purely alphabetic tokens out of the cleaned text and drop stop words.
words = re.findall(r'\b[a-zA-Z]+\b', cleaned)
filtered_words = list(filter(lambda token: token not in stop_words, words))

# Stemming: reduce each token with the Porter stemmer.
stemmer = PorterStemmer()
stemmed = list(map(stemmer.stem, filtered_words))
print("Stemmed Words:", stemmed)

# Lemmatization: the WordNet corpus must be available before the first lookup.
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
lemmatized = list(map(lemmatizer.lemmatize, filtered_words))
print("Lemmatized Words:", lemmatized)


Stemmed Words: ['ai', 'chang', 'industri', 'machin', 'learn', 'nlp', 'power', 'python', 'great', 'ai']
Lemmatized Words: ['ai', 'changing', 'industry', 'machine', 'learning', 'nlp', 'powerful', 'python', 'great', 'ai']


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/anshikaahuja/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [9]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

texts = [
    "AI is amazing and powerful",
    "Machine learning and NLP are useful",
    "Python is great for machine learning"
]

# Fit both vectorizers on the same three documents.
cv = CountVectorizer()
bow = cv.fit_transform(texts)

tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(texts)

# Report the vocabulary and the dense document-term matrix for each
# representation, bag-of-words first, then TF-IDF.
for label, fitted, matrix in (("BoW", cv, bow), ("TF-IDF", tfidf, tfidf_matrix)):
    print(f"{label} Feature Names:", fitted.get_feature_names_out())
    print(f"{label} Matrix:\n", matrix.toarray())


BoW Feature Names: ['ai' 'amazing' 'and' 'are' 'for' 'great' 'is' 'learning' 'machine' 'nlp'
 'powerful' 'python' 'useful']
BoW Matrix:
 [[1 1 1 0 0 0 1 0 0 0 1 0 0]
 [0 0 1 1 0 0 0 1 1 1 0 0 1]
 [0 0 0 0 1 1 1 1 1 0 0 1 0]]
TF-IDF Feature Names: ['ai' 'amazing' 'and' 'are' 'for' 'great' 'is' 'learning' 'machine' 'nlp'
 'powerful' 'python' 'useful']
TF-IDF Matrix:
 [[0.49047908 0.49047908 0.37302199 0.         0.         0.
  0.37302199 0.         0.         0.         0.49047908 0.
  0.        ]
 [0.         0.         0.34949812 0.45954803 0.         0.
  0.         0.34949812 0.34949812 0.45954803 0.         0.
  0.45954803]
 [0.         0.         0.         0.         0.45954803 0.45954803
  0.34949812 0.34949812 0.34949812 0.         0.         0.45954803
  0.        ]]


In [11]:
from sklearn.metrics.pairwise import cosine_similarity

text1 = "machine learning is fun"
text2 = "learning about machine intelligence"

# Jaccard similarity: shared whitespace-separated tokens over all distinct tokens.
set1, set2 = set(text1.split()), set(text2.split())
shared_tokens = set1.intersection(set2)
all_tokens = set1.union(set2)
jaccard = len(shared_tokens) / len(all_tokens)
print("Jaccard Similarity:", jaccard)

# Cosine similarity between the TF-IDF vectors of the two sentences.
vectorizer = TfidfVectorizer()
vecs = vectorizer.fit_transform([text1, text2])
first_row, second_row = vecs[0:1], vecs[1:2]
cos_sim = cosine_similarity(first_row, second_row)
print("Cosine Similarity:", cos_sim[0][0])


Jaccard Similarity: 0.3333333333333333
Cosine Similarity: 0.3360969272762575


In [17]:
# Use the %pip magic instead of !pip so the package is installed into the
# environment of the running kernel. Pinned to the release this notebook was
# executed with. Note: textblob requires nltk>=3.9, so this may replace an older
# NLTK; restart the kernel afterwards if nltk was already imported.
%pip install textblob==0.19.0


Collecting textblob
  Downloading textblob-0.19.0-py3-none-any.whl.metadata (4.4 kB)
Collecting nltk>=3.9 (from textblob)
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Downloading textblob-0.19.0-py3-none-any.whl (624 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m624.3/624.3 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nltk, textblob
  Attempting uninstall: nltk
    Found existing installation: nltk 3.8.1
    Uninstalling nltk-3.8.1:
      Successfully uninstalled nltk-3.8.1
Successfully installed nltk-3.9.1 textblob-0.19.0


In [19]:
import nltk
from textblob import TextBlob

# Fetch the corpora in the same order as before: Brown first, then Punkt.
for corpus_name in ('brown', 'punkt'):
    nltk.download(corpus_name)

# Score a single review; the sentiment result exposes polarity and subjectivity.
review = "The service was excellent and the staff was friendly."
blob = TextBlob(review)
sentiment = blob.sentiment
print("Polarity:", sentiment.polarity)
print("Subjectivity:", sentiment.subjectivity)


[nltk_data] Downloading package brown to
[nltk_data]     /Users/anshikaahuja/nltk_data...


Polarity: 0.6875
Subjectivity: 0.75


[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/anshikaahuja/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [29]:
# Use the %pip magic instead of !pip so the package is installed into the
# environment of the running kernel rather than whichever pip is first on PATH.
%pip install keras




In [30]:
# Use the %pip magic instead of !pip so the package is installed into the
# environment of the running kernel rather than whichever pip is first on PATH.
%pip install tensorflow




In [49]:

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
import numpy as np
import tensorflow as tf


# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation and
# training are repeatable across "Restart & Run All".
SEED = 42
tf.keras.utils.set_random_seed(SEED)

text = "Machine learning is fun and exciting to learn"

# Build the vocabulary from the single training sentence.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
word_index = tokenizer.word_index
# +1 because the Tokenizer never assigns index 0 to a word.
total_words = len(word_index) + 1

# Create every prefix of the sentence with at least two words; each prefix
# becomes one training example (all tokens but the last -> the last token).
words = text.split()
sequences = []
for i in range(1, len(words)):
    seq = words[:i+1]
    tokenized_seq = tokenizer.texts_to_sequences([' '.join(seq)])[0]
    sequences.append(tokenized_seq)

# Pad all prefixes to a common length, then split into inputs and next-word targets.
padded = pad_sequences(sequences)
X, y = padded[:, :-1], padded[:, -1]

# Embedding -> LSTM -> Dense classifier over the vocabulary.
# `input_length` is intentionally not passed to Embedding: it is deprecated in
# Keras 3 and the input shape is inferred from the data on the first call to fit().
model = Sequential()
model.add(Embedding(input_dim=total_words, output_dim=10))
model.add(LSTM(50))
model.add(Dense(50, activation='relu'))
model.add(Dense(total_words, activation='softmax'))

# Targets are integer word ids, hence the sparse categorical loss.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

model.fit(X, y, epochs=200, verbose=0)

# Greedy generation: repeatedly append the most probable next word.
seed_text = "Machine learning"
next_words = 3

for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=X.shape[1])
    probabilities = model.predict(token_list, verbose=0)
    # Take the scalar id of the single batch row instead of comparing an array to ints.
    predicted = int(np.argmax(probabilities, axis=-1)[0])

    # Direct id -> word lookup replaces the linear scan over word_index.
    next_word = tokenizer.index_word.get(predicted)
    if next_word is None:
        # The predicted id has no word (e.g. the padding id 0); the seed would
        # not change, so further iterations would repeat the same prediction.
        break
    seed_text += ' ' + next_word

print("Generated Text:", seed_text)


Generated Text: Machine learning is fun and
