In [None]:
import nltk
# Fetch the NLTK data packages that the cells below rely on.
nltk.download('punkt')  # tokenizer models used by word_tokenize / sent_tokenize
nltk.download('stopwords')  # stop-word lists read through stopwords.words(...)
nltk.download('wordnet')  # WordNet data for wordnet.synsets and WordNetLemmatizer
nltk.download('averaged_perceptron_tagger')  # model behind pos_tag
nltk.download('maxent_ne_chunker')  # model behind ne_chunk
nltk.download('words')  # word list downloaded alongside the NE chunker

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

**PreProcessing**

In [None]:

from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk import ne_chunk,pos_tag
from nltk.stem import WordNetLemmatizer
from nltk import CFG

# Tokenization: split the sample text into word and punctuation tokens.
text = "Hello! How are you doing today? I'm learning NLTK."
word_tokens = word_tokenize(text)
print("Word tokens: ",word_tokens)

# WordNet lookup: list every synset of 'good', then show the usage examples
# attached to the first synset.
synonyms = wordnet.synsets('good')
print(synonyms)
print("Examples : ", synonyms[0].examples())

# Stop-word removal: tokens are compared in lower case against the English
# stop-word list. Punctuation tokens are not stop words, so they stay in.
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in word_tokens if word.lower() not in stop_words]
print("Filtered words:",filtered_tokens)

# Stemming: run the Porter stemmer over the filtered tokens.
ps = PorterStemmer()
stemmed_words = [ps.stem(word) for word in filtered_tokens]
print("Stmemed words : ",stemmed_words)

# Lemmatization: lemmatize() is called without a part-of-speech argument.
lemmatizer = WordNetLemmatizer()
print([lemmatizer.lemmatize(word) for word in filtered_tokens])

# Named entity recognition: POS-tag the unfiltered tokens, then chunk the tags.
pos_tags = pos_tag(word_tokens)
entities=ne_chunk(pos_tags)
print("Entites :")
print(entities)

# Parse tree: a toy context-free grammar that covers the sentence parsed below.
# NOTE(review): `nltk` is not imported in this cell; the name comes from the
# download cell at the top of the notebook.
grammar = CFG.fromstring("""
S -> NP VP
NP -> Det N
VP -> V PP
PP -> P NP
Det -> 'the'
N -> 'cat' | 'mat'
V -> 'sat'
P -> 'on'
""")
parser = nltk.ChartParser(grammar)
sentence = "the cat sat on the mat".split()
# Print every parse the chart parser yields for the sentence.
for tree in parser.parse(sentence):
  tree.pretty_print()

Word tokens:  ['Hello', '!', 'How', 'are', 'you', 'doing', 'today', '?', 'I', "'m", 'learning', 'NLTK', '.']
[Synset('good.n.01'), Synset('good.n.02'), Synset('good.n.03'), Synset('commodity.n.01'), Synset('good.a.01'), Synset('full.s.06'), Synset('good.a.03'), Synset('estimable.s.02'), Synset('beneficial.s.01'), Synset('good.s.06'), Synset('good.s.07'), Synset('adept.s.01'), Synset('good.s.09'), Synset('dear.s.02'), Synset('dependable.s.04'), Synset('good.s.12'), Synset('good.s.13'), Synset('effective.s.04'), Synset('good.s.15'), Synset('good.s.16'), Synset('good.s.17'), Synset('good.s.18'), Synset('good.s.19'), Synset('good.s.20'), Synset('good.s.21'), Synset('well.r.01'), Synset('thoroughly.r.02')]
Examples :  ['for your own good', "what's the good of worrying?"]
Filtered words: ['Hello', '!', 'today', '?', "'m", 'learning', 'NLTK', '.']
Stmemed words :  ['hello', '!', 'today', '?', "'m", 'learn', 'nltk', '.']
['Hello', '!', 'today', '?', "'m", 'learning', 'NLTK', '.']
Entites :
(S


t-test


In [None]:
import math
from nltk import word_tokenize
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures

# Function to calculate the t-statistic for bigrams
def t_test_for_bigrams(w1, w2, sample_text, n):
    """Return the Student's t-score of the bigram ``w1 w2``.

    The score follows the usual collocation formulation: with ``O`` the
    observed bigram count and ``E = n1 * n2 / n`` the count expected if the two
    words were independent, ``t = (O - E) / sqrt(O)`` (the sample variance of a
    rare Bernoulli event is approximated by its mean).

    Args:
        w1: First word of the bigram.
        w2: Second word of the bigram.
        sample_text: Either the raw text (str) or an already tokenised
            sequence of words. Raw text is lower-cased and split into word and
            punctuation tokens with a simple regular expression; pass the
            token list that produced the bigrams for exact agreement.
        n: Total number of tokens in the corpus.

    Returns:
        The t-score as a float, or 0 when the bigram never occurs or ``n`` is
        not positive (the statistic is undefined in those cases).
    """
    import re  # local import: the notebook cell's import block does not provide ``re``

    # Count whole tokens, case-insensitively. Substring counting on the raw
    # text would also match "a" inside "cat" and would miss capitalised words.
    if isinstance(sample_text, str):
        tokens = re.findall(r"\w+|[^\w\s]", sample_text.lower())
    else:
        tokens = [str(token).lower() for token in sample_text]
    w1, w2 = w1.lower(), w2.lower()

    n1, n2 = tokens.count(w1), tokens.count(w2)  # word frequencies
    observed = sum(1 for left, right in zip(tokens, tokens[1:])
                   if left == w1 and right == w2)  # adjacent occurrences only
    if n <= 0 or observed == 0:
        return 0
    expected = (n1 * n2) / n  # bigram count expected under independence
    return (observed - expected) / math.sqrt(observed)

# Read the text
# NOTE(review): the path is hard-coded; the saved output of this cell is a
# FileNotFoundError, so the file must exist before running.
with open("/content/ab.txt", "r") as file:
    sample_text = file.read()

# Tokenize the text (lower-cased, so every candidate bigram is lower case)
tokens = word_tokenize(sample_text.lower())
n = len(tokens)  # Total number of tokens

# Candidate bigrams: the 10 best-scoring pairs by PMI (Pointwise Mutual Information)
finder = BigramCollocationFinder.from_words(tokens)
collocations = finder.nbest(BigramAssocMeasures.pmi, 10)

# Ask for the critical value, then test each candidate against it.
# The raw (not lower-cased) text is what gets passed to the scoring function.
cv = float(input("Enter critical value: "))
print("Collocations (with t-statistic > critical value):")
for w1, w2 in collocations:
    t_stat = t_test_for_bigrams(w1, w2, sample_text, n)
    if t_stat > cv:  # If t-statistic exceeds critical value, it's a collocation
        print(f"Collocation: {w1} {w2}")
    else:
      print("Not a collocation")

FileNotFoundError: [Errno 2] No such file or directory: '/content/ab.txt'

Chi-square test

In [None]:
import math
from nltk import word_tokenize
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures
from scipy.stats import chi2_contingency

# Function to calculate the Chi-Square statistic for bigrams
def chi_square_for_bigrams(w1, w2, sample_text, n):
    """Return Pearson's chi-square statistic for the bigram ``w1 w2``.

    A 2x2 contingency table is built from the counts (first word is / is not
    ``w1`` versus second word is / is not ``w2``) and the statistic is the sum
    of ``(O - E)^2 / E`` over all four cells, where each expected value is
    ``row_total * column_total / n``.

    Args:
        w1: First word of the bigram.
        w2: Second word of the bigram.
        sample_text: Either the raw text (str) or an already tokenised
            sequence of words. Raw text is lower-cased and split into word and
            punctuation tokens with a simple regular expression; pass the
            token list that produced the bigrams for exact agreement.
        n: Total number of tokens in the corpus.

    Returns:
        The chi-square value, or 0 when any row or column total is not
        positive (the statistic is undefined in that case).
    """
    import re  # local import: the notebook cell's import block does not provide ``re``

    # Count whole tokens, case-insensitively, instead of raw substrings.
    if isinstance(sample_text, str):
        tokens = re.findall(r"\w+|[^\w\s]", sample_text.lower())
    else:
        tokens = [str(token).lower() for token in sample_text]
    w1, w2 = w1.lower(), w2.lower()

    n1, n2 = tokens.count(w1), tokens.count(w2)  # word frequencies
    o11 = sum(1 for left, right in zip(tokens, tokens[1:])
              if left == w1 and right == w2)  # w1 followed by w2
    o12 = n1 - o11            # w1 followed by something else
    o21 = n2 - o11            # w2 preceded by something else
    o22 = n - n1 - n2 + o11   # neither position matches

    row_totals = (n1, n - n1)
    col_totals = (n2, n - n2)
    if n <= 0 or min(row_totals + col_totals) <= 0:
        return 0

    # Chi-Square formula: Chi2 = sum((O - E)^2 / E) over the four cells
    observed = ((o11, o12), (o21, o22))
    chi_square = 0.0
    for i, row_total in enumerate(row_totals):
        for j, col_total in enumerate(col_totals):
            expected = row_total * col_total / n
            chi_square += (observed[i][j] - expected) ** 2 / expected
    return chi_square

# Read the text
# NOTE(review): the path is hard-coded; the file must exist before running.
with open("/content/ab.txt", "r") as file:
    sample_text = file.read()

# Tokenize the text (lower-cased, so every candidate bigram is lower case)
tokens = word_tokenize(sample_text.lower())
n = len(tokens)  # Total number of tokens

# Candidate bigrams: the 10 best-scoring pairs by PMI (Pointwise Mutual Information)
finder = BigramCollocationFinder.from_words(tokens)
collocations = finder.nbest(BigramAssocMeasures.pmi, 10)

# Ask for the critical value, then report every candidate that exceeds it.
# The raw (not lower-cased) text is what gets passed to the scoring function.
cv = float(input("Enter critical value: "))

for w1, w2 in collocations:
    chi_square_stat = chi_square_for_bigrams(w1, w2, sample_text, n)
    if chi_square_stat > cv:  # If Chi-Square statistic exceeds critical value, it's a collocation
        print(f"Collocation: {w1} {w2}")


Enter critical value: 2
Collocation: 'd tell
Collocation: an expert
Collocation: another city
Collocation: balance between
Collocation: barista named
Collocation: bean origins
Collocation: began noticing
Collocation: between readability
Collocation: can enjoy
Collocation: care package


In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import math

# Naive Bayes word-sense disambiguation trained on interactively entered sentences.

# Built once, up front: the training loop and the test sentence both need it.
stopword = set(stopwords.words("english"))


def _content_words(sentence):
  """Return the lower-cased alphabetic tokens of `sentence` that are not stop words."""
  return [token.lower() for token in word_tokenize(sentence)
          if token.isalpha() and token.lower() not in stopword]


# Collect the training sentences for every sense.
count = int(input("Enter the number of varients senses: "))
sense_word_pair = {}  # sense -> all content words seen with that sense
sense_count = {}      # sense -> number of training sentences
for _ in range(count):
  sense = input("Sense: ")
  sense_word_pair[sense] = []
  n = int(input(f"enter nuber of sentences assigned-{sense} :"))
  sense_count[sense] = n
  for _ in range(n):
    sense_word_pair[sense].extend(_content_words(input("enter the sentence: ")))
print(sense_word_pair.items(), end="\n\n")
print(sense_count.items())

# Per-sense word frequencies.
wordpair_count = {sense: Counter(words) for sense, words in sense_word_pair.items()}
print(wordpair_count.items())

# Vocabulary size = number of distinct words over all senses (a word that
# occurs under several senses is counted once).
vocabulary = len({word for words in sense_word_pair.values() for word in words})
print(vocabulary)

# Laplace-smoothed P(word | sense) = (count + 1) / (words in sense + vocabulary).
sense_total = {sense: len(words) for sense, words in sense_word_pair.items()}
wordpair_probability = {}
for sense, words in wordpair_count.items():
  for word, frequency in words.items():
    wordpair_probability[(word, sense)] = (frequency + 1) / (sense_total[sense] + vocabulary)
print(wordpair_probability.items())

# Score the test sentence under every sense in log space.
test_sentence = input("Enter sentence to find sense of it")
removed_stopword = _content_words(test_sentence)
total_sentences = sum(sense_count.values())
test_probability = {}  # sense -> log2 score (keyed by sense so ties cannot overwrite each other)
for sense, n_sentences in sense_count.items():
  if n_sentences <= 0:
    continue  # a sense without training sentences has prior 0 and cannot win
  prob_value = math.log2(n_sentences / total_sentences)
  denominator = sense_total[sense] + vocabulary
  if denominator:
    for word in removed_stopword:
      # Counter returns 0 for unseen words, which yields the smoothed 1 / denominator.
      prob_value += math.log2((wordpair_count[sense][word] + 1) / denominator)
  test_probability[sense] = prob_value

if test_probability:
  best_sense = max(test_probability, key=test_probability.get)
  print(f"Sense of \"{test_sentence}\"is", best_sense)
else:
  print("No sense has training sentences; nothing to predict.")


Enter the number of varients senses: 2
Sense: dog
enter nuber of sentences assigned-dog :2
enter the sentence: the dog barks at stranger
enter the sentence: dog wags its tail
Sense: cat
enter nuber of sentences assigned-cat :2
enter the sentence: cat scratches the stranger face
enter the sentence: cat drinks milk
dict_items([('dog', ['dog', 'barks', 'stranger', 'dog', 'wags', 'tail']), ('cat', ['cat', 'scratches', 'stranger', 'face', 'cat', 'drinks', 'milk'])])

dict_items([('dog', 2), ('cat', 2)])
dict_items([('dog', Counter({'dog': 2, 'barks': 1, 'stranger': 1, 'wags': 1, 'tail': 1})), ('cat', Counter({'cat': 2, 'scratches': 1, 'stranger': 1, 'face': 1, 'drinks': 1, 'milk': 1}))])
11
dict_items([(('dog', 'dog'), 0.23076923076923078), (('barks', 'dog'), 0.15384615384615385), (('stranger', 'dog'), 0.15384615384615385), (('wags', 'dog'), 0.15384615384615385), (('tail', 'dog'), 0.15384615384615385), (('cat', 'cat'), 0.23076923076923078), (('scratches', 'cat'), 0.15384615384615385), (('st

Naive Bayes


In [None]:
from collections import defaultdict
import math

# Sample labeled sentences: each pair is (sentence, sense of the word "bank")
data = [
    ("He went to the bank to deposit money", "financial"),
    ("The bank loaned him money to buy a house", "financial"),
    ("She saved her money in the bank account", "financial"),
    ("He stood by the river bank and watched the water flow", "river"),
    ("They went fishing by the bank of the river", "river"),
    ("The river bank was muddy after the rain", "river"),
]

# Frequency counts for words and classes
word_counts = defaultdict(lambda: defaultdict(int))  # label -> word -> count
class_counts = defaultdict(int)                      # label -> number of sentences
for sentence, label in data:
    class_counts[label] += 1
    for word in sentence.lower().split():
        word_counts[label][word] += 1

# Classify function using Naive Bayes with Laplace smoothing
def classify(sentence):
    """Return the label with the highest Naive Bayes log-score for `sentence`.

    The sentence is lower-cased and split on whitespace. Each label is scored
    as ``log P(label) + sum(log P(word | label))`` where
    ``P(word | label) = (count + 1) / (words in label + vocabulary size)`` and
    the vocabulary is the set of distinct words over all labels.

    The function only reads ``word_counts`` and ``class_counts``; unseen words
    are looked up with ``.get`` so that scoring never inserts new keys into the
    nested defaultdicts (which would change the smoothing on later calls).
    """
    words = sentence.lower().split()
    vocabulary_size = len({word for counts in word_counts.values() for word in counts})
    total_sentences = sum(class_counts.values())
    scores = {}
    for label in class_counts:
        label_counts = word_counts.get(label, {})
        # Loop-invariant for this label, so compute it once.
        denominator = sum(label_counts.values()) + vocabulary_size
        log_prob = math.log(class_counts[label] / total_sentences)  # prior
        for word in words:
            log_prob += math.log((label_counts.get(word, 0) + 1) / denominator)
        scores[label] = log_prob
    return max(scores, key=scores.get)

# Testing the function
test_sentence = "The bank is by the river and full of fish"
print("Predicted sense:", classify(test_sentence))


Predicted sense: river


Hindle and Rooth (prepositional-phrase attachment)

In [None]:
from collections import defaultdict

def train(corpus):
  """Count the lower-cased word trigrams of every sentence in `corpus`.

  `corpus` is an iterable of token sequences. The result is a
  defaultdict(int) mapping (w1, w2, w3) tuples of consecutive lower-cased
  tokens to the number of times they occur.
  """
  trigram_counts = defaultdict(int)
  for tokens in corpus:
    # Three staggered views of the sentence yield every run of three tokens.
    for first, second, third in zip(tokens, tokens[1:], tokens[2:]):
      trigram_counts[(first.lower(), second.lower(), third.lower())] += 1
  return trigram_counts
def hindle_and_rooth(word1, preposition, word2,counts):
  """Report which word the preposition attaches to, based on seen trigrams.

  The lower-cased trigram (word1, preposition, word2) is looked up in
  `counts` first; if it is present the preposition is attached to `word1`.
  Otherwise the swapped trigram (word2, preposition, word1) is tried and, if
  present, the preposition is attached to `word2`. The returned message uses
  the arguments exactly as they were passed in; "No attachment" is returned
  when neither trigram is known.
  """
  prep = preposition.lower()
  first, second = word1.lower(), word2.lower()
  candidates = ((first, second, word1), (second, first, word2))
  for head, other, head_as_given in candidates:
    if (head, prep, other) in counts:
      return f"Preposition {preposition} attached with {head_as_given}"
  return "No attachment"
# Toy corpus: a single pre-tokenised sentence.
corpus=[['The','girl','with','the','curly','hairs','is','eating','at','home']]
# Query: does "at" attach to "eating" or to "home"?
word1="eating"
word2="home"
preposition="at"
# Count the corpus trigrams, then look the query up in those counts.
counts=train(corpus)
print(hindle_and_rooth(word1,preposition,word2,counts))

Preposition at attached with eating


HMM

In [None]:
import numpy as np

def forward_backward(observations, states, initial_probs, transition_matrix, emission_matrix):
    """Compute the forward (alpha) and backward (beta) tables of an HMM.

    Args:
        observations: Sequence of observation indices (columns of
            `emission_matrix`), one per time step.
        states: Sequence of states; only its length is used.
        initial_probs: Initial state distribution, length N.
        transition_matrix: N x N array, entry [i, j] = P(state j | state i).
        emission_matrix: N x M array, entry [j, k] = P(observation k | state j).

    Returns:
        (alpha, beta), both arrays of shape (T, N):
        alpha[t, j] = P(o_0..o_t, state_t = j) and
        beta[t, i]  = P(o_{t+1}..o_{T-1} | state_t = i).
        For every t, ``(alpha[t] * beta[t]).sum()`` is the likelihood of the
        whole observation sequence. An empty observation sequence yields two
        empty (0, N) arrays.
    """
    observations = list(observations)
    T = len(observations)
    N = len(states)
    initial_probs = np.asarray(initial_probs, dtype=float)
    transition_matrix = np.asarray(transition_matrix, dtype=float)
    emission_matrix = np.asarray(emission_matrix, dtype=float)

    alpha = np.zeros((T, N))
    beta = np.zeros((T, N))
    if T == 0:
        return alpha, beta

    # Forward algorithm. The first row must include the emission of the first
    # observation; otherwise observations[0] never influences the result.
    alpha[0] = initial_probs * emission_matrix[:, observations[0]]
    for t in range(1, T):
        # (alpha[t-1] @ A)[j] = sum_i alpha[t-1, i] * A[i, j]
        alpha[t] = (alpha[t - 1] @ transition_matrix) * emission_matrix[:, observations[t]]

    # Backward algorithm
    beta[T - 1] = 1.0
    for t in range(T - 2, -1, -1):
        # beta[t, i] = sum_j A[i, j] * B[j, o_{t+1}] * beta[t+1, j]
        beta[t] = transition_matrix @ (emission_matrix[:, observations[t + 1]] * beta[t + 1])

    return alpha, beta

# Example usage: a two-state HMM with three observation symbols.
states = ['Sunny', 'Rainy']
observations = ["walk", "clean", "shop"]
initial_probs = np.array([0.5, 0.5])
# Rows are the current state, columns the next state.
transition_matrix = np.array([[0.7, 0.3],[0.4, 0.6]])
# One row per state, one column per observation symbol.
emission_matrix = np.array([[0.1, 0.4, 0.5],
                            [0.6, 0.3, 0.1]])

# Turn the observation strings into emission-matrix column indices.
# NOTE(review): index() is looked up in the observation sequence itself, so this
# yields [0, 1, 2] here. It presumably relies on the sequence being listed in
# the same order as the emission columns; confirm, or use a separate symbol list.
obs_indices = [observations.index(obs) for obs in observations]
alpha, beta = forward_backward(obs_indices, states, initial_probs, transition_matrix, emission_matrix)

print("Forward (Alpha) probabilities:\n", alpha)
print("Backward (Beta) probabilities:\n", beta)

[0.5 0.5]
Forward (Alpha) probabilities:
 [[0.5    0.5   ]
 [0.22   0.135 ]
 [0.104  0.0147]]
Backward (Beta) probabilities:
 [[0.1298 0.1076]
 [0.38   0.26  ]
 [1.     1.    ]]


Viterbi HMM

In [None]:
import numpy as np

def viterbi(pi, transition_matrix, emission_matrix, observations):
    """Find the most likely hidden-state sequence of an HMM.

    Args:
        pi: Initial state distribution, length N.
        transition_matrix: N x N array, entry [i, j] = P(state j | state i).
        emission_matrix: N x M array, entry [j, k] = P(observation k | state j).
        observations: Sequence of observation indices (columns of
            `emission_matrix`), one per time step.

    Returns:
        (best_path_prob, best_path): the joint probability of the best state
        path together with the observations, and the path itself as a list of
        state indices (one per observation). An empty observation sequence
        yields ``(1.0, [])``.
    """
    pi = np.asarray(pi, dtype=float)
    transition_matrix = np.asarray(transition_matrix, dtype=float)
    emission_matrix = np.asarray(emission_matrix, dtype=float)
    observations = list(observations)
    T = len(observations)
    N = len(pi)
    if T == 0:
        return 1.0, []

    # Initialization. The first row must include the emission of the first
    # observation; otherwise observations[0] never influences the result.
    delta = np.zeros((T, N))
    backpointer = np.zeros((T, N), dtype=int)
    delta[0] = pi * emission_matrix[:, observations[0]]

    # Recursion
    for t in range(1, T):
        # scores[i, j]: best probability of being in i at t-1 and moving to j
        scores = delta[t - 1][:, None] * transition_matrix
        backpointer[t] = scores.argmax(axis=0)
        delta[t] = scores.max(axis=0) * emission_matrix[:, observations[t]]

    # Termination
    best_path_prob = float(delta[T - 1].max())
    state = int(delta[T - 1].argmax())

    # Backtracking: follow the pointers from the last step to the first.
    best_path = [state]
    for t in range(T - 1, 0, -1):
        state = int(backpointer[t, state])
        best_path.append(state)
    best_path.reverse()

    return best_path_prob, best_path

# Example usage: two hidden states, three observation symbols.
pi = np.array([0.6, 0.4])  # initial state distribution
transition_matrix = np.array([[0.7, 0.3], [0.4, 0.6]])  # rows: current state, columns: next state
emission_matrix = np.array([[0.1, 0.4, 0.5], [0.6, 0.3, 0.1]])  # rows: state, columns: observation symbol
observations = [2, 1, 0]  # Index-based representation of observations

# Decode the sequence and report the best path with its probability.
probability, path = viterbi(pi, transition_matrix, emission_matrix, observations)
print("Probability of the most likely sequence:", probability)
print("Most likely sequence of states:", path)

[0 1]
Probability of the most likely sequence: 0.03024
Most likely sequence of states: [0, 0, 1]


PCFG

In [None]:
from nltk import PCFG
import nltk

# Probabilistic grammar: the bracketed number after each right-hand side is
# that production's probability.
# NOTE(review): "VP -> V NP [0.25]" is listed twice in the VP rule; confirm
# whether one of the two was meant to be a different expansion.
pcfg_grammar = PCFG.fromstring("""
S -> NP VP [0.7]
S -> N VP [0.3]
PP -> P NP [1.0]
NP -> Det N [0.3] | Det N PP [0.6] | 'I' [0.1]
VP -> V NP [0.25] | VP PP [0.25] | V NP [0.25] | V PP [0.25]
Det -> 'an' [0.4] | 'my' [0.3] | 'a' [0.3]
N -> 'elephant' [0.4] | 'pajamas' [0.1] | 'I' [0.5]
V -> 'shot' [0.3] | 'am' [0.2] | 'is' [0.5]
P -> 'in' [0.5] | 'of' [0.5]
""")

# Sentence to parse, already tokenised.
sentence = ['I', 'shot', 'an', 'elephant']
# ChartParser is used to enumerate the parses; the tree probabilities are
# computed separately from the grammar's production probabilities.
parser = nltk.ChartParser(pcfg_grammar)

def calculate_tree_probability(tree, grammar):
    """Multiply the grammar probabilities of every production used in `tree`.

    Each production of the tree is matched against the grammar's productions
    by comparing left-hand and right-hand sides; the probability of the first
    matching rule is used. A tree production with no matching rule leaves the
    running product unchanged.
    """
    probability = 1.0
    for used in tree.productions():
        matching_probs = (
            candidate.prob()
            for candidate in grammar.productions()
            if candidate.lhs() == used.lhs() and candidate.rhs() == used.rhs()
        )
        # next() stops at the first match, mirroring a loop with an early break.
        first_match = next(matching_probs, None)
        if first_match is not None:
            probability *= first_match
    return probability

# Materialise all parses so they can be iterated twice (printing, then summing).
parses = list(parser.parse(sentence))

# Print each parse tree together with its probability.
i = 0
for tree in parses:
    print(f"Parse Tree {i+1}:")
    i += 1
    print(tree)
    prob = calculate_tree_probability(tree, pcfg_grammar)
    print(f"Probability of this tree: {prob}\n")

# Sum of the individual tree probabilities over all parses found.
total_prob = sum(calculate_tree_probability(tree, pcfg_grammar) for tree in parses)
print(f"Total probability of all trees: {total_prob}")

Parse Tree 1:
(S (N I) (VP (V shot) (NP (Det an) (N elephant))))
Probability of this tree: 0.00054

Parse Tree 2:
(S (NP I) (VP (V shot) (NP (Det an) (N elephant))))
Probability of this tree: 0.000252

Total probability of all trees: 0.000792


In [None]:
from nltk import PCFG, ViterbiParser
import nltk

# Define the PCFG grammar: the bracketed number after each right-hand side is
# that production's probability.
# NOTE(review): "VP -> V NP [0.25]" is listed twice in the VP rule; confirm
# whether one of the two was meant to be a different expansion.
pcfg_grammar = PCFG.fromstring("""
S -> NP VP [0.7]
S -> N VP [0.3]
PP -> P NP [1.0]
NP -> Det N [0.3] | Det N PP [0.6] | 'I' [0.1]
VP -> V NP [0.25] | VP PP [0.25] | V NP [0.25] | V PP [0.25]
Det -> 'an' [0.4] | 'my' [0.3] | 'a' [0.3]
N -> 'elephant' [0.4] | 'pajamas' [0.1] | 'I' [0.5]
V -> 'shot' [0.3] | 'am' [0.2] | 'is' [0.5]
P -> 'in' [0.5] | 'of' [0.5]
""")

# Define the sentence to parse (already tokenised)
sentence = ['I', 'shot', 'an', 'elephant']

# Initialize the Viterbi parser with the PCFG grammar
viterbi_parser = ViterbiParser(pcfg_grammar)

# Parse the sentence; the result is materialised into a list so that it can be
# tested for emptiness and indexed.
parses = list(viterbi_parser.parse(sentence))

# If there is a parse, print the first one and the probability stored on it
if parses:
    best_tree = parses[0]
    print("Most probable parse tree:")
    print(best_tree)
    print(f"Probability of the most probable parse tree: {best_tree.prob()}")
else:
    print("No parse found.")


Most probable parse tree:
(S (N I) (VP (V shot) (NP (Det an) (N elephant)))) (p=0.00054)
Probability of the most probable parse tree: 0.00054


Word2vec

In [None]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Flatten
from tensorflow.keras.preprocessing.text import Tokenizer

# Sample text data
text = "This is a sample text for word2vec. We will use this text to train our model."

# Tokenization: word_index maps every word to a 1-based id; id 0 is reserved,
# which is why the embedding and output layers need len(word_index) + 1 slots.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1

# Model (Skip-Gram): centre-word id -> embedding -> softmax over context ids
model = Sequential([
    Embedding(vocab_size, 100, input_length=1),
    Flatten(),
    Dense(vocab_size, activation='softmax')
])
model.compile('adam', 'sparse_categorical_crossentropy')

# Training data: (centre id, context id) pairs taken from a sliding window over
# the text itself. The pairs must come from word positions in the text, not
# from positions in the vocabulary, and must use the tokenizer's own ids.
window_size = 2
token_ids = tokenizer.texts_to_sequences([text])[0]
data = [(token_ids[i], token_ids[j])
        for i in range(len(token_ids))
        for j in range(max(0, i - window_size), min(len(token_ids), i + window_size + 1))
        if i != j]

# Train the model
x_train, y_train = zip(*data)
model.fit(np.array(x_train), np.array(y_train), epochs=5, verbose=1)

# Find similar words
def find_similar_words(word, top_n=5):
    """Return up to `top_n` vocabulary words closest to `word` by cosine similarity.

    Row i of the embedding matrix belongs to tokenizer id i, so the query
    vector is read from row ``word_index[word]`` and a result row i is mapped
    back through ``index_word[i]``. Row 0 (the reserved id) and the query word
    itself are excluded from the result.

    Raises:
        KeyError: if `word` is not in the tokenizer vocabulary.
    """
    embeddings = model.layers[0].get_weights()[0]
    idx = tokenizer.word_index[word]
    word_vec = embeddings[idx]
    # Cosine similarity needs both norms; guard against all-zero rows.
    norms = np.linalg.norm(embeddings, axis=1) * np.linalg.norm(word_vec)
    similarities = embeddings @ word_vec / np.maximum(norms, 1e-12)
    ranked = [int(i) for i in np.argsort(similarities)[::-1] if i not in (0, idx)]
    return [tokenizer.index_word[i] for i in ranked[:top_n]]

print(find_similar_words('text'))


Epoch 1/5




[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - loss: 2.7166
Epoch 2/5
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 2.7034 
Epoch 3/5
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 2.6950 
Epoch 4/5
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 2.6850 
Epoch 5/5
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 2.6765 
['our', 'this', 'sample', 'use', 'word2vec']


In [None]:
#Implement word2vec model to explore the semantic similarity between the words.
import gensim
from gensim.models import Word2Vec
import nltk
#nltk.download('punkt')
from nltk.tokenize import word_tokenize, sent_tokenize
#sentences = ['I love programming', 'Python is great', 'I enjoy machine learning',
            # 'TensorFlow is a powerful tool', 'AI is the future']
# Load the corpus as one string.
# NOTE(review): the path is hard-coded; the file must exist before running.
with open('/content/ab.txt', 'r') as f:
  sentences=f.read()
sentences = sent_tokenize(sentences)  # Tokenize the text into sentences
tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]  # Tokenize each sentence


#print(tokenized_sentences)
# Train Word2Vec on the tokenised sentences: 100-dimensional vectors, a context
# window of 5, and min_count=1 so that words seen only once are kept.
model_w2v = Word2Vec(sentences=tokenized_sentences, vector_size=100, window=5, min_count=1, workers=4)

# Word vectors (semantic similarity): print the 5 nearest words to `word`.
# NOTE(review): `word` presumably has to occur in the corpus for the lookup to
# succeed; confirm against the text file in use.
word = 'strong'
similar_words = model_w2v.wv.most_similar(word, topn=5)
print(f"Words most similar to '{word}':")
for sim_word, sim_score in similar_words:
    print(f"{sim_word}: {sim_score:.4f}")
#model_w2v.save("word2vec_model.bin")
#loaded_model = Word2Vec.load("word2vec_model.bin")

Words most similar to 'strong':
formatted: 0.3225
and: 0.3115
colleagues: 0.2919
their: 0.2623
a: 0.2595


TF-IDF and Bag of Words


In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import *
from sklearn.model_selection import *
from sklearn.preprocessing import *
from tensorflow import *
from tensorflow.keras import *
from tensorflow.keras.models import *
# Explicit imports for names this cell uses but the wildcard imports above do
# not provide; listed last so that no wildcard import can shadow them.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from tensorflow.keras.layers import Dense
# Toy dataset: three short sentences with a sentiment label each.
data = {'text':['i love programming', 'pythhon is interesting', 'i enjoy machine learning'],
        'label':['positive', 'positive','neutral']}
df = pd.DataFrame(data)
# Replace the string labels with integer codes.
df['label']=LabelEncoder().fit_transform(df['label'])
# Hold out 20% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2,random_state=42)
def vectorize(vectorizer):
  """Fit `vectorizer` on the global X_train and return dense (train, test) matrices.

  The vectorizer is fitted on the training texts only; the test texts in the
  global X_test are transformed with that fitted vocabulary.
  """
  train_matrix = vectorizer.fit_transform(X_train)
  test_matrix = vectorizer.transform(X_test)
  return train_matrix.toarray(), test_matrix.toarray()
def build(X_train_vec, X_test_vec):
    """Train a small dense binary classifier and return its test accuracy.

    The network has two ReLU hidden layers (16 and 8 units) and a sigmoid
    output. It is fitted on `X_train_vec` against the global y_train and
    evaluated on `X_test_vec` against the global y_test.
    """
    n_features = X_train_vec.shape[1]
    model = Sequential()
    model.add(Dense(16, activation='relu', input_dim=n_features))
    model.add(Dense(8, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_train_vec, y_train, epochs=10, batch_size=2, verbose=1)
    # evaluate() yields [loss, accuracy] for the single compiled metric.
    results = model.evaluate(X_test_vec, y_test)
    return results[1]
# Train and evaluate one model per representation: raw term counts (bag of
# words) first, then TF-IDF weights.
acc_bow = build(*vectorize(CountVectorizer()))
acc_tfidf = build(*vectorize(TfidfVectorizer()))
print("bow: ",acc_bow)
print("tfidf: ",acc_tfidf)

Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step - accuracy: 0.5000 - loss: 0.6851
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - accuracy: 0.5000 - loss: 0.6790
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step - accuracy: 0.5000 - loss: 0.6730
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - accuracy: 0.5000 - loss: 0.6670
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - accuracy: 0.5000 - loss: 0.6610
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - accuracy: 0.5000 - loss: 0.6551
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - accuracy: 0.5000 - loss: 0.6493
Epoch 8/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step - accuracy: 0.5000 - loss: 0.6435
Epoch 9/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 217ms/step - accuracy: 0.0000e+00 - loss: 0.7013
Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - accuracy: 1.0000 - loss: 0.6341
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step - accuracy: 1.0000 - loss: 0.6300
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - accuracy: 1.0000 - loss: 0.6262
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - accuracy: 1.0000 - loss: 0.6222
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step - accuracy: 1.0000 - loss: 0.6182
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - accuracy: 1.0000 - loss: 0.6143
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step - accuracy: 1.0000 - loss: 0.6104
Epoch 8/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import *
from sklearn.preprocessing import *
from tensorflow.keras import *
from tensorflow.keras.layers import *
# Toy dataset: five short sentences with a sentiment label each.
data = {'text': ['I love programming', 'Python is great', 'I enjoy machine learning',
                 'TensorFlow is a powerful tool', 'AI is the future'],
        'label': ['positive', 'positive', 'positive', 'positive', 'neutral']}
df = pd.DataFrame(data)
# Replace the string labels with integer codes.
df['label'] = LabelEncoder().fit_transform(df['label'])
# Hold out 20% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)
def vectorize_data(vectorizer):
    """Return dense (train, test) feature matrices produced by `vectorizer`.

    The vectorizer learns its vocabulary from the global X_train and is then
    applied unchanged to the global X_test.
    """
    fitted_train = vectorizer.fit_transform(X_train)
    dense_train = fitted_train.toarray()
    dense_test = vectorizer.transform(X_test).toarray()
    return dense_train, dense_test

# Build and train model
def build_and_train_model(X_train_vec, X_test_vec):
    """Fit a 16-8-1 dense network on the training matrix and return test accuracy.

    Training targets come from the global y_train and evaluation targets from
    the global y_test; the returned value is the accuracy entry of
    ``model.evaluate``.
    """
    layers = [
        Dense(16, activation='relu', input_dim=X_train_vec.shape[1]),
        Dense(8, activation='relu'),
        Dense(1, activation='sigmoid'),
    ]
    model = Sequential(layers)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_train_vec, y_train, epochs=10, batch_size=2, verbose=1)
    evaluation = model.evaluate(X_test_vec, y_test)  # [loss, accuracy]
    return evaluation[1]

# Evaluate models: train one network on bag-of-words counts and report its
# test accuracy ...
accuracy_bow = build_and_train_model(*vectorize_data(CountVectorizer()))
print(f'BoW Model Accuracy: {accuracy_bow:.2f}')

# ... then repeat with TF-IDF features.
accuracy_tfidf = build_and_train_model(*vectorize_data(TfidfVectorizer()))
print(f'TF-IDF Model Accuracy: {accuracy_tfidf:.2f}')


Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.6667 - loss: 0.6387
Epoch 2/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.8333 - loss: 0.6310 
Epoch 3/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.6667 - loss: 0.6574 
Epoch 4/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.8333 - loss: 0.6409 
Epoch 5/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.8333 - loss: 0.6361 
Epoch 6/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.8333 - loss: 0.6237 
Epoch 7/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.8333 - loss: 0.6179 
Epoch 8/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.6667 - loss: 0.6157 
Epoch 9/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 241ms/step - accuracy: 0.0000e+00 - loss: 0.7259
TF-IDF Model Accuracy: 0.00


**Bag Of Words**

In [None]:
import numpy as np
import tensorflow as tf
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Sample data: 1 = positive, 0 = negative
texts = ["This is a positive review", "I enjoyed the movie a lot", "Great performance by the actors",
         "The plot was intriguing", "Negative feedback about the service", "Disappointed with the product quality",
         "Worst experience ever"]
labels = [1, 1, 1, 1, 0, 0, 0]

# Vectorize the text. The fitted vectorizer is kept so that new sentences are
# mapped onto exactly the same feature columns at prediction time.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(texts).toarray()
y = np.array(labels)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build, compile and train the model (a single sigmoid unit, i.e. logistic regression)
model = tf.keras.Sequential([tf.keras.layers.Dense(1, activation='sigmoid', input_shape=(X_train.shape[1],))])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=2)

# Evaluate and predict
accuracy = model.evaluate(X_test, y_test)[1]
new_texts = ["I loved the movie", "Worst product ever"]
# Reuse the fitted vectorizer instead of fitting a second one on the fly.
predictions = (model.predict(vectorizer.transform(new_texts).toarray()) > 0.5).astype(int)

print(f"Test Accuracy: {accuracy * 100:.2f}%")
print("Predictions:", predictions)


Epoch 1/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8375 - loss: 0.6217  
Epoch 2/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8375 - loss: 0.6287 
Epoch 3/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7125 - loss: 0.6610 
Epoch 4/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7125 - loss: 0.6479 
Epoch 5/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8375 - loss: 0.6096 
Epoch 6/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8375 - loss: 0.6071 
Epoch 7/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8375 - loss: 0.5985 
Epoch 8/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8375 - loss: 0.6108 
Epoch 9/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

TFIDF

In [None]:
import numpy as np
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Sample data: 1 = positive, 0 = negative
texts = [
    "This is a positive review", "I enjoyed the movie a lot", "Great performance by the actors",
    "The plot was intriguing", "Negative feedback about the service",
    "Disappointed with the product quality", "Worst experience ever"
]
labels = [1, 1, 1, 1, 0, 0, 0]

# TF-IDF Vectorization and data split. The fitted vectorizer is kept so that
# new sentences get the same columns and the same IDF weights at prediction time.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(texts).toarray()
y = np.array(labels)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model creation, compilation, and training (a single sigmoid unit)
model = tf.keras.Sequential([tf.keras.layers.Dense(1, activation='sigmoid', input_shape=(X_train.shape[1],))])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=2)

# Evaluation and prediction
accuracy = model.evaluate(X_test, y_test, verbose=0)[1]
new_texts = ["I loved the movie", "Worst product ever"]
# Reuse the fitted vectorizer instead of fitting a second one on the fly.
predictions = (model.predict(vectorizer.transform(new_texts).toarray()) > 0.5).astype(int)

print(f"Accuracy: {accuracy * 100:.2f}%")
print("Predictions:", predictions)


Epoch 1/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.4500 - loss: 0.7047  
Epoch 2/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.2625 - loss: 0.7475     
Epoch 3/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.4500 - loss: 0.7051 
Epoch 4/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.5750 - loss: 0.6946 
Epoch 5/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.4500 - loss: 0.6944 
Epoch 6/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.2625 - loss: 0.7256     
Epoch 7/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.4500 - loss: 0.7238 
Epoch 8/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.4500 - loss: 0.6926 
Epoch 9/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3

**Text Generation using LSTM**

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Sample corpus
corpus = [
    "This is a positive review",
    "I enjoyed the movie a lot",
    "Great performance by the actors",
    "The plot was intriguing",
    "Negative feedback about the service",
    "Disappointed with the product quality",
    "Worst experience ever"
]

# Tokenize and create sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1  # +1 because the word ids start at 1
# One id sequence per sentence ...
sequences = [tokenizer.texts_to_sequences([text])[0] for text in corpus]
# ... expanded into every prefix of length >= 2, so that each prefix's last
# word can serve as the prediction target for the words before it.
sequences = [seq[:i+1] for seq in sequences for i in range(1, len(seq))]

# Prepare input (X) and output (y): left-pad every prefix to the longest
# length, then split off the last id as the target.
X, y = zip(*[(seq[:-1], seq[-1]) for seq in pad_sequences(sequences, maxlen=max(len(seq) for seq in sequences), padding='pre')])
# Targets are one-hot encoded to match the categorical cross-entropy loss.
X, y = np.array(X), tf.keras.utils.to_categorical(np.array(y), num_classes=total_words)

# Build and compile model: embedding -> LSTM -> softmax over the vocabulary
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(total_words, 10, input_length=X.shape[1]),
    tf.keras.layers.LSTM(50),
    tf.keras.layers.Dense(total_words, activation='softmax')
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model (verbose=0 keeps the 100 epochs out of the cell output)
model.fit(X, y, epochs=100, verbose=0)

# Generate text based on seed
def generate_text(seed_text, n_words=5):
    """Extend ``seed_text`` with up to ``n_words`` words predicted by the model.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``X`` defined by
    the training code of this cell.

    Args:
        seed_text: Phrase to start from; it is re-encoded on every step.
        n_words: Maximum number of words to append.

    Returns:
        The seed text followed by the generated words, separated by spaces.
        Generation stops early if the model predicts class 0, which is the
        padding index and has no entry in ``tokenizer.index_word`` (looking it
        up directly would raise ``KeyError``).
    """
    for _ in range(n_words):
        encoded_seed = tokenizer.texts_to_sequences([seed_text])[0]
        # Same left-padding and width as the training inputs.
        token_list = pad_sequences([encoded_seed], maxlen=X.shape[1], padding='pre')
        # verbose=0 keeps Keras from printing a progress bar per generated word.
        predicted_index = int(np.argmax(model.predict(token_list, verbose=0)))
        next_word = tokenizer.index_word.get(predicted_index)
        if next_word is None:
            break
        seed_text += ' ' + next_word
    return seed_text

# Continue the seed phrase "This is" with five predicted words and show the result
print(generate_text(seed_text="This is", n_words=5))




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 166ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
This is the a review lot lot


In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Sample data
corpus = [
    "This is a positive review", "I enjoyed the movie a lot",
    "Great performance by the actors", "The plot was intriguing",
    "Negative feedback about the service", "Disappointed with the product quality",
    "Worst experience ever"
]

# Step 1: Tokenize the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
# Number of output classes: one per vocabulary word plus index 0, which is
# never assigned to a word and is used as the padding value.
total_words = len(tokenizer.word_index) + 1

# Step 2: Create sequences for training
# Every sentence contributes each of its prefixes of length >= 2.
sequences = []
for text in corpus:
    token_list = tokenizer.texts_to_sequences([text])[0]
    for i in range(1, len(token_list)):
        sequences.append(token_list[:i+1])  # Creates incremental sequences

# Step 3: Prepare X (inputs) and y (outputs) for the model
# pad_sequences left-pads with 0 up to the longest context by default.
X = pad_sequences([seq[:-1] for seq in sequences])  # Inputs (all except the last word)
y = tf.keras.utils.to_categorical([seq[-1] for seq in sequences], num_classes=total_words)  # Output (last word)

# Step 4: Build the model
# `input_length` is not passed to Embedding: it is deprecated in Keras 3 and
# the sequence length is inferred from X when the model is fitted.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(total_words, 10),  # Embedding layer for word representations
    tf.keras.layers.LSTM(50),  # LSTM layer to learn sequence patterns
    tf.keras.layers.Dense(total_words, activation='softmax')  # Output layer for word prediction
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X, y, epochs=100, verbose=0)

# Step 5: Function to generate text
def generate_text(seed_text, num_words=5):
    """Append up to ``num_words`` model-predicted words to ``seed_text``.

    Uses the notebook-level ``tokenizer``, ``model`` and ``X`` created by the
    preceding steps of this cell.

    Args:
        seed_text: Starting phrase; it is re-tokenized on each iteration.
        num_words: Maximum number of words to generate.

    Returns:
        The seed text with the generated words appended, space separated.
        If the model predicts index 0 (the padding class, absent from
        ``tokenizer.index_word``), generation stops instead of raising
        ``KeyError``.
    """
    for _ in range(num_words):
        seed_ids = tokenizer.texts_to_sequences([seed_text])[0]
        # Pad/truncate to the width the model was trained on.
        token_list = pad_sequences([seed_ids], maxlen=X.shape[1])
        # verbose=0 suppresses the per-call Keras progress bar.
        probabilities = model.predict(token_list, verbose=0)
        predicted_word_index = int(np.argmax(probabilities, axis=-1)[0])
        predicted_word = tokenizer.index_word.get(predicted_word_index)
        if predicted_word is None:
            break
        seed_text += ' ' + predicted_word
    return seed_text

# Demo: let the model add five words to the seed phrase "This is"
print(generate_text(seed_text="This is", num_words=5))




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 155ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
This is the a positive review review


In [None]:
import tensorflow as tf
import numpy as np
from nltk.tokenize import word_tokenize

# Define a basic machine translation dataset (English -> French toy pairs)
source_texts = ['hello', 'how are you', 'goodbye']
target_texts = ['bonjour', 'comment ça va', 'au revoir']

# Tokenize every sentence once. The vocabularies and the integer sequences
# below are both derived from these token lists, so a token can never be
# missing from the vocabulary it is looked up in.
source_tokens = [word_tokenize(text) for text in source_texts]
target_tokens = [word_tokenize(text) for text in target_texts]

# Create vocabulary and mapping from words to integers.
# The vocabularies are sorted so that each word receives the same id on every
# run; iterating a plain set gives an order that can change between sessions.
source_vocab = sorted({word for tokens in source_tokens for word in tokens})
target_vocab = sorted({word for tokens in target_tokens for word in tokens})

source_vocab_size = len(source_vocab) + 1  # +1 for padding
target_vocab_size = len(target_vocab) + 1  # +1 for padding

# Ids start at 1; 0 is left for the padding value.
source_word_to_int = {word: idx for idx, word in enumerate(source_vocab, start=1)}
target_word_to_int = {word: idx for idx, word in enumerate(target_vocab, start=1)}

source_int_to_word = {idx: word for word, idx in source_word_to_int.items()}
target_int_to_word = {idx: word for word, idx in target_word_to_int.items()}


# Convert text sequences to integer sequences
source_sequences = [[source_word_to_int[word] for word in tokens] for tokens in source_tokens]
target_sequences = [[target_word_to_int[word] for word in tokens] for tokens in target_tokens]

# Pad sequences to the same length.
# The model emits one prediction per input step, so both sides share one
# length. It is taken over source AND target sentences: sizing it from the
# source alone would make pad_sequences silently truncate a longer target.
max_sequence_length = max(len(seq) for seq in source_sequences + target_sequences)

source_sequences = tf.keras.preprocessing.sequence.pad_sequences(source_sequences, maxlen=max_sequence_length, padding='post')
target_sequences = tf.keras.preprocessing.sequence.pad_sequences(target_sequences, maxlen=max_sequence_length, padding='post')

# Build the model
# NOTE: these two tuples are not passed to any layer below; they are kept
# only because the original cell defined them.
input_shape = (max_sequence_length, source_vocab_size)
output_shape = (max_sequence_length, target_vocab_size)

# `input_length` is not passed to Embedding: it is deprecated in Keras 3 and
# the sequence length is inferred from the training data.
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(source_vocab_size, 64),
    tf.keras.layers.SimpleRNN(128, return_sequences=True),
    tf.keras.layers.Dense(target_vocab_size, activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# One-hot encode the target sequences (one row at a time, giving an array of
# shape (sentences, max_sequence_length, target_vocab_size)).
target_sequences_one_hot = np.array([tf.keras.utils.to_categorical(seq, num_classes=target_vocab_size) for seq in target_sequences])

# Train the model
model.fit(source_sequences, target_sequences_one_hot, epochs=30)

# Translate a new input sequence
input_text = "how are you"
# Tokenize exactly as the training data was tokenized (word_tokenize, not
# str.split) and map words the model has never seen to 0, the padding id,
# instead of failing with KeyError.
input_ids = [source_word_to_int.get(word, 0) for word in word_tokenize(input_text)]
input_sequence = tf.keras.preprocessing.sequence.pad_sequences([input_ids], maxlen=max_sequence_length, padding='post')

# One probability vector over the target vocabulary per time step.
output_probabilities = model.predict(input_sequence)[0]
# Decode the output sequence: take the most likely class at every step and
# drop steps where that class is 0 (padding).
predicted_ids = [int(idx) for idx in np.argmax(output_probabilities, axis=-1)]
output_sequence = [target_int_to_word[idx] for idx in predicted_ids if idx != 0]
print("Translated Sequence:", ' '.join(output_sequence))

Epoch 1/30




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - accuracy: 0.1111 - loss: 1.9331
Epoch 2/30
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - accuracy: 0.4444 - loss: 1.8867
Epoch 3/30
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step - accuracy: 0.5556 - loss: 1.8405
Epoch 4/30
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - accuracy: 0.7778 - loss: 1.7938
Epoch 5/30
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - accuracy: 0.7778 - loss: 1.7463
Epoch 6/30
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - accuracy: 0.7778 - loss: 1.6975
Epoch 7/30
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step - accuracy: 0.7778 - loss: 1.6469
Epoch 8/30
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - accuracy: 0.7778 - loss: 1.5946
Epoch 9/30
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms

In [None]:
import tensorflow as tf
import numpy as np
from nltk.tokenize import word_tokenize

# word_tokenize needs NLTK's Punkt data. Fetch it when it is missing so this
# cell also runs on a fresh kernel (otherwise it fails with
# "LookupError: Resource punkt not found"). Newer NLTK releases look for
# 'punkt_tab' instead of 'punkt', so both are checked.
import nltk

for _resource in ('punkt', 'punkt_tab'):
    try:
        nltk.data.find(f'tokenizers/{_resource}')
    except LookupError:
        nltk.download(_resource, quiet=True)

# Define a basic dataset
source_texts = ['hello', 'how are you', 'goodbye']
target_texts = ['bonjour', 'comment ça va', 'au revoir']

# Tokenize each sentence once; vocabularies and sequences share these tokens.
source_tokens = [word_tokenize(text) for text in source_texts]
target_tokens = [word_tokenize(text) for text in target_texts]

# Create vocabulary mappings (word -> id, ids start at 1, 0 is padding).
# Words are sorted first so the ids are identical on every run; enumerating a
# bare set yields an order that can differ between sessions.
source_vocab = {
    word: idx
    for idx, word in enumerate(sorted({w for tokens in source_tokens for w in tokens}), start=1)
}
target_vocab = {
    word: idx
    for idx, word in enumerate(sorted({w for tokens in target_tokens for w in tokens}), start=1)
}
source_sequences = [[source_vocab[word] for word in tokens] for tokens in source_tokens]
target_sequences = [[target_vocab[word] for word in tokens] for tokens in target_tokens]

# Pad sequences
# Use the longest sentence on either side: the model predicts one target
# token per source step, and a length taken from the source alone would make
# pad_sequences silently cut off longer target sentences.
max_len = max(map(len, source_sequences + target_sequences))
source_sequences = tf.keras.preprocessing.sequence.pad_sequences(source_sequences, maxlen=max_len, padding='post')
target_sequences = tf.keras.preprocessing.sequence.pad_sequences(target_sequences, maxlen=max_len, padding='post')
target_sequences_one_hot = np.array([tf.keras.utils.to_categorical(seq, num_classes=len(target_vocab)+1) for seq in target_sequences])

# Model
# `input_length` is not passed to Embedding: it is deprecated in Keras 3 and
# the sequence length is inferred from the training data.
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(len(source_vocab) + 1, 64),
    tf.keras.layers.SimpleRNN(128, return_sequences=True),
    tf.keras.layers.Dense(len(target_vocab) + 1, activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(source_sequences, target_sequences_one_hot, epochs=30)

# Translation
# Encode the query with the training vocabulary; unknown words become 0 (padding).
input_seq = tf.keras.preprocessing.sequence.pad_sequences(
    [[source_vocab.get(word, 0) for word in word_tokenize("how are you")]], maxlen=max_len, padding='post')
# One probability vector over the target vocabulary per time step.
output_seq = model.predict(input_seq)[0]
# Explicit id -> word lookup, built once. This replaces indexing into
# list(target_vocab.keys()) on every step, which rebuilt the list each time
# and depended on the dict's insertion order lining up with the ids.
target_index_to_word = {idx: word for word, idx in target_vocab.items()}
predicted_ids = [int(idx) for idx in np.argmax(output_seq, axis=-1)]
# Skip steps whose most likely class is 0 (padding).
translated = ' '.join(target_index_to_word[idx] for idx in predicted_ids if idx != 0)
print("Translated Sequence:", translated)


LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/PY3/english.pickle[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - ''
**********************************************************************
