<a href="https://colab.research.google.com/github/PreniAvanessi/Sentiment_analysis_using_Tensorflow/blob/main/untitled40.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Sentiment analysis of IMDB reviews using Logistic Regression


In [None]:
import re
def clean_text(text):
  """Normalize a raw review string before bag-of-words vectorization.

  Steps, in order:
    1. Lowercase the text.
    2. Replace HTML line-break tags (``<br>``, ``<br/>``, ``<br />``) with a
       single space.
    3. Delete every character that is not an ASCII letter, digit or
       whitespace.

  Args:
    text: The review as a ``str``.

  Returns:
    The cleaned, lowercased string. Whitespace is not collapsed, so runs of
    spaces may remain where tags were removed.
  """
  text = text.lower()
  # Without this step the punctuation filter below would strip "<", "/" and
  # ">" but keep the letters, gluing "br" onto the neighbouring words
  # (e.g. "Good<br />Bad" -> "goodbrbad").
  text = re.sub(r"<br\s*/?>", " ", text)
  text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  # remove punctuation
  return text


# Run clean_text over every review in both splits.
# NOTE(review): `train_texts` / `test_texts` are not defined in any cell
# visible above this one — confirm they are loaded before this cell runs.
train_texts_clean = list(map(clean_text, train_texts))
test_texts_clean = list(map(clean_text, test_texts))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF configuration: at most 20,000 features, unigrams and bigrams,
# using scikit-learn's built-in English stop-word list.
tfidf_options = {
    "max_features": 20000,
    "ngram_range": (1, 2),
    "stop_words": "english",
}
vectorizer = TfidfVectorizer(**tfidf_options)

# The vocabulary and IDF weights are fitted on the training texts only;
# the test texts are transformed with that already-fitted vectorizer.
X_train = vectorizer.fit_transform(train_texts_clean)
X_test = vectorizer.transform(test_texts_clean)

print(f"Training matrix shape: {X_train.shape}")
print(f"Test matrix shape: {X_test.shape}")


Training matrix shape: (25000, 20000)
Test matrix shape: (25000, 20000)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Fit the baseline classifier on the TF-IDF features; `fit` returns the
# estimator itself, so construction and training can be chained.
# NOTE(review): `train_labels` / `test_labels` are not defined in any cell
# visible above this one — confirm they are loaded before this cell runs.
model = LogisticRegression(max_iter=2000).fit(X_train, train_labels)

# Score the fitted model on the test matrix.
predictions = model.predict(X_test)
accuracy = accuracy_score(y_true=test_labels, y_pred=predictions)
print(f"Improved Test Accuracy: {accuracy:.4f}")


Improved Test Accuracy: 0.8824


### Sentiment analysis of IMDB reviews using a BiLSTM model


In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_datasets as tfds
from tensorflow.keras.layers import TextVectorization

In [None]:
# Load the train and test splits of "imdb_reviews" as (text, label) pairs,
# together with the dataset metadata object.
(train_data, test_data), info = tfds.load(
    "imdb_reviews",
    split=["train", "test"],
    as_supervised=True,
    with_info=True,
)

In [None]:
vocab_size = 20000     # passed as max_tokens: upper bound on the learned vocabulary size
sequence_length = 200  # each review is padded or truncated to this many token ids

# The recurrent model below needs fixed-size integer input, so this layer
# lowercases the text, strips punctuation, splits it into tokens and maps
# each token to an integer id, producing exactly `sequence_length` ids.
vectorizer = TextVectorization(
    max_tokens=vocab_size,
    output_sequence_length=sequence_length,
    standardize="lower_and_strip_punctuation",
)

# Text-only view of the training split (labels dropped) for learning the vocabulary.
train_text = train_data.map(lambda x, y: x)
# `adapt` is documented to take a *batched* tf.data.Dataset. Feeding it
# unbatched elements updates the vocabulary state one review at a time,
# so batch here purely for the adapt pass.
vectorizer.adapt(train_text.batch(1024))

In [None]:
batch_size = 32


def preprocess(text, label):
    """Turn a raw review into integer token ids; the label passes through unchanged.

    The ids come from the vocabulary the `vectorizer` layer learned in `.adapt()`.
    """
    token_ids = vectorizer(text)
    return token_ids, label


# Training pipeline: vectorize -> shuffle (10,000-element buffer) -> batch -> prefetch.
train_ds = (
    train_data
    .map(preprocess)
    .shuffle(10000)
    .batch(batch_size)
    .prefetch(2)
)

# Evaluation pipeline: same steps without shuffling.
test_ds = (
    test_data
    .map(preprocess)
    .batch(batch_size)
    .prefetch(2)
)


In [None]:
# BiLSTM sentiment classifier:
#   token ids -> 128-d embeddings -> bidirectional LSTM (64 units per direction,
#   final state only) -> dense ReLU layer with dropout -> sigmoid output.
model = keras.Sequential([
    # Declare the fixed input shape explicitly instead of the deprecated
    # Embedding(input_length=...) argument, which Keras 3 ignores with a warning.
    keras.Input(shape=(sequence_length,), dtype="int64"),
    layers.Embedding(input_dim=vocab_size, output_dim=128),
    layers.Bidirectional(layers.LSTM(64, return_sequences=False)),
    layers.Dense(64, activation="relu"),
    layers.Dropout(0.5),
    layers.Dense(1, activation="sigmoid"),
])




In [None]:
# Binary classification setup: Adam optimizer, binary cross-entropy loss,
# accuracy reported during training and evaluation.
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])


In [None]:
# Train for 5 epochs. `test_ds` is passed as validation data, so the
# per-epoch val_* metrics are computed on the test split.
history = model.fit(x=train_ds, epochs=5, validation_data=test_ds)


Epoch 1/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m297s[0m 368ms/step - accuracy: 0.6947 - loss: 0.5608 - val_accuracy: 0.8261 - val_loss: 0.3897
Epoch 2/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m249s[0m 315ms/step - accuracy: 0.9003 - loss: 0.2651 - val_accuracy: 0.8426 - val_loss: 0.3655
Epoch 3/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m288s[0m 349ms/step - accuracy: 0.9457 - loss: 0.1615 - val_accuracy: 0.8404 - val_loss: 0.4897
Epoch 4/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m245s[0m 310ms/step - accuracy: 0.9669 - loss: 0.0994 - val_accuracy: 0.8121 - val_loss: 0.6791
Epoch 5/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m268s[0m 317ms/step - accuracy: 0.9790 - loss: 0.0625 - val_accuracy: 0.8201 - val_loss: 0.6390


In [None]:
# Final evaluation on the test pipeline; with one compiled metric the
# result unpacks into (loss, accuracy).
evaluation = model.evaluate(test_ds)
test_loss, test_acc = evaluation
print("Test Accuracy:", test_acc)


[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 66ms/step - accuracy: 0.8216 - loss: 0.6378
Test Accuracy: 0.8201199769973755
