In [11]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m70.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━

In [12]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec


In [13]:
# Sample Data (replace with your actual data)
texts = [
    "This is a positive review about the excellent product.",
    "The movie was absolutely terrible and boring.",
    "I really enjoyed the fantastic service.",
    "This is the worst experience I've ever had.",
    "The food was delicious and the staff was friendly.",
    "A completely waste of time and money.",
]
labels = np.array([1, 0, 1, 0, 1, 0])  # 1 for positive, 0 for negative


In [14]:
# 1. Tokenization and Vocabulary Building
tokenizer = Tokenizer(num_words=100)  # Consider a larger vocabulary size
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1

In [15]:
# 2. Sequence Conversion and Padding
sequences = tokenizer.texts_to_sequences(texts)
max_length = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

"""
# 3. Word2Vec Embedding (Simplified Example - In a real scenario, you'd load pre-trained or train your own)
embedding_dim = 100  # Dimension of the word embeddings
embedding_matrix = np.random.rand(vocab_size, embedding_dim) # Replace with actual Word2Vec embeddings
"""

"\n# 3. Word2Vec Embedding (Simplified Example - In a real scenario, you'd load pre-trained or train your own)\nembedding_dim = 100  # Dimension of the word embeddings\nembedding_matrix = np.random.rand(vocab_size, embedding_dim) # Replace with actual Word2Vec embeddings\n"

In [2]:
import gensim.downloader as api
word2vec_model = api.load('word2vec-google-news-300') # Example: loads Google News model
word2vec_model.save("word2vec-google-news-300.model") # Save it locally



In [6]:
from gensim.models import Word2Vec

In [9]:
#word2vec_model = Word2Vec.load("/content/word2vec-google-news-300.model")
# Load the saved KeyedVectors model
from gensim.models import Word2Vec, KeyedVectors
# Load the saved KeyedVectors model
word2vec_model = KeyedVectors.load("/content/word2vec-google-news-300.model", mmap='r')

In [16]:
# 3. Word2Vec Embedding using Gensim

# Option 1: Load a pre-trained Word2Vec model (replace with your model path)
try:
    #word2vec_model = Word2Vec.load("path/to/your/pretrained_word2vec.model")
    embedding_dim = word2vec_model.vector_size
except FileNotFoundError:
    print("Pre-trained Word2Vec model not found. Please train one or provide the correct path.")
    exit()

# Option 2: Train your own Word2Vec model (uncomment and adjust parameters)
# tokenized_texts = [text.split() for text in texts] # Needs more data for meaningful training
# embedding_dim = 100
# word2vec_model = Word2Vec(tokenized_texts, vector_size=embedding_dim, window=5, min_count=1, workers=4)
# word2vec_model.save("your_trained_word2vec.model") # Save the trained model


In [18]:
# Create the embedding matrix
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in word_index.items():
    # Check if the word exists directly in the KeyedVectors object
    if word in word2vec_model:
        # Access the vector directly from the KeyedVectors object
        embedding_matrix[i] = word2vec_model[word]

print(f"Embedding matrix shape: {embedding_matrix.shape}")


Embedding matrix shape: (35, 300)


In [19]:
# 4. Split Data
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)


In [20]:
# 5. Define the CNN Model
model = Sequential([
    Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=max_length, trainable=False),
    Conv1D(128, 5, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(1, activation='sigmoid') # Binary classification (positive/negative)
])





In [21]:
# 6. Compile the Model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])



In [22]:
# 7. Train the Model
epochs = 10
batch_size = 2
model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=0.1)



Epoch 1/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 334ms/step - accuracy: 0.6111 - loss: 0.7029 - val_accuracy: 0.0000e+00 - val_loss: 0.7339
Epoch 2/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step - accuracy: 1.0000 - loss: 0.2689 - val_accuracy: 0.0000e+00 - val_loss: 0.8726
Epoch 3/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step - accuracy: 1.0000 - loss: 0.0980 - val_accuracy: 0.0000e+00 - val_loss: 1.0094
Epoch 4/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step - accuracy: 1.0000 - loss: 0.0617 - val_accuracy: 0.0000e+00 - val_loss: 1.1202
Epoch 5/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step - accuracy: 1.0000 - loss: 0.0351 - val_accuracy: 0.0000e+00 - val_loss: 1.2219
Epoch 6/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step - accuracy: 1.0000 - loss: 0.0154 - val_accuracy: 0.0000e+00 - val_loss: 1.3120
Epoch 7/10
[1m2/2[0

<keras.src.callbacks.history.History at 0x7ee8b5da7a10>

In [23]:
# 8. Evaluate the Model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step - accuracy: 0.5000 - loss: 1.2828
Test Loss: 1.2828
Test Accuracy: 0.5000


In [24]:
# Example of making predictions
new_texts = ["This is an amazing product!", "It was a terrible disappointment."]
new_sequences = tokenizer.texts_to_sequences(new_texts)
new_padded_sequences = pad_sequences(new_sequences, maxlen=max_length, padding='post')
predictions = model.predict(new_padded_sequences)
for text, pred in zip(new_texts, predictions):
    print(f"Text: '{text}', Prediction (Positive Probability): {pred[0]:.4f}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 86ms/step
Text: 'This is an amazing product!', Prediction (Positive Probability): 0.6174
Text: 'It was a terrible disappointment.', Prediction (Positive Probability): 0.5788
