In [3]:
# Step 1: Install & Import Libraries
!pip install -q tensorflow pandas numpy scikit-learn

import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Step 2: Load Dataset (Assumes file is already uploaded to Colab)
data_path = "/content/train.csv"
print(f"📄 Loading dataset from: {data_path}")
df = pd.read_csv(data_path)

# Step 3: Clean text
def clean_text(text):
    text = str(text).lower()
    return re.sub(f"[{re.escape(string.punctuation)}]", "", text)

print("🧹 Cleaning text data...")
df['cleaned_text'] = df['question_text'].apply(clean_text)

# Step 4: Tokenize & Pad
MAX_NUM_WORDS = 50000
MAX_SEQUENCE_LENGTH = 100

print("🔢 Tokenizing and padding sequences...")
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(df['cleaned_text'])
sequences = tokenizer.texts_to_sequences(df['cleaned_text'])

X = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
y = df['target'].values

# Step 5: Load GloVe Embeddings from local file
glove_path = "/content/glove.6B.100d.txt"
print(f"📄 Loading GloVe embeddings from: {glove_path}")

embedding_index = {}
with open(glove_path, encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

EMBEDDING_DIM = 100
word_index = tokenizer.word_index
num_words = min(MAX_NUM_WORDS, len(word_index) + 1)

embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i < MAX_NUM_WORDS:
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

# Step 6: Train/Test Split
print("🧪 Splitting train and validation sets...")
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 7: Build Model
print("🔧 Building the model...")
model = Sequential()
model.add(Embedding(input_dim=num_words,
                    output_dim=EMBEDDING_DIM,
                    weights=[embedding_matrix],
                    input_length=MAX_SEQUENCE_LENGTH,
                    trainable=False))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# Step 8: Train Model
print("🚀 Training model...")
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=5, batch_size=128)

# Step 9: Evaluate
print("📊 Evaluating on validation set...")
loss, accuracy = model.evaluate(X_val, y_val)
print(f"✅ Validation Accuracy: {accuracy * 100:.2f}%")

# Step 10: Save Model
print("💾 Saving model to quora_spam_model.h5")
model.save("quora_spam_model.h5")


📄 Loading dataset from: /content/train.csv
🧹 Cleaning text data...
🔢 Tokenizing and padding sequences...
📄 Loading GloVe embeddings from: /content/glove.6B.100d.txt
🧪 Splitting train and validation sets...
🔧 Building the model...




🚀 Training model...
Epoch 1/5
[1m8164/8164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1694s[0m 207ms/step - accuracy: 0.9461 - loss: 0.1474 - val_accuracy: 0.9551 - val_loss: 0.1153
Epoch 2/5
[1m8164/8164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1711s[0m 208ms/step - accuracy: 0.9538 - loss: 0.1187 - val_accuracy: 0.9570 - val_loss: 0.1117
Epoch 3/5
[1m8164/8164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1707s[0m 209ms/step - accuracy: 0.9553 - loss: 0.1143 - val_accuracy: 0.9580 - val_loss: 0.1082
Epoch 4/5
[1m8164/8164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1706s[0m 209ms/step - accuracy: 0.9561 - loss: 0.1118 - val_accuracy: 0.9584 - val_loss: 0.1065
Epoch 5/5
[1m8164/8164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1704s[0m 209ms/step - accuracy: 0.9566 - loss: 0.1102 - val_accuracy: 0.9587 - val_loss: 0.1064
📊 Evaluating on validation set...
[1m8164/8164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m233s[0m 29ms/step - accuracy: 0.9583 - loss:



✅ Validation Accuracy: 95.87%
💾 Saving model to quora_spam_model.h5


In [4]:
from google.colab import files
files.download("quora_spam_model.h5")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [5]:
import pickle

# Save tokenizer
with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

files.download('tokenizer.pkl')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>