In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
import re
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

In [2]:
# Load the data
data_path = '/kaggle/input/spam-filter-for-quora-questions/train.csv'
data = pd.read_csv(data_path)

In [3]:
# Inspect the dataset
print("Dataset Overview:")
print(data.head())
print("\nDataset Info:")
data.info()

Dataset Overview:
                    qid                                      question_text  \
0  00002165364db923c7e6  How did Quebec nationalists see their province...   
1  000032939017120e6e44  Do you have an adopted dog, how would you enco...   
2  0000412ca6e4628ce2cf  Why does velocity affect time? Does velocity a...   
3  000042bf85aa498cd78e  How did Otto von Guericke used the Magdeburg h...   
4  0000455dfa3e01eae3af  Can I convert montra helicon D to a mountain b...   

   target  
0       0  
1       0  
2       0  
3       0  
4       0  

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1306122 entries, 0 to 1306121
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype 
---  ------         --------------    ----- 
 0   qid            1306122 non-null  object
 1   question_text  1306122 non-null  object
 2   target         1306122 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 29.9+ MB


In [4]:
# Preprocess the data
def clean_text(text):
    text = re.sub(r'[^a-zA-Z0-9 ]', '', text)  # Remove special characters
    text = text.lower()  # Convert to lowercase
    return text

data['question_text'] = data['question_text'].apply(clean_text)

In [5]:
# Extract features and labels
X = data['question_text']
y = data['target']  # Assuming 'target' column contains the labels

In [6]:
# Train-validation split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# Tokenize and pad sequences
max_words = 20000  # Vocabulary size
max_len = 100  # Maximum length of sequences

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_val_pad = pad_sequences(X_val_seq, maxlen=max_len)

In [8]:
# !wget http://nlp.stanford.edu/data/glove.42B.300d.zip

In [9]:
# !unzip /kaggle/working/glove.42B.300d.zip
# !rm /kaggle/working/glove.42B.300d.zip

In [10]:
# Load GloVe embeddings
glove_path = '/kaggle/input/glove-42b-300de/glove.42B.300d.txt'  # Replace with your GloVe file path
embedding_dim = 100

embeddings_index = {}
with open(glove_path, 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print(f"Loaded {len(embeddings_index)} word vectors from GloVe.")

Loaded 1917494 word vectors from GloVe.


In [11]:
# Create embedding matrix
word_index = tokenizer.word_index
# Update the embedding dimension to match the GloVe file (e.g., 300)
embedding_dim = 300

# Create embedding matrix
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    if i < max_words:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector


In [12]:
# Build the model
model = Sequential([
    Embedding(input_dim=max_words, 
              output_dim=embedding_dim, 
              weights=[embedding_matrix], 
              input_length=max_len, 
              trainable=False),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

print(model.summary())



None


In [13]:
# Train the model
batch_size = 32
epochs = 5

history = model.fit(
    X_train_pad, y_train,
    validation_data=(X_val_pad, y_val),
    batch_size=batch_size,
    epochs=epochs,
    verbose=1
)

Epoch 1/5
[1m32654/32654[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4264s[0m 130ms/step - accuracy: 0.9495 - loss: 0.1326 - val_accuracy: 0.9582 - val_loss: 0.1059
Epoch 2/5
[1m32654/32654[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4254s[0m 130ms/step - accuracy: 0.9566 - loss: 0.1102 - val_accuracy: 0.9578 - val_loss: 0.1061
Epoch 3/5
[1m  956/32654[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:03:28[0m 120ms/step - accuracy: 0.9578 - loss: 0.1092

KeyboardInterrupt: 

In [14]:
# Evaluate the model
val_pred = (model.predict(X_val_pad) > 0.5).astype(int)
print("Classification Report:")
print(classification_report(y_val, val_pred))

[1m8164/8164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m325s[0m 40ms/step
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.98      0.98    245369
           1       0.70      0.57      0.63     15856

    accuracy                           0.96    261225
   macro avg       0.84      0.78      0.80    261225
weighted avg       0.96      0.96      0.96    261225

