In [None]:
#### Relu activation function

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout

# Load the dataset
data = pd.read_csv("C:\\Users\\ASUS\\Desktop\\capstone vickey\\spam.csv", encoding='latin1')

# Check the columns
print(data.columns)

# Assume the columns are 'v1' for labels and 'v2' for text
data = data[['v1', 'v2']]
data.columns = ['Label', 'Text']

# Encode the labels
encoder = LabelEncoder()
data['Label'] = encoder.fit_transform(data['Label'])

# Tokenize the text
max_words = 5000
max_sequence_length = 100

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(data['Text'])
sequences = tokenizer.texts_to_sequences(data['Text'])

# Pad sequences to ensure uniform input length
X = pad_sequences(sequences, maxlen=max_sequence_length)

# Labels
y = data['Label'].values

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the RNN model
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=64, input_length=max_sequence_length))
model.add(SimpleRNN(64, return_sequences=False))
model.add(Dropout(0.5))
model.add(Dense(1, activation='relu'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy:.4f}')

# Make predictions
predictions = model.predict(X_test)
predictions = (predictions > 0.5).astype(int)

# Print some example predictions
for i in range(10):
    print(f'Actual: {y_test[i]}, Predicted: {predictions[i][0]}')


Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')




Epoch 1/10
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 28ms/step - accuracy: 0.8777 - loss: 0.7686 - val_accuracy: 0.9552 - val_loss: 0.2941
Epoch 2/10
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 22ms/step - accuracy: 0.9768 - loss: 0.1829 - val_accuracy: 0.9753 - val_loss: 0.2091
Epoch 3/10
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 23ms/step - accuracy: 0.9880 - loss: 0.1060 - val_accuracy: 0.9742 - val_loss: 0.2008
Epoch 4/10
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 22ms/step - accuracy: 0.9926 - loss: 0.0724 - val_accuracy: 0.9787 - val_loss: 0.1763
Epoch 5/10
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 23ms/step - accuracy: 0.9959 - loss: 0.0578 - val_accuracy: 0.9843 - val_loss: 0.1513
Epoch 6/10
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 22ms/step - accuracy: 0.9510 - loss: 0.2003 - val_accuracy: 0.9563 - val_loss: 0.4015
Epoch 7/10
[1m112/112

In [2]:
# Function to predict on new data
def predict_new_data(new_data, model, tokenizer, max_sequence_length):
    # Convert all entries in the 'Text' column to strings
    new_data['Text'] = new_data['Text'].astype(str)
    
    # Tokenize and pad the new data
    new_sequences = tokenizer.texts_to_sequences(new_data['Text'])
    new_X = pad_sequences(new_sequences, maxlen=max_sequence_length)
    
    # Make predictions
    new_predictions = model.predict(new_X)
    new_predictions = (new_predictions > 0.5).astype(int)
    
    return new_predictions

# Function to calculate accuracy on new data
def calculate_accuracy(true_labels, predicted_labels):
    correct = sum(true_labels == predicted_labels)
    total = len(true_labels)
    accuracy = correct / total
    return accuracy

# Example usage for new dataset
# Assuming you have a new dataset in the same format as the original
new_data = pd.read_csv("C:\\Users\\ASUS\\Desktop\\capstone vickey\\new dataset of spam deduction\\spam_or_not_spam.csv", encoding='latin1')
new_data = new_data[['v1', 'v2']]
new_data.columns = ['Label', 'Text']

# Remove rows with NaN values
new_data = new_data.dropna()

# Encode the labels for the new dataset
new_data['Label'] = encoder.transform(new_data['Label'])

# Make predictions on the new dataset
new_predictions = predict_new_data(new_data, model, tokenizer, max_sequence_length)

# Calculate accuracy on the new dataset
new_accuracy = calculate_accuracy(new_data['Label'].values, new_predictions.flatten())
print(f'New Dataset Accuracy: {new_accuracy:.4f}')

# Print some example predictions from the new dataset
for i in range(min(10, len(new_data))):
    print(f'Actual: {new_data["Label"].values[i]}, Predicted: {new_predictions[i][0]}')

[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step
New Dataset Accuracy: 0.7624
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0


In [None]:
# GRU model

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout

# Load the dataset
data = pd.read_csv("C:\\Users\\ASUS\\Desktop\\capstone vickey\\spam.csv", encoding='latin1')

# Check the columns
print(data.columns)

# Assume the columns are 'v1' for labels and 'v2' for text
data = data[['v1', 'v2']]
data.columns = ['Label', 'Text']

# Encode the labels
encoder = LabelEncoder()
data['Label'] = encoder.fit_transform(data['Label'])

# Tokenize the text
max_words = 5000
max_sequence_length = 100

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(data['Text'])
sequences = tokenizer.texts_to_sequences(data['Text'])

# Pad sequences to ensure uniform input length
X = pad_sequences(sequences, maxlen=max_sequence_length)

# Labels
y = data['Label'].values

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# Build the GRU model
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=64, input_length=max_sequence_length))
model.add(GRU(64, return_sequences=False))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy:.4f}')

# Make predictions
predictions = model.predict(X_test)
predictions = (predictions > 0.5).astype(int)

# Print some example predictions
for i in range(10):
    print(f'Actual: {y_test[i]}, Predicted: {predictions[i][0]}')

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')




Epoch 1/10
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 70ms/step - accuracy: 0.8516 - loss: 0.4574 - val_accuracy: 0.9701 - val_loss: 0.1070
Epoch 2/10
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 59ms/step - accuracy: 0.9798 - loss: 0.0691 - val_accuracy: 0.9821 - val_loss: 0.0593
Epoch 3/10
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 59ms/step - accuracy: 0.9951 - loss: 0.0204 - val_accuracy: 0.9791 - val_loss: 0.0903
Epoch 4/10
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 59ms/step - accuracy: 0.9991 - loss: 0.0038 - val_accuracy: 0.9806 - val_loss: 0.0779
Epoch 5/10
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 56ms/step - accuracy: 1.0000 - loss: 0.0021 - val_accuracy: 0.9821 - val_loss: 0.0859
Epoch 6/10
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 56ms/step - accuracy: 1.0000 - loss: 9.1941e-04 - val_accuracy: 0.9821 - val_loss: 0.1014
Epoch 7/10
[1m84/84[0m [32

In [5]:
# Function to predict on new data
def predict_new_data(new_data, model, tokenizer, max_sequence_length):
    # Convert all entries in the 'Text' column to strings
    new_data['Text'] = new_data['Text'].astype(str)
    
    # Tokenize and pad the new data
    new_sequences = tokenizer.texts_to_sequences(new_data['Text'])
    new_X = pad_sequences(new_sequences, maxlen=max_sequence_length)
    
    # Make predictions
    new_predictions = model.predict(new_X)
    new_predictions = (new_predictions > 0.5).astype(int)
    
    return new_predictions

# Function to calculate accuracy on new data
def calculate_accuracy(true_labels, predicted_labels):
    correct = sum(true_labels == predicted_labels)
    total = len(true_labels)
    accuracy = correct / total
    return accuracy

# Example usage for new dataset
# Assuming you have a new dataset in the same format as the original
new_data = pd.read_csv("C:\\Users\\ASUS\\Desktop\\capstone vickey\\new dataset of spam deduction\\spam_or_not_spam.csv", encoding='latin1')
new_data = new_data[['v1', 'v2']]
new_data.columns = ['Label', 'Text']

# Remove rows with NaN values
new_data = new_data.dropna()

# Encode the labels for the new dataset
new_data['Label'] = encoder.transform(new_data['Label'])

# Make predictions on the new dataset
new_predictions = predict_new_data(new_data, model, tokenizer, max_sequence_length)

# Calculate accuracy on the new dataset
new_accuracy = calculate_accuracy(new_data['Label'].values, new_predictions.flatten())
print(f'New Dataset Accuracy: {new_accuracy:.4f}')

# Print some example predictions from the new dataset
for i in range(min(10, len(new_data))):
    print(f'Actual: {new_data["Label"].values[i]}, Predicted: {new_predictions[i][0]}')

[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 18ms/step
New Dataset Accuracy: 0.6666
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 1
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 1
Actual: 0, Predicted: 0


In [None]:
# faltuuuu dekh mat 

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

In [17]:
A = cv.fit_transform(data['Transformed-Text']).toarray()

KeyError: 'Transformed-Text'

In [None]:
# bol bhi reha hu mat dekh fir bhi...................................!!!!!!!!!!!!!!!!!!!!!!!!

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score

In [14]:
A_train,A_test,b_train,b_test = train_test_split(A,b,test_size=0.2,random_state=2)

NameError: name 'A' is not defined

In [15]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

In [11]:
mnb = MultinomialNB()

In [12]:
mnb.fit(A_train,b_train)
b_pred2 = mnb.predict(A_test)
print(accuracy_score(b_test,b_pred2))
print(confusion_matrix(b_test,b_pred2))
print(precision_score(b_test,b_pred2))

NameError: name 'A_train' is not defined