In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [None]:
# Path of the raw CSV that is loaded into `df`.
DATA_PATH = '/content/spamraw.csv'
df = pd.read_csv(DATA_PATH)


In [None]:
# Preview the first rows to check the column layout (label in `type`, message in `text`).
df.head()

Unnamed: 0,type,text
0,0,Hope you are having a good week. Just checking in
1,0,K..give back my thanks.
2,0,Am also doing in cbe only. But have to pay.
3,1,"complimentary 4 STAR Ibiza Holiday or £10,000 ..."
4,1,okmail: Dear Dave this is your final notice to...


In [None]:
# Report column dtypes and non-null counts to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5559 entries, 0 to 5558
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   type    5559 non-null   int64 
 1   text    5559 non-null   object
dtypes: int64(1), object(1)
memory usage: 87.0+ KB


In [None]:
# Summary statistics of the numeric columns; the mean of a 0/1 `type` column is the share of label 1.
df.describe()

Unnamed: 0,type
count,5559.0
mean,0.134377
std,0.341087
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [None]:
# Encode the class labels as integers: ham -> 0, spam -> 1.
# The guard keeps this cell safe to re-run: once `type` is numeric, pushing it
# through the string keys again would turn every label into NaN.
LABEL_MAP = {'ham': 0, 'spam': 1}
if not pd.api.types.is_numeric_dtype(df['type']):
    encoded_type = df['type'].map(LABEL_MAP)
    # Fail loudly on labels outside the mapping instead of training on NaN targets.
    if encoded_type.isna().any():
        unknown_labels = sorted(df.loc[encoded_type.isna(), 'type'].astype(str).unique())
        raise ValueError(f"Unexpected label(s) in 'type' column: {unknown_labels}")
    df['type'] = encoded_type.astype(int)

In [None]:
# Features are the raw message texts; the target is the label column.
X, y = df['text'], df['type']

In [None]:
# Hold out 20% of the messages for evaluation; the fixed random_state keeps
# the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)


In [None]:
# TF-IDF vectorizer using scikit-learn's built-in English stop-word list.
tfidf = TfidfVectorizer(stop_words='english')

In [None]:
# The vectorizer is fitted on the training split only; the test split is
# transformed with that already-fitted vocabulary (evaluated left to right).
X_train_tfidf, X_test_tfidf = (
    tfidf.fit_transform(X_train),
    tfidf.transform(X_test),
)

# Naive Bayes

In [None]:
# Fit multinomial Naive Bayes on the TF-IDF features and score it on the held-out split.
nb_clf = MultinomialNB().fit(X_train_tfidf, y_train)
nb_preds = nb_clf.predict(X_test_tfidf)
nb_accuracy = accuracy_score(y_test, nb_preds)
print("Naive Bayes Accuracy:", nb_accuracy)


Naive Bayes Accuracy: 0.9757194244604317


# Logistic Regression

In [None]:
# Fit logistic regression (default settings) on the TF-IDF features and score it on the held-out split.
lr_clf = LogisticRegression().fit(X_train_tfidf, y_train)
lr_preds = lr_clf.predict(X_test_tfidf)
lr_accuracy = accuracy_score(y_test, lr_preds)
print("Logistic Regression Accuracy:", lr_accuracy)

Logistic Regression Accuracy: 0.9649280575539568


# Support Vector Machine

In [None]:
# Fit a support vector classifier (default settings) on the TF-IDF features and score it on the held-out split.
svm_clf = SVC().fit(X_train_tfidf, y_train)
svm_preds = svm_clf.predict(X_test_tfidf)
svm_accuracy = accuracy_score(y_test, svm_preds)
print("SVM Accuracy:", svm_accuracy)

SVM Accuracy: 0.9865107913669064


In [None]:
# Side-by-side accuracy of the three baseline models on the same test split.
predictions_by_model = {
    "Naive Bayes": nb_preds,
    "Logistic Regression": lr_preds,
    "Support Vector Machine": svm_preds,
}
print("Comparison of model accuracies:")
for model_label, model_preds in predictions_by_model.items():
    print(f"{model_label}: {accuracy_score(y_test, model_preds)}")

Comparison of model accuracies:
Naive Bayes: 0.9757194244604317
Logistic Regression: 0.9649280575539568
Support Vector Machine: 0.9865107913669064


In [None]:
from sklearn.model_selection import GridSearchCV

# 5-fold cross-validated search over the SVM regularisation strength and kernel,
# selecting the combination with the best mean accuracy on the training split.
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
}
grid_search = GridSearchCV(estimator=SVC(), param_grid=param_grid, scoring='accuracy', cv=5)
grid_search.fit(X_train_tfidf, y_train)

best_svm = grid_search.best_estimator_
print("Best Parameters for SVM:", grid_search.best_params_)

# NOTE: from here on `svm_preds` holds the tuned model's predictions,
# replacing those of the default SVC computed earlier.
svm_preds = best_svm.predict(X_test_tfidf)
print("Tuned SVM Accuracy:", accuracy_score(y_test, svm_preds))


Best Parameters for SVM: {'C': 10, 'kernel': 'linear'}
Tuned SVM Accuracy: 0.9901079136690647


In [None]:
from gensim.models import Word2Vec
import numpy as np

# Train Word2Vec on the training messages only (or load a pre-trained model).
# Fitting the embeddings on all of df['text'] would let held-out test messages
# shape the feature space, so the corpus is restricted to X_train.
# `seed` fixes the initialisation; gensim's documentation notes that fully
# deterministic training additionally requires workers=1.
sentences = [text.split() for text in X_train]
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4, seed=0)

# Generate averaged embeddings for each message
def average_word_vectors(text, model, vector_size):
    """Return the mean embedding of the whitespace-separated words in ``text``.

    Args:
        text: Message to embed; it is split on whitespace.
        model: Object exposing ``model.wv`` that supports ``word in model.wv``
            and ``model.wv[word]`` lookups.
        vector_size: Length of the zero vector returned as a fallback.

    Returns:
        The element-wise mean of the vectors of all words found in
        ``model.wv``, or ``np.zeros(vector_size)`` when none is found.
    """
    known_vectors = []
    for token in text.split():
        # Words missing from the embedding vocabulary are skipped.
        if token in model.wv:
            known_vectors.append(model.wv[token])

    if not known_vectors:
        return np.zeros(vector_size)
    return np.mean(known_vectors, axis=0)

# Embed every message as the mean of its word vectors. The dimensionality is
# read from the trained model so it cannot drift from the Word2Vec settings.
embedding_dim = word2vec_model.vector_size
X_train_w2v = np.array([average_word_vectors(text, word2vec_model, embedding_dim) for text in X_train])
X_test_w2v = np.array([average_word_vectors(text, word2vec_model, embedding_dim) for text in X_test])

# Train classifier on word2vec embeddings.
# A dedicated estimator is used so that `svm_clf`, which was fitted on the
# TF-IDF features, is not silently refitted on a different feature space.
svm_w2v_clf = SVC()
svm_w2v_clf.fit(X_train_w2v, y_train)
svm_preds_w2v = svm_w2v_clf.predict(X_test_w2v)
print("SVM Accuracy with Word2Vec embeddings:", accuracy_score(y_test, svm_preds_w2v))


SVM Accuracy with Word2Vec embeddings: 0.8803956834532374


In [None]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk

# Fetch the NLTK resources used by the preprocessing step: the stop-word
# list, the WordNet data for lemmatisation and the punkt tokenizer data.
for resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(resource)

# Shared helpers for preprocess_text: a lemmatizer and a set of English stop words.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def preprocess_text(text):
    """Normalise a message for text modelling.

    Steps, in order: strip runs of digits, lowercase, tokenize with
    ``word_tokenize``, drop tokens found in ``stop_words`` and lemmatize the
    remaining tokens.

    Args:
        text: Raw message string.

    Returns:
        The surviving lemmatized tokens joined by single spaces.
    """
    without_digits = re.sub(r'\d+', '', text)  # Remove numbers
    kept_tokens = []
    for token in word_tokenize(without_digits.lower()):  # Tokenize and lowercase
        if token in stop_words:
            continue
        kept_tokens.append(lemmatizer.lemmatize(token))
    return ' '.join(kept_tokens)

# Replace the raw messages in `df` with their preprocessed form.
# NOTE(review): X_train / X_test were split from df['text'] in an earlier cell,
# so this assignment does not change them; models fitted on those splits still
# see the raw text. The column is also overwritten in place, so the raw text is
# only recoverable by re-reading the CSV.
df['text'] = df['text'].apply(preprocess_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

# Per-class quality for spam (label 1 is treated as the positive class).
# In top-to-bottom order `svm_preds` was last assigned by the grid-search cell,
# so these figures describe the tuned SVM.
svm_metric_reports = (
    ("SVM F1-Score:", f1_score),
    ("SVM Precision:", precision_score),
    ("SVM Recall:", recall_score),
)
for metric_label, metric_fn in svm_metric_reports:
    print(metric_label, metric_fn(y_test, svm_preds, pos_label=1))


SVM F1-Score: 0.9571984435797666
SVM Precision: 0.9919354838709677
SVM Recall: 0.924812030075188


In [None]:
from sklearn.ensemble import VotingClassifier

# Combine the three classifiers by hard voting on their predicted labels.
base_estimators = [('nb', nb_clf), ('lr', lr_clf), ('svm', svm_clf)]
ensemble_clf = VotingClassifier(estimators=base_estimators, voting='hard')
ensemble_clf.fit(X_train_tfidf, y_train)

ensemble_preds = ensemble_clf.predict(X_test_tfidf)
ensemble_accuracy = accuracy_score(y_test, ensemble_preds)
print("Ensemble Model Accuracy:", ensemble_accuracy)


Ensemble Model Accuracy: 0.9829136690647482


In [None]:
# SVM variant with class_weight='balanced' to compensate for the ham/spam imbalance.
# NOTE(review): this instance is never fitted or evaluated here, and `svm_clf`
# is reassigned to a plain SVC() in the export cell below, so this cell
# currently has no effect on any result.
svm_clf = SVC(class_weight='balanced')


In [None]:
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC

# Fit a fresh vectorizer/classifier pair for export; requires X_train and
# y_train from the train/test split.
# The export features get their own name: this vectorizer uses max_df=0.7 and
# therefore has a different vocabulary from the one that produced X_test_tfidf,
# so reusing the name X_train_tfidf would leave mismatched train/test matrices.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
X_train_tfidf_export = tfidf_vectorizer.fit_transform(X_train)
svm_clf = SVC()
svm_clf.fit(X_train_tfidf_export, y_train)

# Save the model and vectorizer to disk. Both files are needed at inference
# time: new text must go through this exact vectorizer before the classifier.
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf_vectorizer, f)

with open('spam_classifier_model.pkl', 'wb') as f:
    pickle.dump(svm_clf, f)
