In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/emailspam/spam.csv


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [3]:
# Load the dataset
df = pd.read_csv('/kaggle/input/emailspam/spam.csv', encoding='latin-1')

# Display the first few rows of the dataset
df.head()


Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
# Drop any irrelevant columns
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)

# Rename columns for clarity
df.columns = ['label', 'message']

# Convert labels to a binary format
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

# Check for any missing values
df.isnull().sum()


label      0
message    0
dtype: int64

In [5]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df['message'], df['label'], test_size=0.2, random_state=42)


In [6]:
# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer(stop_words='english')

# Transform the text data into TF-IDF features
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [7]:
# Initialize the model
model = MultinomialNB()

# Train the model
model.fit(X_train_tfidf, y_train)


In [8]:
# Predict the labels on the test set
y_pred = model.predict(X_test_tfidf)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.9668161434977578

Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.75      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115


Confusion Matrix:
 [[965   0]
 [ 37 113]]


In [9]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for SVM
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['linear', 'rbf']
}

# Initialize the SVM model
svm_model = SVC()

# Initialize GridSearchCV with cross-validation
grid = GridSearchCV(svm_model, param_grid, refit=True, verbose=2, cv=5)

# Fit the model
grid.fit(X_train_tfidf, y_train)

# Print the best parameters found by GridSearchCV
print("Best Parameters found by GridSearchCV:")
print(grid.best_params_)


Fitting 5 folds for each of 32 candidates, totalling 160 fits
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   0.6s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   0.6s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   0.6s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   0.6s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   0.6s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.9s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   1.1s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   1.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   1.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   1.0s
[CV] END ....................C=0.1, gamma=0.1, kernel=linear; total time=   0.5s
[CV] END ....................C=0.1, gamma=0.1, 

In [10]:
# Predict the labels on the test set using the best model
y_pred_svm = grid.best_estimator_.predict(X_test_tfidf)

# Evaluate the fine-tuned model
print("Accuracy of fine-tuned model:", accuracy_score(y_test, y_pred_svm))
print("\nClassification Report for fine-tuned model:\n", classification_report(y_test, y_pred_svm))
print("\nConfusion Matrix for fine-tuned model:\n", confusion_matrix(y_test, y_pred_svm))


Accuracy of fine-tuned model: 0.9820627802690582

Classification Report for fine-tuned model:
               precision    recall  f1-score   support

           0       0.98      0.99      0.99       965
           1       0.96      0.90      0.93       150

    accuracy                           0.98      1115
   macro avg       0.97      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115


Confusion Matrix for fine-tuned model:
 [[960   5]
 [ 15 135]]


In [11]:
# Compare Naive Bayes and fine-tuned SVM
print("Naive Bayes Accuracy:", accuracy_score(y_test, y_pred))
print("Fine-tuned SVM Accuracy:", accuracy_score(y_test, y_pred_svm))


Naive Bayes Accuracy: 0.9668161434977578
Fine-tuned SVM Accuracy: 0.9820627802690582
