# Importing libraries

In [1]:
import pandas as pd

In [3]:
# pandas reads CSV files as UTF-8 by default, which cannot decode this file
# (see the output below), so try a short list of candidate encodings and keep
# the first one that works.
encodings = ['utf-8', 'ISO-8859-1', 'cp1252']
for encoding in encodings:
    try:
        data = pd.read_csv("C:\\Users\\rohit\\OneDrive\\Desktop\\Codsoft 2\\Spam detection\\spam.csv", encoding=encoding)
    except UnicodeDecodeError:
        print("Failed to read the file with encoding:", encoding)
    else:
        # The success path lives outside the try body so only the read itself is guarded.
        print("File read successfully with encoding:", encoding)
        break
else:
    # No candidate worked: stop here rather than leave `data` undefined (or stale
    # from an earlier run) for the cells that follow.
    raise ValueError(f"Could not decode spam.csv with any of: {encodings}")

Failed to read the file with encoding: utf-8
File read successfully with encoding: ISO-8859-1


# Data Processing

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

In [5]:
# Load the raw CSV using ISO-8859-1 and preview the first rows.
data = pd.read_csv(
    "C:\\Users\\rohit\\OneDrive\\Desktop\\Codsoft 2\\Spam detection\\spam.csv",
    encoding="ISO-8859-1",
)
print(data.head())

# Encode the text labels in column v1 as integers, stored in a new 'label' column.
label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['v1'])

# Hold out 20% of the messages for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data['v2'],
    data['label'],
    test_size=0.2,
    random_state=42,
)

# Fit the TF-IDF vectorizer on the training messages only, then apply it to the test messages.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  


The above code performs the following data processing steps:

1. Load the dataset using pandas.

2. Convert the categorical labels (ham/spam) to numerical format (0/1) using LabelEncoder.

3. Split the dataset into training and testing sets.

4. Perform TF-IDF vectorization on the text data using scikit-learn's TfidfVectorizer.

5. Fit the vectorizer on the training set only and reuse it to transform the testing set, producing the TF-IDF matrices for both (their shapes are not printed here).

In [6]:
# The dataset has some additional columns (Unnamed: 2, Unnamed: 3, Unnamed: 4) that don't contain any useful information.
# Rebind `data` instead of dropping in place, and ignore columns that are already gone,
# so re-running this cell does not raise a KeyError.
data = data.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [7]:
# Preview the first five rows of the frame after the column clean-up.
data.head(n=5)

Unnamed: 0,v1,v2,label
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [8]:
# v1 can be dropped as well: its labels are already stored in numeric form in "label".
# Rebinding with errors='ignore' keeps the cell safe to run more than once
# (an in-place drop raises a KeyError on the second run).
data = data.drop(columns='v1', errors='ignore')

# Model Selection

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Split the message text (v2) and the numeric labels into train and test sets:
# 20% is held out for testing, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data['v2'],
    data['label'],
    test_size=0.2,
    random_state=42,
)

### We start with Naive Bayes

In [None]:
# Vectorize the messages: fit TF-IDF on the training text, then reuse it for the test text.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', lowercase=True)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Train multinomial Naive Bayes on the TF-IDF features (fit returns the estimator itself),
# then predict labels for the held-out messages.
naive_bayes_classifier = MultinomialNB().fit(X_train_tfidf, y_train)
nb_predictions = naive_bayes_classifier.predict(X_test_tfidf)

In [10]:
#Evaluating the performance
nb_accuracy = accuracy_score(y_test, nb_predictions)
nb_precision = precision_score(y_test, nb_predictions)
nb_recall = recall_score(y_test, nb_predictions)
nb_f1 = f1_score(y_test, nb_predictions)

print("Naive Bayes Classifier Performance:")
print("Accuracy:", nb_accuracy)
print("Precision:", nb_precision)
print("Recall:", nb_recall)
print("F1-score:", nb_f1)

Naive Bayes Classifier Performance:
Accuracy: 0.9668161434977578
Precision: 1.0
Recall: 0.7533333333333333
F1-score: 0.8593155893536121


### Logistic Regression

In [11]:
# Fit logistic regression (iteration cap of 1000) on the TF-IDF training features;
# fit returns the estimator itself, so construction and training are chained.
logistic_regression_classifier = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# Predict labels for the held-out messages.
lr_predictions = logistic_regression_classifier.predict(X_test_tfidf)

In [12]:
#Evaluating its performance
lr_accuracy = accuracy_score(y_test, lr_predictions)
lr_precision = precision_score(y_test, lr_predictions)
lr_recall = recall_score(y_test, lr_predictions)
lr_f1 = f1_score(y_test, lr_predictions)

print("\nLogistic Regression Classifier Performance:")
print("Accuracy:", lr_accuracy)
print("Precision:", lr_precision)
print("Recall:", lr_recall)
print("F1-score:", lr_f1)


Logistic Regression Classifier Performance:
Accuracy: 0.9524663677130045
Precision: 0.970873786407767
Recall: 0.6666666666666666
F1-score: 0.7905138339920948


### SVM

In [14]:
# Train a support vector classifier with its default settings on the TF-IDF
# training features (fit returns the estimator itself).
svm_classifier = SVC().fit(X_train_tfidf, y_train)

# Predict labels for the held-out messages.
svm_predictions = svm_classifier.predict(X_test_tfidf)

In [15]:
#Evaluating its performance
svm_accuracy = accuracy_score(y_test, svm_predictions)
svm_precision = precision_score(y_test, svm_predictions)
svm_recall = recall_score(y_test, svm_predictions)
svm_f1 = f1_score(y_test, svm_predictions)

print("\nSupport Vector Machine Classifier Performance:")
print("Accuracy:", svm_accuracy)
print("Precision:", svm_precision)
print("Recall:", svm_recall)
print("F1-score:", svm_f1)


Support Vector Machine Classifier Performance:
Accuracy: 0.9766816143497757
Precision: 0.9920634920634921
Recall: 0.8333333333333334
F1-score: 0.9057971014492753


# Conclusion

Now we write the final code that compares the test-set accuracies of the three classifiers and prints which one scored highest.

In [17]:
# One row per model: the heading to print, its test accuracy, and its display name.
accuracy_recap = [
    ("Naive Bayes Classifier Performance:", nb_accuracy, "Naive Bayes"),
    ("\nLogistic Regression Classifier Performance:", lr_accuracy, "Logistic Regression"),
    ("\nSupport Vector Machine Classifier Performance:", svm_accuracy, "Support Vector Machine"),
]

# Recap each model's accuracy in the order the models were trained.
for heading, accuracy, model_name in accuracy_recap:
    print(heading)
    print("Accuracy:", accuracy)


# Tuples compare element by element, so max() ranks by accuracy first and only
# falls back to comparing the names when two accuracies are exactly equal.
best_classifier = max((score, name) for _heading, score, name in accuracy_recap)
best_accuracy, best_classifier_name = best_classifier

print("\nThe best performing classifier is:", best_classifier_name, "with accuracy:", best_accuracy)


Naive Bayes Classifier Performance:
Accuracy: 0.9668161434977578

Logistic Regression Classifier Performance:
Accuracy: 0.9524663677130045

Support Vector Machine Classifier Performance:
Accuracy: 0.9766816143497757

The best performing classifier is: Support Vector Machine with accuracy: 0.9766816143497757
