# Importing libraries

In [19]:
import re
import csv
import pandas
import sklearn
import numpy as np
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score, train_test_split, learning_curve
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer



# Loading the preprocessed train, validation, and test data

In [57]:
def _read_split(csv_path):
    """Read one preprocessed split and return (frame, labels, text).

    The frame is the full table as read by pandas; labels and text are its
    'spam' and 'text' columns respectively.
    """
    frame = pandas.read_csv(csv_path)
    return frame, frame['spam'], frame['text']


# Train split
X_train, y_train, X_train_text = _read_split("train.csv")

# Validation split
X_validation, y_validation, X_validation_text = _read_split("validation.csv")

# Test split
X_test, y_test, X_test_text = _read_split("test.csv")

# Data vectorization

In [21]:
# Upper bound on the vocabulary size kept by the vectorizer; adjust as needed.
MAX_TFIDF_FEATURES = 50000

tfidf_vectorizer = TfidfVectorizer(max_features=MAX_TFIDF_FEATURES)

# The vocabulary and IDF weights are learned from the training text only ...
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_text)

# ... and then applied unchanged to the validation and test text.
X_validation_tfidf, X_test_tfidf = (
    tfidf_vectorizer.transform(split_text)
    for split_text in (X_validation_text, X_test_text)
)

# Training Logistic Regression

In [22]:
# Fit a default-configured logistic regression on the TF-IDF training matrix.
# fit() returns the estimator itself, so construction and fitting are chained.
logistic_model = LogisticRegression().fit(X_train_tfidf, y_train)
logistic_model

# Scoring and evaluating Logistic Regression on train data

In [23]:
# Evaluate the fitted logistic regression on the training split.
# Predict once and reuse the labels for all three metrics instead of
# re-running inference for every print.
logistic_train_pred = logistic_model.predict(X_train_tfidf)

# Accuracy score
print("Logistic Regression Train Accuracy:", accuracy_score(y_train, logistic_train_pred))

# Classification report
print("Logistic Regression Train Classification Report:")
print(classification_report(y_train, logistic_train_pred))

# Confusion matrix
print("Logistic Regression Train Confusion Matrix:")
print(confusion_matrix(y_train, logistic_train_pred))

Logistic Regression Train Accuracy: 0.9951986032300305
Logistic Regression Train Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      3484
           1       1.00      0.98      0.99      1098

    accuracy                           1.00      4582
   macro avg       1.00      0.99      0.99      4582
weighted avg       1.00      1.00      1.00      4582

Logistic Regression Train Confusion Matrix:
[[3481    3]
 [  19 1079]]


# Scoring and evaluating Logistic Regression on validation data

In [24]:
# Evaluate the fitted logistic regression on the validation split.
# Predict once and reuse the labels for all three metrics instead of
# re-running inference for every print.
logistic_validation_pred = logistic_model.predict(X_validation_tfidf)

# Accuracy score
print("Logistic Regression Validation Accuracy:", accuracy_score(y_validation, logistic_validation_pred))

# Classification report
print("Logistic Regression Validation Classification Report:")
print(classification_report(y_validation, logistic_validation_pred))

# Confusion matrix
print("Logistic Regression Validation Confusion Matrix:")
print(confusion_matrix(y_validation, logistic_validation_pred))

Logistic Regression Validation Accuracy: 0.9860302677532014
Logistic Regression Validation Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       662
           1       1.00      0.94      0.97       197

    accuracy                           0.99       859
   macro avg       0.99      0.97      0.98       859
weighted avg       0.99      0.99      0.99       859

Logistic Regression Validation Confusion Matrix:
[[662   0]
 [ 12 185]]


# Scoring and evaluating Logistic Regression on test data

In [25]:
# Evaluate the fitted logistic regression on the test split.
# Predict once and reuse the labels for all three metrics instead of
# re-running inference for every print.
logistic_test_pred = logistic_model.predict(X_test_tfidf)

# Accuracy score
print("Logistic Regression Test Accuracy:", accuracy_score(y_test, logistic_test_pred))

# Classification Report
print("Logistic Regression Test Classification Report:")
print(classification_report(y_test, logistic_test_pred))

# Confusion matrix
print("Logistic Regression Test Confusion Matrix:")
print(confusion_matrix(y_test, logistic_test_pred))

Logistic Regression Test Accuracy: 0.9790940766550522
Logistic Regression Test Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       214
           1       1.00      0.92      0.96        73

    accuracy                           0.98       287
   macro avg       0.99      0.96      0.97       287
weighted avg       0.98      0.98      0.98       287

Logistic Regression Test Confusion Matrix:
[[214   0]
 [  6  67]]


# Training Random Forest Classifier

In [26]:
# Random forests draw bootstrap samples and feature subsets at random, so the
# estimator is seeded to make the fitted model (and every score reported for
# it) reproducible across kernel restarts.
random_forest_model = RandomForestClassifier(random_state=42)
random_forest_model.fit(X_train_tfidf, y_train)

# Scoring and evaluating Random Forest Classifier on train data

In [29]:
# Evaluate the fitted random forest on the training split.
# Predict once and reuse the labels for all three metrics instead of
# re-running inference for every print.
random_forest_train_pred = random_forest_model.predict(X_train_tfidf)

# Accuracy score
print("Random Forest Train Accuracy:", accuracy_score(y_train, random_forest_train_pred))

# Classification report
print("Random Forest Train Classification Report:")
print(classification_report(y_train, random_forest_train_pred))

# Confusion matrix
print("Random Forest Train Confusion Matrix:")
print(confusion_matrix(y_train, random_forest_train_pred))

Random Forest Train Accuracy: 1.0
Random Forest Train Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3484
           1       1.00      1.00      1.00      1098

    accuracy                           1.00      4582
   macro avg       1.00      1.00      1.00      4582
weighted avg       1.00      1.00      1.00      4582

Random Forest Train Confusion Matrix:
[[3484    0]
 [   0 1098]]


# Scoring and evaluating Random Forest Classifier on validation data

In [28]:
# Evaluate the fitted random forest on the validation split.
# Predict once and reuse the labels for all three metrics instead of
# re-running inference for every print.
random_forest_validation_pred = random_forest_model.predict(X_validation_tfidf)

# Accuracy score
print("Random Forest Validation Accuracy:", accuracy_score(y_validation, random_forest_validation_pred))

# Classification report
print("Random Forest Validation Classification Report:")
print(classification_report(y_validation, random_forest_validation_pred))

# Confusion matrix
print("Random Forest Validation Confusion Matrix:")
print(confusion_matrix(y_validation, random_forest_validation_pred))

Random Forest Validation Accuracy: 0.9860302677532014
Random Forest Validation Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       662
           1       0.99      0.94      0.97       197

    accuracy                           0.99       859
   macro avg       0.99      0.97      0.98       859
weighted avg       0.99      0.99      0.99       859

Random Forest Validation Confusion Matrix:
[[661   1]
 [ 11 186]]


# Scoring and evaluating Random Forest Classifier on test data

In [30]:
# Evaluate the fitted random forest on the test split.
# Predict once and reuse the labels for all three metrics instead of
# re-running inference for every print.
random_forest_test_pred = random_forest_model.predict(X_test_tfidf)

# Accuracy score
print("Random Forest Test Accuracy:", accuracy_score(y_test, random_forest_test_pred))

# Classification report
print("Random Forest Test Classification Report:")
print(classification_report(y_test, random_forest_test_pred))

# Confusion matrix
print("Random Forest Test Confusion Matrix:")
print(confusion_matrix(y_test, random_forest_test_pred))

Random Forest Test Accuracy: 0.9721254355400697
Random Forest Test Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       214
           1       1.00      0.89      0.94        73

    accuracy                           0.97       287
   macro avg       0.98      0.95      0.96       287
weighted avg       0.97      0.97      0.97       287

Random Forest Test Confusion Matrix:
[[214   0]
 [  8  65]]


# Training SVC

In [31]:
# Fit a default-configured support vector classifier on the TF-IDF training
# matrix. fit() returns the estimator itself, so the two steps are chained.
svm_model = SVC().fit(X_train_tfidf, y_train)
svm_model

# Scoring and evaluating SVC on train data

In [32]:
# Evaluate the fitted SVC on the training split.
# Predict once and reuse the labels for all three metrics instead of
# re-running inference for every print.
svm_train_pred = svm_model.predict(X_train_tfidf)

# Accuracy score
print("SVC Train Accuracy:", accuracy_score(y_train, svm_train_pred))

# Classification report
print("SVC Train Classification Report:")
print(classification_report(y_train, svm_train_pred))

# Confusion matrix
print("SVC Train Confusion Matrix:")
print(confusion_matrix(y_train, svm_train_pred))

SVC Train Accuracy: 1.0
SVC Train Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3484
           1       1.00      1.00      1.00      1098

    accuracy                           1.00      4582
   macro avg       1.00      1.00      1.00      4582
weighted avg       1.00      1.00      1.00      4582

SVC Train Confusion Matrix:
[[3484    0]
 [   0 1098]]


# Scoring and evaluating SVC on validation data

In [33]:
# Evaluate the fitted SVC on the validation split.
# Predict once and reuse the labels for all three metrics instead of
# re-running inference for every print.
svm_validation_pred = svm_model.predict(X_validation_tfidf)

# Accuracy score
print("SVC Validation Accuracy:", accuracy_score(y_validation, svm_validation_pred))

# Classification report
print("SVC Validation Classification Report:")
print(classification_report(y_validation, svm_validation_pred))

# Confusion matrix
print("SVC Validation Confusion Matrix:")
print(confusion_matrix(y_validation, svm_validation_pred))

SVC Validation Accuracy: 0.9953434225844005
SVC Validation Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       662
           1       1.00      0.98      0.99       197

    accuracy                           1.00       859
   macro avg       1.00      0.99      0.99       859
weighted avg       1.00      1.00      1.00       859

SVC Validation Confusion Matrix:
[[662   0]
 [  4 193]]


# Scoring and evaluating SVC on test data

In [34]:
# Evaluate the fitted SVC on the test split.
# Predict once and reuse the labels for all three metrics instead of
# re-running inference for every print.
svm_test_pred = svm_model.predict(X_test_tfidf)

# Accuracy score
print("SVC Test Accuracy:", accuracy_score(y_test, svm_test_pred))

# Classification report
print("SVC Test Classification Report:")
print(classification_report(y_test, svm_test_pred))

# Confusion matrix
print("SVC Test Confusion Matrix:")
print(confusion_matrix(y_test, svm_test_pred))

SVC Test Accuracy: 0.9930313588850174
SVC Test Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       214
           1       1.00      0.97      0.99        73

    accuracy                           0.99       287
   macro avg       1.00      0.99      0.99       287
weighted avg       0.99      0.99      0.99       287

SVC Test Confusion Matrix:
[[214   0]
 [  2  71]]


# Choosing the best model

In [36]:
# Model selection is done on the validation split. Picking the winner by test
# accuracy would leak the test set into the selection and make the reported
# test score optimistically biased; the test split is only used to report the
# accuracy of whichever model the validation split chose.
validation_accuracies = {
    'Logistic Regression': accuracy_score(y_validation, logistic_model.predict(X_validation_tfidf)),
    'Random Forest': accuracy_score(y_validation, random_forest_model.predict(X_validation_tfidf)),
    'SVC': accuracy_score(y_validation, svm_model.predict(X_validation_tfidf)),
}

# Test accuracies are still computed for every model so they can be reported.
test_accuracies = {
    'Logistic Regression': accuracy_score(y_test, logistic_model.predict(X_test_tfidf)),
    'Random Forest': accuracy_score(y_test, random_forest_model.predict(X_test_tfidf)),
    'SVC': accuracy_score(y_test, svm_model.predict(X_test_tfidf)),
}

# Ties resolve to the first model in insertion order.
best_model_name = max(validation_accuracies, key=validation_accuracies.get)

# Same (name, test accuracy) tuple shape as before; only the selection
# criterion has changed.
best_model_test = (best_model_name, test_accuracies[best_model_name])

print(
    f"The best model on the validation data is: {best_model_test[0]} "
    f"with Validation Accuracy: {validation_accuracies[best_model_name]} "
    f"and Test Accuracy: {best_model_test[1]}"
)

The best model on the test data is: SVC with Test Accuracy: 0.9930313588850174


# Fine-tuning Logistic Regression

In [48]:
# Candidate values of C to search over
logistic_param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}

# 5-fold cross-validated grid search over the candidates, on all cores
logistic_grid_search = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=logistic_param_grid,
    cv=5,
    n_jobs=-1,
)
logistic_grid_search.fit(X_train_tfidf, y_train)

# Estimator fitted with the winning setting
best_logistic_model = logistic_grid_search.best_estimator_

print("Best hyperparameters for Logistic Regression:", logistic_grid_search.best_params_)

# Accuracy of the tuned model on each split
tuned_logistic_train_accuracy = accuracy_score(y_train, best_logistic_model.predict(X_train_tfidf))
tuned_logistic_validation_accuracy = accuracy_score(y_validation, best_logistic_model.predict(X_validation_tfidf))
tuned_logistic_test_accuracy = accuracy_score(y_test, best_logistic_model.predict(X_test_tfidf))

print("Logistic Regression Train Accuracy after Fine-tuning:", tuned_logistic_train_accuracy)
print("Logistic Regression Validation Accuracy after Fine-tuning:", tuned_logistic_validation_accuracy)
print("Logistic Regression Test Accuracy after Fine-tuning:", tuned_logistic_test_accuracy)


Best hyperparameters for Logistic Regression: {'C': 10}
Logistic Regression Train Accuracy after Fine-tuning: 1.0
Logistic Regression Validation Accuracy after Fine-tuning: 0.9965075669383003
Logistic Regression Test Accuracy after Fine-tuning: 0.9965156794425087


# Fine-tuning Random Forest Classifier

In [40]:
# Define the parameter grid for hyperparameter tuning
random_forest_param_grid = {'n_estimators': [50, 100, 200],
                            'max_depth': [None, 10, 20, 30]}

# Initialize GridSearchCV.
# The forest is seeded so that every candidate is compared under the same
# randomness and the selected hyperparameters and scores are reproducible
# across runs; an unseeded forest can flip the winner between near-tied
# candidates from one run to the next.
random_forest_grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    random_forest_param_grid,
    cv=5,
    n_jobs=-1,
)

# Perform grid search cross-validation
random_forest_grid_search.fit(X_train_tfidf, y_train)

# Get the best model
best_random_forest_model = random_forest_grid_search.best_estimator_

# Print the best hyperparameters found
print("Best hyperparameters for Random Forest:", random_forest_grid_search.best_params_)

# Score on train, validation, and test data using the best model
print("Random Forest Train Accuracy after Fine-tuning:", accuracy_score(y_train, best_random_forest_model.predict(X_train_tfidf)))
print("Random Forest Validation Accuracy after Fine-tuning:", accuracy_score(y_validation, best_random_forest_model.predict(X_validation_tfidf)))
print("Random Forest Test Accuracy after Fine-tuning:", accuracy_score(y_test, best_random_forest_model.predict(X_test_tfidf)))


Best hyperparameters for Random Forest: {'max_depth': None, 'n_estimators': 200}
Random Forest Train Accuracy after Fine-tuning: 1.0
Random Forest Validation Accuracy after Fine-tuning: 0.9837019790454016
Random Forest Test Accuracy after Fine-tuning: 0.9721254355400697


# Fine-tuning SVC

In [47]:
# Candidate C values and kernels to search over
svm_param_grid = {
    'C': [0.1, 0.5],
    'kernel': ['linear', 'rbf', 'poly'],
}

# 3-fold cross-validated grid search over the candidates, on all cores
svm_grid_search = GridSearchCV(
    estimator=SVC(),
    param_grid=svm_param_grid,
    cv=3,
    n_jobs=-1,
)
svm_grid_search.fit(X_train_tfidf, y_train)

# Estimator fitted with the winning setting
best_svm_model = svm_grid_search.best_estimator_

print("Best hyperparameters for SVC:", svm_grid_search.best_params_)

# Accuracy of the tuned model on each split
tuned_svm_train_accuracy = accuracy_score(y_train, best_svm_model.predict(X_train_tfidf))
tuned_svm_validation_accuracy = accuracy_score(y_validation, best_svm_model.predict(X_validation_tfidf))
tuned_svm_test_accuracy = accuracy_score(y_test, best_svm_model.predict(X_test_tfidf))

print("SVC Train Accuracy after Fine-tuning:", tuned_svm_train_accuracy)
print("SVC Validation Accuracy after Fine-tuning:", tuned_svm_validation_accuracy)
print("SVC Test Accuracy after Fine-tuning:", tuned_svm_test_accuracy)


Best hyperparameters for SVC: {'C': 0.5, 'kernel': 'linear'}
SVC Train Accuracy after Fine-tuning: 0.9997817546922741
SVC Validation Accuracy after Fine-tuning: 0.9976717112922002
SVC Test Accuracy after Fine-tuning: 0.9965156794425087
