<a href="https://colab.research.google.com/github/Tirthankar4/Encryptix/blob/main/Encryptix_task_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested from the HTTP response per read while downloading.
CHUNK_SIZE = 40960

# Comma-separated "<directory>:<url-encoded download URL>" entries (one entry here).
# The URL is a signed, time-limited link (X-Goog-Date=20240726T113004Z,
# X-Goog-Expires=259200 seconds), so it has presumably expired by now;
# re-export the notebook from Kaggle to obtain a fresh one.
DATA_SOURCE_MAPPING = 'sms-spam-collection-dataset:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F483%2F982%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240726%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240726T113004Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D0a91077a4e1750fd52e3078cccaa006a33e62efa6700bd57a8dee907301cee85f95d4b2bea785e4109724e60195e42ffa5cd4ec6018b650c302013ea59c71a07bd24ff7842d83b493924bac72e515681874daceb19b2252ba8aab8621f5ac0bbadbbc3a847cb517f32a2031a87cd22ce8c90e616a94efb6f56aba1a3066fda5fd84b204fb56ee54211d3c18dd0364849b3b7251a363c9fb78a913425a81e16aad5ec8c3fbcbcd0fe61fdfb6bd721d6f9871a78a5543293eed6e699b7ae180814fb109b4db030757d2c0759179dc391fbf5173ec74ec2177ec9db17275bba2569b0726f64928561bf3f7e41277ac49b825568cb9115f69a259f705a68c3ee3602'

# Directories this notebook creates to mirror Kaggle's layout.
KAGGLE_INPUT_PATH = '/kaggle/input'
KAGGLE_WORKING_PATH = '/kaggle/working'
# NOTE(review): not referenced by any of the visible cells.
KAGGLE_SYMLINK = 'kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every configured data source into KAGGLE_INPUT_PATH/<directory> and
# unpack it there. A source that fails to download is reported and skipped.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Each entry is "<directory>:<url-encoded URL>"; split on the first colon
    # only so a stray ':' in the URL part cannot break the unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # Content-Length is optional; without it (or with 0) the progress
            # bar simply stays empty instead of raising on int(None) or
            # dividing by zero.
            content_length = fileres.headers['content-length']
            total_length = int(content_length) if content_length else 0
            print(f'Downloading {directory}, {content_length or "unknown"} bytes compressed')
            dl = 0
            # read() returns b'' at end of stream, which ends the iteration.
            for data in iter(lambda: fileres.read(CHUNK_SIZE), b''):
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_length)) if total_length else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
            # The tar branch re-opens the temp file by name, so buffered bytes
            # must reach the disk first or the archive would look truncated.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name; reusing `tarfile` here
                # would overwrite the imported module for the rest of the run.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Load libraries

In [None]:
import nltk
import numpy as np
import optuna
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import ComplementNB
from sklearn.svm import SVC

Dataset loading and initial preprocessing

In [None]:
# Read the SMS spam CSV from the Kaggle input directory, decoding it as latin-1.
X = pd.read_csv(
    "/kaggle/input/sms-spam-collection-dataset/spam.csv",
    encoding='latin-1',
)

In [None]:
# Separate the label column; pop() returns 'v1' and removes it from X.
# NOTE: because 'v1' is removed here, re-running this cell without first
# re-running the loading cell raises KeyError.
y = X.pop('v1')

# Discard the three unnamed columns in place.
X.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], inplace=True)

download nltk corpus stopwords

In [None]:
# Fetch the NLTK stop-word corpus (no-op when it is already up to date).
nltk.download('stopwords')

# Read the corpus through `nltk.corpus` rather than through the imported name
# `stopwords`: this assignment rebinds `stopwords` to a plain list, so
# `stopwords.words(...)` would raise AttributeError the second time the cell
# is run. The name `stopwords` is kept because later cells filter against it.
stopwords = nltk.corpus.stopwords.words('english')

# Tokenizer that keeps runs of word characters and drops everything else.
pattern = r'\w+'
regexp = RegexpTokenizer(pattern)

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


text processing for TF-IDF

In [None]:
# NLTK's English stop-word list is lowercase; a set makes each lookup O(1)
# instead of a scan over the whole list.
stopword_set = set(stopwords)


def _clean_message(text):
    """Normalize one SMS message before TF-IDF vectorization.

    The message is reduced to word-character tokens with ``regexp``,
    re-tokenized with ``word_tokenize``, stripped of English stop words,
    stripped of the character 'å', and finally lowercased.

    Stop words are compared case-insensitively. Comparing the raw token against
    the lowercase list let capitalized stop words such as "The" or "You"
    through, only for them to be lowercased afterwards.

    Args:
        text: raw message string.

    Returns:
        str: the cleaned, space-joined, lowercase message.
    """
    words = word_tokenize(' '.join(regexp.tokenize(text)))
    kept_words = [word for word in words if word.lower() not in stopword_set]
    # 'å' is presumably an artifact of decoding the file as latin-1 — TODO confirm.
    return ' '.join(kept_words).replace('å', '').lower()


# One pass over the column instead of five separate apply() calls.
X['v2'] = X['v2'].apply(_clean_message)

dividing dataset for training and evaluation

In [None]:
# Hold out 20% of the messages for evaluation; the fixed seed keeps the
# partition identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=69,
)

TF-IDF

In [None]:
# Learn the vocabulary and IDF weights from the training messages only, then
# project the test messages onto that same vocabulary.
vectorizer = TfidfVectorizer()
tfidf_train = vectorizer.fit_transform(X_train['v2'])
tfidf_test = vectorizer.transform(X_test['v2'])

# Both name arrays come from the same fitted vectorizer, so they list the same
# vocabulary.
train_names = vectorizer.get_feature_names_out()
test_names = vectorizer.get_feature_names_out()

# Dense, column-labelled copies of the two matrices. The models in this
# notebook are fitted on the matrices returned by the vectorizer, not on these
# frames.
train_tfidf = pd.DataFrame(tfidf_train.toarray(), columns=train_names)
test_tfidf = pd.DataFrame(tfidf_test.toarray(), columns=test_names)

Baseline Logistic Regression to compare performance

In [None]:
# Baseline: logistic regression with default settings on the TF-IDF features.
# fit() returns the estimator itself, so construction and training can be chained.
lr_model = LogisticRegression().fit(tfidf_train, y_train)
lr_preds = lr_model.predict(tfidf_test)
print(
    "Classification Report of Logistic Regression model:",
    classification_report(y_test, lr_preds),
    sep="\n",
)

Classification Report of Logistic Regression model:
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98       953
        spam       0.99      0.75      0.85       162

    accuracy                           0.96      1115
   macro avg       0.98      0.87      0.92      1115
weighted avg       0.96      0.96      0.96      1115



Baseline Naive Bayes model for comparing performance

In [None]:
# Baseline: Complement Naive Bayes with default settings on the TF-IDF features.
nb_model = ComplementNB().fit(tfidf_train, y_train)
nb_preds = nb_model.predict(tfidf_test)

nb_report = classification_report(y_test, nb_preds)
print("Classification Report of Complement Naive Bayes model:")
print(nb_report)

Classification Report of Complement Naive Bayes model:
              precision    recall  f1-score   support

         ham       0.99      0.98      0.98       953
        spam       0.89      0.91      0.90       162

    accuracy                           0.97      1115
   macro avg       0.94      0.95      0.94      1115
weighted avg       0.97      0.97      0.97      1115



Hyperparameter tuning of SVM model

In [None]:
def objective(trial):
    """Optuna objective: mean cross-validated macro-F1 of an SVC.

    Each candidate is scored with 5-fold cross-validation on the training
    split only. The test split is deliberately left untouched: choosing
    hyperparameters by their test score leaks test information into model
    selection and makes the final test report optimistic.

    Args:
        trial: the Optuna trial used to sample ``C``, ``gamma`` and ``tol``.

    Returns:
        float: macro-averaged F1 averaged over the folds (higher is better).
    """
    param = {
        # SVC requires C to be strictly positive, so the range must not start at 0.
        'C': trial.suggest_float('C', 1e-3, 10.0),
        # gamma = 0 makes the default RBF kernel constant (exp(0) = 1 for every
        # pair of samples), so keep it strictly positive as well.
        'gamma': trial.suggest_float('gamma', 1e-4, 1.0),
        'tol': trial.suggest_float('tol', 1e-5, 1e-2, log = True),
        'random_state': 69
        }

    svm_model = SVC(**param)
    # With an integer cv and a classifier, scikit-learn uses stratified folds
    # without shuffling, so the score for a given parameter set is deterministic.
    fold_scores = cross_val_score(svm_model, tfidf_train, y_train, cv=5, scoring='f1_macro')
    return float(fold_scores.mean())

# Seed the sampler so the search proposes the same sequence of trials on every
# run; an unseeded study picks different hyperparameters each time the
# notebook is executed. TPE is Optuna's default sampler, so only the seed changes.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=69),
)
study.optimize(objective, n_trials=50)

[I 2024-07-26 11:28:21,241] A new study created in memory with name: no-name-629d9b4f-fc19-4c7e-964d-675561b69f39
[I 2024-07-26 11:28:23,322] Trial 0 finished with value: 0.957870764749526 and parameters: {'C': 3.002733674951925, 'gamma': 0.9070465343848927, 'tol': 0.00017324258807931945}. Best is trial 0 with value: 0.957870764749526.
[I 2024-07-26 11:28:24,823] Trial 1 finished with value: 0.957870764749526 and parameters: {'C': 8.027548133931322, 'gamma': 0.8322227887448127, 'tol': 0.0027384364914389307}. Best is trial 0 with value: 0.957870764749526.
[I 2024-07-26 11:28:26,331] Trial 2 finished with value: 0.957870764749526 and parameters: {'C': 4.978938426723346, 'gamma': 0.82630031735181, 'tol': 0.0012885087842155346}. Best is trial 0 with value: 0.957870764749526.
[I 2024-07-26 11:28:27,378] Trial 3 finished with value: 0.5096801827527127 and parameters: {'C': 1.037214103677423, 'gamma': 0.026722679211974354, 'tol': 0.00566544058458187}. Best is trial 0 with value: 0.95787076474

Optimal SVM model

In [None]:
# Refit an SVC with the best hyperparameters found by the study and report its
# performance on the held-out test split.
svm_params = study.best_params
svm_model = SVC(**svm_params).fit(tfidf_train, y_train)
svm_preds = svm_model.predict(tfidf_test)
print(
    "Classification Report of Support Vector Machine model:",
    classification_report(y_test, svm_preds),
    sep="\n",
)

Classification Report of Support Vector Machine model:
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       953
        spam       1.00      0.90      0.94       162

    accuracy                           0.98      1115
   macro avg       0.99      0.95      0.97      1115
weighted avg       0.99      0.98      0.98      1115

