In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [19]:
## Step 1: Install and Import Dependencies
!pip install --upgrade pip setuptools wheel
!pip install --no-cache-dir pandas numpy scikit-learn nltk seaborn

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab') # Download the punkt_tab resource for sentence tokenization




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [20]:

## Step 2: Load the Data from File
file_path = "/content/drive/MyDrive/Colab Notebooks/dataset_sms_fraud/spam.csv"  # Updated path to uploaded file
df = pd.read_csv(file_path, encoding='latin1')

# Select only relevant columns
df = df.iloc[:, :2]  # Keeping only first two columns
df.columns = ['label', 'message']  # Renaming columns

# Inspect the first few rows
df.head()


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [21]:

## Step 3: Data Preprocessing
nltk.download('punkt')  # Ensure punkt tokenizer is available
nltk.download('stopwords')  # Ensure stopwords are available

def preprocess_text(text):
    text = str(text).lower()  # Convert to lowercase
    tokens = word_tokenize(text)  # Tokenize words
    tokens = [word for word in tokens if word.isalpha()]  # Remove punctuation
    stop_words = set(stopwords.words('english'))  # Ensure stopwords are loaded properly
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    return ' '.join(tokens)

df['processed_text'] = df['message'].apply(preprocess_text)

# Display processed data
df.head()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,label,message,processed_text
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win fa cup final tkts may...
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah think goes usf lives around though


In [22]:

## Step 4: Feature Extraction
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['processed_text'])
y = df['label'].map({'ham': 0, 'spam': 1})  # Ensure correct mapping

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [23]:
## Step 5: Model Training
# Choose classifier: Naive Bayes, Logistic Regression, or SVM
models = {
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(),
    "SVM": SVC(kernel='linear')
}

for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"Accuracy ({name}):", accuracy_score(y_test, y_pred))
    print(f"Classification Report ({name}):\n", classification_report(y_test, y_pred))
    print(f"Confusion Matrix ({name}):\n", confusion_matrix(y_test, y_pred))

## St

Training Naive Bayes...
Accuracy (Naive Bayes): 0.9650224215246637
Classification Report (Naive Bayes):
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.74      0.85       150

    accuracy                           0.97      1115
   macro avg       0.98      0.87      0.92      1115
weighted avg       0.97      0.97      0.96      1115

Confusion Matrix (Naive Bayes):
 [[965   0]
 [ 39 111]]
Training Logistic Regression...
Accuracy (Logistic Regression): 0.9443946188340807
Classification Report (Logistic Regression):
               precision    recall  f1-score   support

           0       0.94      0.99      0.97       965
           1       0.95      0.62      0.75       150

    accuracy                           0.94      1115
   macro avg       0.95      0.81      0.86      1115
weighted avg       0.94      0.94      0.94      1115

Confusion Matrix (Logistic Regression):
 [[960   5]
 [ 5

In [31]:
## Step 6: Test with New SMS
sample_sms = ["Please find the meeting agenda attached. Let me know your thoughts."]
sample_sms_processed = vectorizer.transform(sample_sms)

for name, model in models.items():
    prediction = model.predict(sample_sms_processed)
    print(f"Prediction ({name}):", "Spam" if prediction[0] == 1 else "Ham")


Prediction (Naive Bayes): Ham
Prediction (Logistic Regression): Ham
Prediction (SVM): Ham
