<a href="https://colab.research.google.com/github/RiyaKhushiRadha/CodSoft-Internship-Projects/blob/main/SMS_Spam_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import kagglehub

# Download the latest version of the SMS Spam Collection dataset via kagglehub.
# The returned value is the local location of the downloaded files.
path = kagglehub.dataset_download("uciml/sms-spam-collection-dataset")

# Show where the files were placed so they can be located afterwards.
print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/uciml/sms-spam-collection-dataset?dataset_version_number=1...


100%|██████████| 211k/211k [00:00<00:00, 57.8MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/uciml/sms-spam-collection-dataset/versions/1





In [None]:
import pandas as pd

import os

# Build the CSV location from `path` (the directory returned by
# kagglehub.dataset_download) instead of a hard-coded /kaggle/input/... path,
# which does not match the download location reported by the download cell.
# The encoding is kept as latin1, exactly as before.
df = pd.read_csv(os.path.join(path, 'spam.csv'), encoding='latin1')
print(df.head())

     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  


In [None]:
# List the column names to see which columns actually carry data.
print(df.columns)

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')


In [None]:
# Keep only the two useful columns. The explicit .copy() makes `df` an
# independent frame rather than a slice of the original, so the column
# assignments in later cells do not trigger pandas' SettingWithCopyWarning.
df = df[['v1', 'v2']].copy()
df.columns = ['label', 'text']  # v1 -> label (ham/spam), v2 -> message text

In [None]:
# Convert labels to numbers.
# Machine learning models work with numbers, not text labels: ham -> 0, spam -> 1.
# NOTE(review): Series.map yields NaN for any value missing from the dict, so
# re-running this cell on already-encoded labels would turn every label into NaN.

df['label'] = df['label'].map({'ham': 0, 'spam': 1})

In [None]:
#Clean the Text Data
#Lowercase everything and remove punctuation (stop words are removed later by CountVectorizer):

import string

def clean_text(text):
    """Return `text` lowercased with every ASCII punctuation character removed."""
    # Translation table mapping each character in string.punctuation to None,
    # which makes str.translate delete it.
    strip_punct = str.maketrans(dict.fromkeys(string.punctuation))
    return text.lower().translate(strip_punct)

# Clean every message, overwriting the text column with the cleaned version.
df['text'] = df['text'].apply(clean_text)

In [None]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets to evaluate the model's performance.
# 20% of the rows are held out for testing; random_state pins the shuffle so the
# split is the same on every run.

X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)

In [None]:
# Convert text into numbers using CountVectorizer:

from sklearn.feature_extraction.text import CountVectorizer

# English stop words are dropped by the vectorizer itself.
vectorizer = CountVectorizer(stop_words='english')
# The vectorizer is fitted on the training messages only ...
X_train_vec = vectorizer.fit_transform(X_train)
# ... and the test messages are encoded with that already-fitted vectorizer.
X_test_vec = vectorizer.transform(X_test)

In [None]:
# Train a simple and effective model (Multinomial Naive Bayes):

from sklearn.naive_bayes import MultinomialNB

# Default hyperparameters; fitted on the vectorized training split.
model = MultinomialNB()
model.fit(X_train_vec, y_train)

In [None]:
# Check how well the model works:

from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, r2_score
import numpy as np

# Predict labels for the held-out test vectors.
y_pred = model.predict(X_test_vec)
print("Accuracy:", accuracy_score(y_test, y_pred))
# zero_division=0 reports 0 (without a warning) when a metric's denominator is zero.
print(classification_report(y_test, y_pred, zero_division=0))
# NOTE(review): MSE/RMSE/R2 are regression metrics. With 0/1 labels and 0/1
# predictions every squared error is either 0 or 1, so MSE is simply the share
# of misclassified messages (1 - accuracy) and adds nothing to the report above.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print("RMSE:", rmse)
print("R2 Score:", r2_score(y_test, y_pred))

Accuracy: 0.9820627802690582
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.98      0.89      0.93       150

    accuracy                           0.98      1115
   macro avg       0.98      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115

RMSE: 0.133929906036485
R2 Score: 0.8459412780656305


In [None]:
from sklearn.linear_model import LogisticRegression

# Rebinds `model`: from here on it refers to a logistic-regression classifier
# (default settings) fitted on the same vectorized training data.
model = LogisticRegression()
model.fit(X_train_vec, y_train)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Score whichever classifier `model` currently refers to on the test vectors.
y_pred = model.predict(X_test_vec)

print("Accuracy:", accuracy_score(y_test, y_pred))
# Per-class precision / recall / F1 for labels 0 and 1.
print(classification_report(y_test, y_pred))

Accuracy: 0.9748878923766816
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       965
           1       1.00      0.81      0.90       150

    accuracy                           0.97      1115
   macro avg       0.99      0.91      0.94      1115
weighted avg       0.98      0.97      0.97      1115



In [None]:
from sklearn.svm import LinearSVC

# Rebinds `model` again, this time to a linear support-vector classifier
# (default settings) fitted on the same vectorized training data.
model = LinearSVC()
model.fit(X_train_vec, y_train)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Score whichever classifier `model` currently refers to on the test vectors.
y_pred = model.predict(X_test_vec)

print("Accuracy:", accuracy_score(y_test, y_pred))
# Per-class precision / recall / F1 for labels 0 and 1.
print(classification_report(y_test, y_pred))

Accuracy: 0.979372197309417
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.99      0.85      0.92       150

    accuracy                           0.98      1115
   macro avg       0.98      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [None]:
from sklearn.preprocessing import LabelEncoder

# Re-create and fit the encoder.
# NOTE(review): df['label'] was already mapped to 0/1 in an earlier cell, so the
# encoder is fitted on integers here, not on the 'ham'/'spam' strings.
# The following cell re-creates `encoder` and `y`, so the objects bound here
# are overwritten.
encoder = LabelEncoder()
y = encoder.fit_transform(df['label'])

In [None]:
from sklearn.preprocessing import LabelEncoder

# df['label'] already holds 0/1 (mapped from ham/spam in an earlier cell), so
# fitting the encoder on it would make its classes the integers 0 and 1, and
# inverse_transform would hand back numbers instead of class names.
# Fit on the class names instead: LabelEncoder stores its classes in sorted
# order, giving ham -> 0 and spam -> 1, which matches the earlier mapping.
encoder = LabelEncoder()
encoder.fit(['ham', 'spam'])
y = df['label'].to_numpy()  # numeric targets, already in the encoder's 0/1 coding

In [None]:
def predict_message(msg):
    """Classify a single SMS message with the classifier currently bound to `model`.

    Args:
        msg: Raw message text (str).

    Returns:
        The first element of `encoder.inverse_transform` applied to the
        model's prediction for this message.
    """
    # Reuse the training-time cleaner so inference preprocessing cannot drift
    # from the preprocessing the vectorizer was fitted on.
    msg_clean = clean_text(msg)

    # The vectorizer works on a collection of documents, hence the one-item list.
    msg_vec = vectorizer.transform([msg_clean])

    prediction = model.predict(msg_vec)

    # Decode the numeric prediction through the fitted encoder.
    return encoder.inverse_transform(prediction)[0]

In [None]:
# Try the classifier on one spam-like and one ordinary message.
for sample_message in (
    "Congratulations! You've won a free cruise. Call now!",
    "Are we still meeting tomorrow?",
):
    print(predict_message(sample_message))

1
0
