<a href="https://colab.research.google.com/github/SanthoshPollai/CODSOFT/blob/main/CodeSoft_Task_4_SpamSmsDetection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Name : Pollai Santhosh**

# **Code Soft Task 4 :**

Build an AI model that can classify SMS messages as spam or
legitimate. Use techniques like TF-IDF or word embeddings with
classifiers like Naive Bayes, Logistic Regression, or Support Vector Machines to identify spam messages.

In [28]:
# Importing libraries and dataset

# imports
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

In [29]:
df = pd.read_csv('/content/spam.csv', encoding='latin-1')

# **Preprocessing**

In [30]:
# Removing unnecessary columns
df = df[['v1', 'v2']]

# Renameing columns
df.columns = ['Label', 'Message']

print(df.head())  # Displaying the cleaned dataset with new indexes

  Label                                            Message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [31]:
# now we will clean the messages (the "message" column)to imporve the efficiency,
# using tokenization, removing stopwords and punctuation marks
# and performing TF-IDF (vectorizating) in order to fimd the importance of each word in the corpus

In [32]:
# Check for missing values
print(df.isnull().sum()) # no null values

# Check the class distribution
print(df['Label'].value_counts()) # spam vs. ham

Label      0
Message    0
dtype: int64
Label
ham     4825
spam     747
Name: count, dtype: int64


In [33]:
print(df.columns)

Index(['Label', 'Message'], dtype='object')


In [34]:
print(df['Label'].unique())

['ham' 'spam']


In [35]:
print(df.head())

  Label                                            Message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [36]:
df['Label'].head()

Unnamed: 0,Label
0,ham
1,ham
2,spam
3,ham
4,ham


In [37]:
# we'll convert "labels" into binary numbers (spam -> 1, ham -> 0)
df['Label'] = df['Label'].apply(lambda x: 1 if x == 'spam' else 0)

In [38]:
df['Label'].head()

Unnamed: 0,Label
0,0
1,0
2,1
3,0
4,0


In [39]:
df['Message'].head()

Unnamed: 0,Message
0,"Go until jurong point, crazy.. Available only ..."
1,Ok lar... Joking wif u oni...
2,Free entry in 2 a wkly comp to win FA Cup fina...
3,U dun say so early hor... U c already then say...
4,"Nah I don't think he goes to usf, he lives aro..."


In [40]:
# description's Text cleaning:

import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [41]:
# function definition for removing stop words, punctuations and lemmatization:

def preprocess_text(text):
    """
    Methadology:
    1. Removing punctuation
    2. Converting to lowercase
    3. Removing stop words
    4. Applying lemmatization

    Args:
        text (str): Input text to preprocess.

    Returns:
        str: Preprocessed text.
    """
    # Initialize lemmatizer and stop words
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Convert to lowercase and split into words
    words = text.lower().split()

    # Remove stop words and lemmatize
    processed_words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]

    # Join words back into a single string
    return ' '.join(processed_words)

# function call
df['cleaned_message'] = df['Message'].apply(preprocess_text)  ## Apply preprocess_text to each row in the 'Message' column

print(df[['Message', 'cleaned_message']].head())
# message with a small 'm'

                                             Message  \
0  Go until jurong point, crazy.. Available only ...   
1                      Ok lar... Joking wif u oni...   
2  Free entry in 2 a wkly comp to win FA Cup fina...   
3  U dun say so early hor... U c already then say...   
4  Nah I don't think he goes to usf, he lives aro...   

                                     cleaned_message  
0  go jurong point crazy available bugis n great ...  
1                            ok lar joking wif u oni  
2  free entry 2 wkly comp win fa cup final tkts 2...  
3                u dun say early hor u c already say  
4           nah dont think go usf life around though  


In [42]:
# train - test split (before applying vectorization):

X_train, X_test, y_train, y_test = train_test_split(
    df['Message'],  # Features (text data)
    df['Label'],    # Target labels (spam: 1, ham: 0)
    test_size=0.2,    # 20% of data goes to the test set
    random_state=42   # Ensures reproducibility
)

# Print shapes of the resulting datasets
print("Training feature shape:", X_train.shape)
print("Testing feature shape:", X_test.shape)
print("Training label shape:", y_train.shape)
print("Testing label shape:", y_test.shape)

Training feature shape: (4457,)
Testing feature shape: (1115,)
Training label shape: (4457,)
Testing label shape: (1115,)


In [43]:
# appying TF-IDF venctorization to find word importance in the corpus:

vectorizer = TfidfVectorizer()

X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# **Model training:**

In [44]:
# SVM Model
svm_model = SVC(kernel='linear', C=1, probability=True, class_weight='balanced')
svm_model.fit(X_train, y_train)

## **Prediction and Evauation**

In [45]:
# Predictions
y_pred = svm_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9821

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.99       965
           1       0.96      0.90      0.93       150

    accuracy                           0.98      1115
   macro avg       0.97      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115

