In [1]:
import pandas as pd
import numpy as np

In [21]:
df=pd.read_csv("D:/Learning/Spam_Detection_Project/spam.csv", encoding='latin-1')[['v1','v2']]
df.columns=('label','Message')

print(df.head())

  label                                            Message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [23]:
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)   #Remove duplicates and null

df['Message']=df['Message'].str.strip().str.lower()   #clean Msg Column

print(f"Total msg after cleaning: {len(df)}")

Total msg after cleaning: 5169


In [25]:
print(df['label'].value_counts())

label
ham     4516
spam     653
Name: count, dtype: int64


In [27]:
from collections import Counter
import re

spam_msg = df[df['label']=='spam']['Message']   #Filter only spam messages
all_spam_text="".join(spam_msg)  # Join all spam messages

# Tokenize words (basic split on whitespace and non-alphabetic characters)
words=re.findall(r'\b[a-z]{3}\b', all_spam_text) #words with 3+ characters

#Count Frequency
common_words=Counter(words).most_common(20)
print("Top 20 words in spam messages:")
for word, count in common_words:
    print(f"{word} : {count}")

Top 20 words in spam messages:
you : 225
the : 182
for : 178
now : 160
txt : 133
and : 104
www : 83
our : 76
are : 69
get : 65
new : 62
won : 61
out : 48
msg : 47
win : 45
per : 41
who : 41
com : 40
min : 36
has : 28


In [29]:
import re

# Define spam keywords
spam_keywords = [
    'win', 'winner', 'won', 'free', 'cash', 'urgent', 'prize',
    'congratulations', 'offer', 'credit', 'click', 'buy', 'call now',
    'selected', 'guaranteed', 'claim', 'subscribe', 'cheap', 'money'
]

# Define the spam scoring function
def compute_spam_score(message):
    score = 0
    msg = message.lower()

    # 1. Keyword match
    for keyword in spam_keywords:
        if keyword in msg:
            score += 2

    # 2. Check for links (corrected regex)
    if re.search(r'http\S+|www\.\S+|\.com', msg):
        score += 2

    # 3. Multiple exclamation marks
    if re.search(r'!{2,}', message):
        score += 1

    # 4. Presence of large numbers (e.g., 1000, 50000)
    if re.search(r'\d{3,}', msg):
        score += 1

    # 5. ALL CAPS words
    if re.search(r'\b[A-Z]{3,}\b', message):
        score += 1

    return score


In [43]:
df['spam_score']= df['Message'].apply(compute_spam_score)
# Add a new column with spam score

# Define threshold: if score >= 4, classify as spam
df['predicted_label'] = df['spam_score'].apply(lambda x: 'spam' if x>=4 else 'ham')

In [45]:
print (df.columns)

Index(['label', 'Message', 'spam_score', 'predefined_label',
       'predicted_label'],
      dtype='object')


In [47]:
# import metrics

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [49]:
# Calculate Performance Metrics

# Get actual and predicted labels
y_true = df['label']
y_pred = df['predicted_label']

# Calculate metrics
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, pos_label='spam')
recall = recall_score(y_true, y_pred, pos_label='spam')
f1 = f1_score(y_true, y_pred, pos_label='spam')

# Print results
print(f"Accuracy:  {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall:    {recall:.2f}")
print(f"F1 Score:  {f1:.2f}")


Accuracy:  0.91
Precision: 0.90
Recall:    0.36
F1 Score:  0.51


In [57]:
df[['Message','label','spam_score','predefined_label']].to_csv('Spam_final.csv', index=False)