<a href="https://colab.research.google.com/github/Simeen19/ScamDetection_Data_cleaninig/blob/main/ScamDetection_Data_cleaninig.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

--------------------------------  Step 1: Loading the dataset  --------------------------------

In [1]:
import pandas as pd

# Load the text file like a CSV using tab (`\t`) as separator
df = pd.read_csv("/content/SMSSpamCollection", sep="\t", header=None, names=["label", "text"])

# Display the first few rows
df.head()


Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
import csv

# Load the dataset while skipping problematic lines
enron_data = pd.read_csv('/content/emails.csv', encoding='latin-1', quoting=csv.QUOTE_NONE, on_bad_lines='skip')

# Preview the dataset
enron_data.head()


Unnamed: 0,"""file""","""message"""
0,"""allen-p/_sent_mail/1.""","""Message-ID: <18782981.1075855378110.JavaMail...."
1,Date: Mon,14 May 2001 16:39:00 -0700 (PDT)
2,From: phillip.allen@enron.com,
3,To: tim.belden@enron.com,
4,Subject:,


--------------------------------  Step 2: Preprocess the Messages  --------------------------------

In [None]:
import nltk
nltk.download('stopwords')

import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-z\s]', '', text)  # Remove punctuation and numbers
    tokens = text.split()  # Tokenize
    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]  # Remove stopwords and stem
    return " ".join(tokens)


In [None]:
df['clean_text'] = df['text'].apply(preprocess)
df[['text', 'clean_text']].head()


Unnamed: 0,text,clean_text
0,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,Free entry in 2 a wkly comp to win FA Cup fina...,free entri wkli comp win fa cup final tkt st m...
3,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,"Nah I don't think he goes to usf, he lives aro...",nah dont think goe usf live around though


--------------------------------  Step 3: Sentiment Analysis  --------------------------------

In [None]:
!pip install textblob
from textblob import TextBlob



In [None]:
def get_sentiment(text):
    blob = TextBlob(text)
    polarity = blob.sentiment.polarity  # ranges from -1 (negative) to 1 (positive)
    if polarity > 0:
        return "positive"
    elif polarity < 0:
        return "negative"
    else:
        return "neutral"

# Apply to cleaned messages
df['sentiment'] = df['clean_text'].apply(get_sentiment)
df[['text', 'sentiment']].head()


Unnamed: 0,text,sentiment
0,"Go until jurong point, crazy.. Available only ...",positive
1,Ok lar... Joking wif u oni...,positive
2,Free entry in 2 a wkly comp to win FA Cup fina...,positive
3,U dun say so early hor... U c already then say...,neutral
4,"Nah I don't think he goes to usf, he lives aro...",positive


--------------------------------  Step 4: Building a Scam Detection Model  --------------------------------

In [None]:
df['label'] = df['label'].map({'spam': 1, 'ham': 0})


In [None]:
from sklearn.model_selection import train_test_split

X = df['clean_text']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

# Predictions
y_pred = model.predict(X_test_tfidf)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.967713004484305
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       966
           1       1.00      0.76      0.86       149

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.97      1115



--------------------------------  Step 5: Real-Time Message Prediction  --------------------------------

In [None]:
def predict_message(message):
    # Preprocess
    clean = preprocess(message)
    # Convert to TF-IDF
    vect_message = vectorizer.transform([clean])
    # Predict
    prediction = model.predict(vect_message)[0]
    return "Scam" if prediction == 1 else "Not Scam"


In [None]:
test_msg = "Congratulations! You've won a free honeymoon trip. Click here to claim now."
print("Prediction:", predict_message(test_msg))


Prediction: Scam
