In [31]:
# Importing the all required packages

import numpy as np
import pandas as pd

import re

import nltk

nltk.download('punkt')
nltk.download('stopwords')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ECS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ECS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### 1. Load the Data

> importing the Text Data + Labels

In [22]:
# Loading the Data
data = [
    "Congratulations!!! You won 1 million $$$", #Spam
        "Meeting rescheduled to 3 PM tomorrow.", #Not Spam
        "Claim your free vacation now!!!", #Spam
        "Please find the attached report for review.", #Not Spam
        "Limited offer!! Get 50% discount on medicines." #Spam
]
labels = [1,0,1,0,1]

### 2. Text Cleaning
> Lowercasing, remove punctuation, remove digits, normalize spaces

In [9]:
def cleaned_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]','',text)
    text = re.sub(r'\s+',' ',text).strip()
    return text

In [10]:
cleaned_data = [cleaned_text(x) for x in data]

In [11]:
cleaned_data

['congratulations you won million',
 'meeting rescheduled to pm tomorrow',
 'claim your free vacation now',
 'please find the attached report for review',
 'limited offer get discount on medicines']

### 3. Tokenization
> Split sentences ---> words

### 4. Stopwords Removals
> Remove common words like the, is, are 

In [13]:
stop_words = set(stopwords.words('english'))
#stop_words

In [14]:
def process_text(text):
    tokens = word_tokenize(text)
    tokens = [w for w in tokens if w not in stop_words]    
    return ''.join(tokens)

In [15]:
processed_data = [process_text(x) for x in cleaned_data]

In [16]:
processed_data

['congratulationsmillion',
 'meetingrescheduledpmtomorrow',
 'claimfreevacation',
 'pleasefindattachedreportreview',
 'limitedoffergetdiscountmedicines']

### 5. Lemmatization / Stemming (Optional)
> Convert running into run

### 6. Vectorization
> Convert Words --> numbers using the TF-DIF or CountVectorizer 

In [17]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(processed_data)

In [32]:
df_tfidf = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
df_tfidf

Unnamed: 0,claimfreevacation,congratulationsmillion,limitedoffergetdiscountmedicines,meetingrescheduledpmtomorrow,pleasefindattachedreportreview
0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0
2,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,1.0,0.0,0.0


### 7. Tarin Model
> Logistic Regression / Naive Bayes / SVM

In [23]:
model = LogisticRegression()
model.fit(X, labels)

### 8. Predict for New Text
> Clean --> Tokenize --> transform --> predict

In [25]:
test_msg = ["Free lottery!!! Win 5 lakh now!"]
test_clean = cleaned_text(test_msg[0])
test_processed = process_text(test_clean)
test_vector = vectorizer.transform([test_processed])

print(model.predict(test_vector))  # 1 = Spam


[1]


### 9. Model Evaluation
> Accuracy, confusion matrix, precision, recall, F1-score.

In [26]:
from sklearn.metrics import classification_report

y_pred = model.predict(X)
print(classification_report(labels, y_pred))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00         2
           1       1.00      1.00      1.00         3

    accuracy                           1.00         5
   macro avg       1.00      1.00      1.00         5
weighted avg       1.00      1.00      1.00         5

