In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [4]:
data = pd.read_csv('spam.csv', encoding='latin1')
data

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [5]:
data = data[['v1', 'v2']]
data.columns = ['label', 'message']
data

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


### Checking the label counts

In [6]:
data['label'].value_counts()

label
ham     4825
spam     747
Name: count, dtype: int64

## Preprocessing the Text

 #### Preprocessing is where you clean and transform your text data into a format that a machine learning model can understand.

In [7]:
import string
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

#### A simple list of common English stop words
#### This replaces the need for the NLTK stopwords corpus download

In [13]:
common_stopwords = {
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",
    "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his',
    'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself',
    'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom',
    'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were',
    'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did',
    'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until',
    'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into',
    'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up',
    'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then',
    'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both',
    'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not',
    'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will',
    'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're',
    've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn',
    "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't",
    'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan',
    "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won',
    "won't", 'wouldn', "wouldn't"
}

### Function to perform all preprocessing steps without NLTK downloads

In [15]:
def preprocess_text(text):
    # 1. Lowering
    text = text.lower()

    # 2. Replacing URLs with a placeholder <url>
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    text = url_pattern.sub(r'<url>', text)

    # 3. Replacing numbers with a placeholder <NUMBER>
    text = re.sub(r'\d+', r'<NUMBER>', text)
    
    # 4. Removing punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # 5. Tokenization (using simple split)
    tokens = text.split()

    # 6. Removing stop words (using the hard-coded set)
    # The word 'not' is kept as it's a strong signal for sentiment and intent
    processed_tokens = [word for word in tokens if word not in common_stopwords or word == 'not']

    # 7. Joining tokens back into a string
    processed_text = " ".join(processed_tokens)
    
    return processed_text

### Applying the preprocessing function to the 'message' column

In [17]:
data['processed_message'] = data['message'].apply(preprocess_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['processed_message'] = data['message'].apply(preprocess_text)


In [18]:
print(data[['message', 'processed_message']].head(10))

                                             message  \
0  Go until jurong point, crazy.. Available only ...   
1                      Ok lar... Joking wif u oni...   
2  Free entry in 2 a wkly comp to win FA Cup fina...   
3  U dun say so early hor... U c already then say...   
4  Nah I don't think he goes to usf, he lives aro...   
5  FreeMsg Hey there darling it's been 3 week's n...   
6  Even my brother is not like to speak with me. ...   
7  As per your request 'Melle Melle (Oru Minnamin...   
8  WINNER!! As a valued network customer you have...   
9  Had your mobile 11 months or more? U R entitle...   

                                   processed_message  
0  go jurong point crazy available bugis n great ...  
1                            ok lar joking wif u oni  
2  free entry NUMBER wkly comp win fa cup final t...  
3                u dun say early hor u c already say  
4        nah dont think goes usf lives around though  
5  freemsg hey darling NUMBER weeks word back id ... 

In [20]:
# Display some processed spam examples to see the effect
spam_data = data[data['label'] == 'spam']
print("\nSome processed spam messages:")
print(spam_data[['message', 'processed_message']].head())


Some processed spam messages:
                                              message  \
2   Free entry in 2 a wkly comp to win FA Cup fina...   
5   FreeMsg Hey there darling it's been 3 week's n...   
8   WINNER!! As a valued network customer you have...   
9   Had your mobile 11 months or more? U R entitle...   
11  SIX chances to win CASH! From 100 to 20,000 po...   

                                    processed_message  
2   free entry NUMBER wkly comp win fa cup final t...  
5   freemsg hey darling NUMBER weeks word back id ...  
8   winner valued network customer selected receiv...  
9   mobile NUMBER months u r entitled update lates...  
11  six chances win cash NUMBER NUMBERNUMBER pound...  


## Feature Engineering

### TF-IDF

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [22]:
# Initialize the TfidfVectorizer
# We can set max_features to limit the number of words considered,
# which helps reduce the size of the feature matrix and can improve performance.
vectorizer = TfidfVectorizer(max_features=5000)

In [23]:
# Fit and transform the 'processed_message' column to create the feature matrix
X = vectorizer.fit_transform(data['processed_message'])

In [24]:
# The labels (spam or ham) remain the same
y = data['label']

In [25]:
print("Shape of the feature matrix (X):", X.shape)
print("Shape of the labels (y):", y.shape)

Shape of the feature matrix (X): (5572, 5000)
Shape of the labels (y): (5572,)


In [26]:
print("Top 20 features (words):", vectorizer.get_feature_names_out()[:20])

Top 20 features (words): ['aah' 'aathilove' 'aathiwhere' 'abi' 'ability' 'abiola' 'abj' 'able'
 'abt' 'abta' 'aburo' 'ac' 'academic' 'acc' 'accept' 'access' 'accident'
 'accidentally' 'accordingly' 'account']


## Data Splitting

In [27]:
from sklearn.model_selection import train_test_split

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42, # 'random_state' ensures the split is the same every time
    stratify=y       # 'stratify' maintains the same proportion of spam/ham in both sets
)

print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_test: {X_test.shape}")

Shape of X_train: (4457, 5000)
Shape of X_test: (1115, 5000)


## Model Training
#### Logistic Regression

In [28]:
from sklearn.linear_model import LogisticRegression

# Initialize the model
model = LogisticRegression(max_iter=1000)

# Train the model on the training data
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


### Predictions and Evaluation

In [29]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Make predictions on the test data
y_pred = model.predict(X_test)

# Evaluate the model's performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, pos_label='spam'))
print("Recall:", recall_score(y_test, y_pred, pos_label='spam'))
print("F1-Score:", f1_score(y_test, y_pred, pos_label='spam'))

Accuracy: 0.9650224215246637
Precision: 0.9296875
Recall: 0.7986577181208053
F1-Score: 0.8592057761732852


### Training a Multinomial Naive Bayes Model

In [30]:
from sklearn.naive_bayes import MultinomialNB

# Initialize the model
nb_model = MultinomialNB()

# Train the model on the training data
nb_model.fit(X_train, y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


### Predictions and Evaluation

In [31]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Make predictions on the test data
nb_y_pred = nb_model.predict(X_test)

# Evaluate the model's performance
print("Naive Bayes Metrics:")
print("Accuracy:", accuracy_score(y_test, nb_y_pred))
print("Precision:", precision_score(y_test, nb_y_pred, pos_label='spam'))
print("Recall:", recall_score(y_test, nb_y_pred, pos_label='spam'))
print("F1-Score:", f1_score(y_test, nb_y_pred, pos_label='spam'))

Naive Bayes Metrics:
Accuracy: 0.9721973094170404
Precision: 0.9916666666666667
Recall: 0.7986577181208053
F1-Score: 0.8847583643122676


### Tunning the Model with GridSearchCV

In [33]:
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import make_scorer, f1_score

# Assuming X_train, y_train, X_test, y_test are already defined from previous steps

# Create a custom scorer for F1-score with the correct positive label
# This is the key change to fix the error
f1_scorer = make_scorer(f1_score, pos_label='spam')

# Define the model to tune
model_to_tune = MultinomialNB()

# Define the grid of hyperparameters to search
param_grid = {
    'alpha': [0.1, 0.5, 1.0, 1.5, 2.0]
}

# Initialize GridSearchCV with the custom scorer
grid_search = GridSearchCV(
    estimator=model_to_tune,
    param_grid=param_grid,
    cv=5, 
    scoring=f1_scorer  # Use your custom scorer here
)

# Run the grid search on your training data
grid_search.fit(X_train, y_train)

# Print the best parameters and best score
print("Best hyperparameters found:", grid_search.best_params_)
print("Best F1-Score:", grid_search.best_score_)

Best hyperparameters found: {'alpha': 0.1}
Best F1-Score: 0.9382432807496427


### Final Model Evaluation

In [34]:
# Get the best model from the grid search
best_nb_model = grid_search.best_estimator_

# Make predictions on the test data
best_y_pred = best_nb_model.predict(X_test)

# Evaluate the model's performance with the optimized hyperparameters
print("\nFinal Optimized Naive Bayes Metrics:")
print("Accuracy:", accuracy_score(y_test, best_y_pred))
print("Precision:", precision_score(y_test, best_y_pred, pos_label='spam'))
print("Recall:", recall_score(y_test, best_y_pred, pos_label='spam'))
print("F1-Score:", f1_score(y_test, best_y_pred, pos_label='spam'))


Final Optimized Naive Bayes Metrics:
Accuracy: 0.9847533632286996
Precision: 0.9714285714285714
Recall: 0.912751677852349
F1-Score: 0.9411764705882353


In [35]:
import joblib

# Assuming 'best_nb_model' is your optimized model and 'vectorizer' is your TF-IDF vectorizer
joblib.dump(best_nb_model, 'spam_classifier_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

# Automating the whole process

### Building a ML pipeline

In [38]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import make_scorer, f1_score
import string
import re
import joblib

# --- Step 1: Data Preprocessing (same as before) ---
# Your preprocessing function must be applied to the data first,
# because the TfidfVectorizer in the pipeline does not remove punctuation or handle special tokens.
df = pd.read_csv('spam.csv', encoding='latin-1')
data = df.loc[:, ['v1', 'v2']]
data.columns = ['label', 'message']

common_stopwords = {
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",
    "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his',
    'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself',
    'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom',
    'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were',
    'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did',
    'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until',
    'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into',
    'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up',
    'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then',
    'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both',
    'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not',
    'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will',
    'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're',
    've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn',
    "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't",
    'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan',
    "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won',
    "won't", 'wouldn', "wouldn't"
}
def preprocess_text(text):
    text = text.lower()
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    text = url_pattern.sub(r'<URL>', text)
    text = re.sub(r'\d+', r'<NUMBER>', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = text.split()
    processed_tokens = [word for word in tokens if word not in common_stopwords or word == 'not']
    processed_text = " ".join(processed_tokens)
    return processed_text

data['processed_message'] = data['message'].apply(preprocess_text)


# --- Step 2: Split the Raw Text Data ---
# X is the RAW TEXT from the preprocessed column, not the vectorized matrix
X = data['processed_message']
y = data['label']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42,
    stratify=y
)

# --- Step 3: Create and Tune the Pipeline ---
# This part is now correct
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('classifier', MultinomialNB())
])

param_grid = {
    'tfidf__max_features': [5000, 7500, 10000],
    'classifier__alpha': [0.1, 0.5, 1.0, 1.5, 2.0]
}

f1_scorer = make_scorer(f1_score, pos_label='spam')

grid_search_pipeline = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    scoring=f1_scorer,
    n_jobs=-1
)

grid_search_pipeline.fit(X_train, y_train)

best_pipeline = grid_search_pipeline.best_estimator_

# --- Step 4: Save the Pipeline ---
joblib.dump(best_pipeline, 'spam_detection_pipeline.pkl')

print("Pipeline training and saving successful! No errors.")
print("Best parameters found:", grid_search_pipeline.best_params_)
print("Best F1-Score:", grid_search_pipeline.best_score_)

Pipeline training and saving successful! No errors.
Best parameters found: {'classifier__alpha': 0.1, 'tfidf__max_features': 7500}
Best F1-Score: 0.9421852706718686


# Testing the model with a new message

In [43]:
import joblib

# Load the entire saved pipeline
try:
    loaded_pipeline = joblib.load('spam_detection_pipeline.pkl')
    print("Pipeline loaded successfully!")
except FileNotFoundError:
    print("Error: The pipeline file was not found. Make sure 'spam_detection_pipeline.pkl' is in the same directory.")
    exit()

# Define new messages to test. You can change these.
test_messages = [
    "You have won a brand new car! Claim your prize now at <URL>",
    "Hey, what time are you free to grab coffee later today?",
    "URGENT: Your account has been compromised. Log in now to reset your password.",
    "Did you remember to pick up groceries?",
    "Congratulations! You've been selected as a winner in our exclusive giveaway. Click <URL> to claim!"
]

# Process and predict for each new message
for message in test_messages:
    # Make a prediction. The pipeline handles all preprocessing internally.
    prediction = loaded_pipeline.predict([message])
    
    print("-----------------------------------")
    print(f"Original Message: '{message}'")
    print(f"Prediction: {prediction[0]}")
    print("-----------------------------------")

Pipeline loaded successfully!
-----------------------------------
Original Message: 'You have won a brand new car! Claim your prize now at <URL>'
Prediction: spam
-----------------------------------
-----------------------------------
Original Message: 'Hey, what time are you free to grab coffee later today?'
Prediction: ham
-----------------------------------
-----------------------------------
Original Message: 'URGENT: Your account has been compromised. Log in now to reset your password.'
Prediction: spam
-----------------------------------
-----------------------------------
Original Message: 'Did you remember to pick up groceries?'
Prediction: ham
-----------------------------------
-----------------------------------
Original Message: 'Congratulations! You've been selected as a winner in our exclusive giveaway. Click <URL> to claim!'
Prediction: spam
-----------------------------------


# Fine-tuning the model with a new dataset

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, f1_score
import string
import re
import joblib
import numpy as np # Needed for array splitting

# --- CONFIGURATION ---
NEW_DATA_FILE = 'spam_detection_dataset_5000.csv'
SAVED_MODEL_FILE = 'spam_detection_pipeline.pkl'
# The original model used a specific TfidfVectorizer max_features.
# We will use the original best parameters if available, but for retraining,
# we should ideally re-fit the pipeline with the best structure.

# --- 1. Load the New Combined Dataset ---
# Note: Your generated CSV uses 'v1' and 'v2' as column names.
try:
    new_df = pd.read_csv(NEW_DATA_FILE)
    # Ensure column names match the expected format for processing
    new_df.columns = ['label', 'message']
    print(f"Successfully loaded {len(new_df)} observations from the new dataset.")
except FileNotFoundError:
    print(f"Error: The file '{NEW_DATA_FILE}' was not found. Please verify the path.")
    exit()

# --- 2. Data Preprocessing (Must match the original logic EXACTLY) ---

common_stopwords = {
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",
    "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his',
    'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself',
    'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom',
    'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were',
    'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did',
    'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until',
    'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into',
    'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up',
    'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then',
    'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both',
    'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not',
    'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will',
    'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're',
    've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn',
    "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't",
    'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan',
    "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won',
    "won't", 'wouldn', "wouldn't"
}

def preprocess_text(text):
    """
    Applies the same cleaning steps: lowercasing, URL/Number substitution,
    punctuation removal, and stopword filtering.
    """
    if not isinstance(text, str):
        text = str(text)
        
    text = text.lower()
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    text = url_pattern.sub(r'<URL>', text)
    text = re.sub(r'\d+', r'<NUMBER>', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = text.split()
    # Note: The original code includes 'or word == 'not'' which is unusual for stopword removal
    # but MUST be kept for consistency.
    processed_tokens = [word for word in tokens if word not in common_stopwords or word == 'not']
    processed_text = " ".join(processed_tokens)
    return processed_text

new_df['processed_message'] = new_df['message'].apply(preprocess_text)

# Separate features (X) and target (y)
X_new = new_df['processed_message']
y_new = new_df['label']

# --- 3. Split the New Data for Proper Evaluation ---
# We'll use a small portion of the new data for final testing to verify improvement
# on the specific, challenging examples you added.
X_retrain, X_eval, y_retrain, y_eval = train_test_split(
    X_new, y_new, test_size=0.1, random_state=42, stratify=y_new
)
print(f"Retraining on {len(X_retrain)} samples. Evaluating on {len(X_eval)} samples.")

# --- 4. Model Fine-Tuning/Retraining ---
# Since you used GridSearchCV, the original model is an optimized MultinomialNB.
# The simplest and safest approach is to re-fit the *entire* optimized pipeline 
# on the new, richer dataset. This is effectively updating the model's knowledge base.

try:
    # Load the best estimator from the initial training (to get the optimized structure/params)
    old_pipeline = joblib.load(SAVED_MODEL_FILE)
    print(f"Successfully loaded the existing pipeline: {SAVED_MODEL_FILE}")

    # Use the existing pipeline's structure and parameters
    # The Naive Bayes step (MultinomialNB) naturally supports incremental learning 
    # but re-fitting the entire Pipeline ensures the TF-IDF vocabulary is also expanded 
    # to include the new tokens (like local Nigerian entities)
    
    # Retrain the entire pipeline on the new, larger dataset (old + new)
    print("Starting pipeline retraining on the combined/enriched dataset...")
    updated_pipeline = old_pipeline.fit(X_retrain, y_retrain)
    print("Pipeline retraining successful!")

    # --- 5. Evaluate the Updated Model ---
    
    # Predict on the hold-out evaluation set (X_eval)
    y_pred = updated_pipeline.predict(X_eval)
    
    print("\n--- Evaluation on NEW (Challenging) Data ---")
    print("Confusion Matrix:")
    print(confusion_matrix(y_eval, y_pred, labels=['ham', 'spam']))
    
    # The key metric here is the Recall for 'spam'
    report = classification_report(y_eval, y_pred, target_names=['ham', 'spam'], output_dict=True)
    
    print("\nClassification Report:")
    print(classification_report(y_eval, y_pred, target_names=['ham', 'spam']))
    
    spam_recall = report['spam']['recall']
    print(f"\nCRITICAL METRIC: SPAM RECALL (Sensitivity): {spam_recall:.4f}")
    
    # Check if the model has improved significantly on the challenging spam (the focus)
    if spam_recall > 0.85: # Setting a high bar for success
         print("\nSUCCESS: Spam recall is high, indicating the model is learning the new challenging patterns!")
    else:
         print("\nNOTE: Spam recall is satisfactory, but check the False Negatives (escaped spam) in the confusion matrix.")


    # --- 6. Save the Updated Pipeline ---
    joblib.dump(updated_pipeline, 'spam_detection_pipeline_V2.pkl')
    print("\nUpdated pipeline saved as 'spam_detection_pipeline_V2.pkl'.")

except FileNotFoundError:
    print(f"Error: The saved model file '{SAVED_MODEL_FILE}' was not found. Please run the original training code first or verify the path.")

Successfully loaded 5014 observations from the new dataset.
Retraining on 4512 samples. Evaluating on 502 samples.
Successfully loaded the existing pipeline: spam_detection_pipeline.pkl
Starting pipeline retraining on the combined/enriched dataset...
Pipeline retraining successful!

--- Evaluation on NEW (Challenging) Data ---
Confusion Matrix:
[[401   0]
 [  0 101]]

Classification Report:
              precision    recall  f1-score   support

         ham       1.00      1.00      1.00       401
        spam       1.00      1.00      1.00       101

    accuracy                           1.00       502
   macro avg       1.00      1.00      1.00       502
weighted avg       1.00      1.00      1.00       502


CRITICAL METRIC: SPAM RECALL (Sensitivity): 1.0000

SUCCESS: Spam recall is high, indicating the model is learning the new challenging patterns!

Updated pipeline saved as 'spam_detection_pipeline_V2.pkl'.


# Testing the model with a new message

In [2]:
import joblib

# Load the entire saved pipeline
try:
    updated_pipeline = joblib.load('spam_detection_pipeline_V2.pkl')
    print("Pipeline loaded successfully!")
except FileNotFoundError:
    print("Error: The pipeline file was not found. Make sure 'spam_detection_pipeline.pkl' is in the same directory.")
    exit()

# Define new messages to test. You can change these.
test_messages = [
    "You have won a brand new car! Claim your prize now at <URL>",
    "Hey, what time are you free to grab coffee later today?",
    "URGENT: Your account has been compromised. Log in now to reset your password.",
    "Did you remember to pick up groceries?",
    "Congratulations! You've been selected as a winner in our exclusive giveaway. Click <URL> to claim!"
]

# Process and predict for each new message
for message in test_messages:
    # Make a prediction. The pipeline handles all preprocessing internally.
    prediction = updated_pipeline.predict([message])
    
    print("-----------------------------------")
    print(f"Original Message: '{message}'")
    print(f"Prediction: {prediction[0]}")
    print("-----------------------------------")

Pipeline loaded successfully!
-----------------------------------
Original Message: 'You have won a brand new car! Claim your prize now at <URL>'
Prediction: spam
-----------------------------------
-----------------------------------
Original Message: 'Hey, what time are you free to grab coffee later today?'
Prediction: ham
-----------------------------------
-----------------------------------
Original Message: 'URGENT: Your account has been compromised. Log in now to reset your password.'
Prediction: spam
-----------------------------------
-----------------------------------
Original Message: 'Did you remember to pick up groceries?'
Prediction: ham
-----------------------------------
-----------------------------------
Original Message: 'Congratulations! You've been selected as a winner in our exclusive giveaway. Click <URL> to claim!'
Prediction: spam
-----------------------------------


In [11]:
updated_pipeline.predict(["WIN BIG! Enter our contest for a chance to win cash prizes! Visit: lucky-winner.io"])

array(['spam'], dtype='<U4')