In [42]:
# Import pandas
import pandas as pd
# Import the required dependencies from sklearn
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

# Widen the column display so full SMS messages are visible in DataFrame previews.
# The fully-qualified option name is used: pandas resolves abbreviated keys by
# partial match, which can stop working if a similarly named option is added.
pd.set_option('display.max_colwidth', 200)

# Import Gradio
import gradio as gr

In [43]:
# Add preprocessing to add spam indicators
def preprocess_text(text):
    """
    Lowercase a message and tag it with "SPAM_MARKER " for each spam heuristic it trips.

    Parameters:
    - text (str): The raw text message.

    Returns:
    - str: The lowercased message, prefixed with one "SPAM_MARKER " per heuristic
      that matched (so zero, one, or two markers).

    Matching is plain substring containment on the lowercased message, so for
    example 'win' is also found inside 'winner' or 'window'.
    """
    lowered = text.lower()

    # Heuristic 1: a "winner"/"lucky" claim combined with a currency symbol.
    luck_claim = 'winner' in lowered or 'lucky' in lowered
    has_currency = any(symbol in lowered for symbol in '$£€')

    # Heuristic 2: "free" combined with any prize-related term.
    prize_terms = ('win', 'won', 'winner', 'prize')
    free_prize = 'free' in lowered and any(term in lowered for term in prize_terms)

    # Each heuristic that fires contributes its own marker in front of the message.
    hits = int(luck_claim and has_currency) + int(free_prize)
    return "SPAM_MARKER " * hits + lowered

In [44]:
def sms_classification(sms_text_df):
    """
    Train an SMS spam classifier: TF-IDF features feeding a linear SVM.

    Parameters:
    - sms_text_df (pd.DataFrame): DataFrame containing 'text_message' and 'label' columns.
      The class weights below are keyed by the labels 'spam' and 'ham'.
      The DataFrame is read only; no column is added to it.

    Returns:
    - text_clf (Pipeline): Pipeline (TfidfVectorizer -> LinearSVC) fitted on the training split.

    Each message is run through `preprocess_text`, the data is split 67% / 33% with
    stratification on the label, and the pipeline is fitted on the 67% training part.
    The 33% held-out part is not used or returned by this function.
    """
    # Build the features as a new Series instead of writing a 'processed_text'
    # column into the caller's DataFrame, so re-running the training cell never
    # changes the loaded data.
    X = sms_text_df['text_message'].apply(preprocess_text)
    y = sms_text_df['label']

    # Split data into training and testing and set the test_size = 33%.
    # The split (and its seed) is kept so the fitted model stays reproducible;
    # the test portion is intentionally discarded here.
    X_train, _, y_train, _ = train_test_split(
        X, y, test_size=0.33, random_state=42, stratify=y
    )

    text_clf = Pipeline([
        ('tfidf', TfidfVectorizer(
            stop_words='english',
            ngram_range=(1, 3),  # Unigrams, bigrams and trigrams
            min_df=1,            # Minimum document frequency
            max_df=0.95,         # Maximum document frequency
            strip_accents='unicode',
            lowercase=True,
            # Word tokens of one or more characters; a '$' is kept only when it
            # sits between word characters (e.g. "us$100"). A leading or
            # trailing '$' is not part of the token.
            token_pattern=r'\b\w+\$?\w*\b'
        )),
        ('clf', LinearSVC(
            C=0.5,                  # Decreased C for stronger regularization
            class_weight={          # Custom class weights
                'spam': 2.0,        # Increase weight for spam class
                'ham': 1.0
            },
            dual=False,
            max_iter=3000
        ))
    ])

    # Fit the pipeline on the raw training text (vectorization happens inside it)
    # and return the fitted model.
    text_clf.fit(X_train, y_train)
    return text_clf

In [45]:
# Read the SMS spam collection from disk and preview its first rows.
sms_text_df = pd.read_csv(filepath_or_buffer='Resources/SMSSpamCollection.csv')
sms_text_df.head(n=5)


Unnamed: 0,label,text_message
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [46]:
# Train the classifier on the loaded messages and keep the fitted pipeline in "text_clf".
text_clf = sms_classification(sms_text_df=sms_text_df)

In [47]:
# Create a function called `sms_prediction` that takes in the SMS text and predicts whether the text is "not spam" or "spam".
# The function should return the SMS message, and say whether the text is "not spam" or "spam".
def sms_prediction(text):
    """
    Classify a single text message and report the verdict as a sentence.

    Parameters:
    - text (str): The text message to be classified.

    Returns:
    - str: A sentence quoting the original message and stating whether it is spam or not.

    The message is passed through `preprocess_text` and then to the module-level
    fitted pipeline `text_clf`; neither is a parameter of this function.
    """
    # Apply the same preprocessing that was applied to the training messages.
    predicted_label = text_clf.predict([preprocess_text(text)])[0]

    # Anything other than "ham" is reported as spam.
    if predicted_label != 'ham':
        return f'The text message: "{text}", is spam.'
    return f'The text message: "{text}", is not spam.'

In [48]:
# Build the Gradio UI: a labelled textbox for the message to test and a
# labelled textbox that shows the classifier's verdict.
message_input = gr.Textbox(label="What is the text message you want to test?")
verdict_output = gr.Textbox(label="Our app has determined:")

# Clickable sample messages shown under the input box.
example_messages = [
    ["You are a lucky winner of $5000!"],
    ["You won 2 free tickets to the Super Bowl."],
    ["You won 2 free tickets to the Super Bowl text us to claim your prize."],
    ["Thanks for registering. Text 4343 to receive free updates on medicare."],
]

sms_app = gr.Interface(
    fn=sms_prediction,
    inputs=message_input,
    outputs=verdict_output,
    examples=example_messages,
    title="SMS Spam Detector",
)

# Launch the app; share=True additionally requests a public share link.
sms_app.launch(share=True)


* Running on local URL:  http://127.0.0.1:7867

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.




## Test the following text messages. 

---

1. You are a lucky winner of $5000!
2. You won 2 free tickets to the Super Bowl.
3. You won 2 free tickets to the Super Bowl text us to claim your prize.
4. Thanks for registering. Text 4343 to receive free updates on medicare.

In [51]:
# Sample text messages to classify by hand.
text_1 = """You are a lucky winner of $5000!!"""
text_2 = """You won 2 free tickets to the Super Bowl."""
text_3 = """You won 2 free tickets to the Super Bowl text us to claim your prize"""
text_4 = """Thanks for registering. Text 4343 to receive free updates on medicare"""

# Preprocess each message the same way as the training data, then print the
# predicted label for it (one prediction per line).
for message in (text_1, text_2, text_3, text_4):
    print(text_clf.predict([preprocess_text(message)]))

['spam']
['spam']
['spam']
['spam']
