In [59]:
# Import pandas
import pandas as pd
# Import the required dependencies from sklearn
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

# Set the column width to view the text message data.
pd.set_option('max_colwidth', 200)

# Import Gradio
import gradio as gr

In [60]:
# Load the dataset into a DataFrame
sms_text_df = pd.read_csv('Resources/SMSSpamCollection.csv', sep='\t', names=['label', 'text_message'])
sms_text_df.head()

Unnamed: 0,label,text_message
0,"label,text_message",
1,"ham,""Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...""",
2,"ham,Ok lar... Joking wif u oni...",
3,"spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
4,"ham,U dun say so early hor... U c already then say...",


In [61]:
# Check for missing values. 
sms_text_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5575 entries, 0 to 5574
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   label         5575 non-null   object
 1   text_message  2 non-null      object
dtypes: object(2)
memory usage: 87.2+ KB


In [62]:
#  Get the number of "ham" and "spam" from the "label" column:
sms_text_df['label'].value_counts()

label
ham,"Sorry, I'll call later"                                                                                                                                                                 30
ham,I cant pick the phone right now. Pls send a message                                                                                                                                      12
ham,Ok...                                                                                                                                                                                    10
ham,Your opinion about me? 1. Over 2. Jada 3. Kusruthi 4. Lovable 5. Silent 6. Spl character 7. Not matured 8. Stylish 9. Simple Pls reply..                                                  4
ham,"Wen ur lovable bcums angry wid u, dnt take it seriously.. Coz being angry is d most childish n true way of showing deep affection, care n luv!.. kettoda manda... Have nice day da."     4
                                  

In [63]:
# Fill NaN values
sms_text_df = sms_text_df.fillna('')
sms_text_df.head()

Unnamed: 0,label,text_message
0,"label,text_message",
1,"ham,""Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...""",
2,"ham,Ok lar... Joking wif u oni...",
3,"spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
4,"ham,U dun say so early hor... U c already then say...",


In [64]:
# Check for Missing Values
print(sms_text_df.isnull().sum())

label           0
text_message    0
dtype: int64


In [65]:
def sms_classification(sms_text_df):
    """
    Perform SMS classification using a pipeline with TF-IDF vectorization and Linear Support Vector Classification.

    Parameters:
    - sms_text_df (pd.DataFrame): DataFrame containing 'text_message' and 'label' columns for SMS classification.

    Returns:
    - text_clf (Pipeline): Fitted pipeline model for SMS classification.
    
    """
    # Set the features variable to the text message column.
    features = sms_text_df['text_message']
    
    # Set the target variable to the "label" column.
    target = sms_text_df['label']

    # Split data into training and testing and set the test_size = 33%
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.33, random_state=42)

    # Build a pipeline to transform the test set to compare to the training set.
    text_clf = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english')),
        ('clf', LinearSVC())
    ])

    # Fit the model to the transformed training data and return model.
    text_clf.fit(X_train, y_train)
    return text_clf 

In [66]:
# Call the sms_classification function with the DataFrame and set the result to the "text_clf" variable
text_clf = sms_classification(sms_text_df)



In [67]:
# Create a function called `sms_prediction` that takes in the SMS text and predicts the whether the text is "not spam" or "spam". 
# The function should return the SMS message, and say whether the text is "not spam" or "spam".
def sms_prediction(text):
    """
    Predict the spam/ham classification of a given text message using a pre-trained model.

    Parameters:
    - text (str): The text message to be classified.

    Returns:
    - str: A message indicating whether the text message is classified as spam or not.
   
    """
    # Create a variable that will hold the prediction of a new text.
    prediction = text_clf.predict([text])[0]
    
    # Using a conditional if the prediction is "ham" return the message:
    # f'The text message: "{text}", is not spam.' Else, return f'The text message: "{text}", is spam.'
    if prediction == 'ham':
        return f'The text message: "{text}", is not spam.'
    else:
        return f'The text message: "{text}", is spam.'

In [68]:
# Create a sms_app that takes a textbox for the inputs and has a textbox for the output.  
# Povide labels for each textbox. 
sms_app = gr.Interface(
    fn=sms_prediction,
    inputs=gr.Textbox(label="Enter your SMS text here"),
    outputs=gr.Textbox(label="Prediction"),
)
    
# Launch the app.
sms_app.launch()

* Running on local URL:  http://127.0.0.1:7862

To create a public link, set `share=True` in `launch()`.




## Test the following text messages. 

---

1. You are a lucky winner of $5000!
2. You won 2 free tickets to the Super Bowl.
3. You won 2 free tickets to the Super Bowl text us to claim your prize.
4. Thanks for registering. Text 4343 to receive free updates on medicare.