In [7]:
import re
import pandas as pd
import numpy as np
import gradio as gr
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
import joblib
import os


In [2]:
# Create project directory structure if it doesn't exist.
# exist_ok=True keeps this cell safe to re-run and also repairs a partially
# created tree (project folder present but "data" or "models" missing), which
# the previous "only when the project folder is absent" check left broken.
project_dir = "21-gradio_sms_text_classification"
for subdir in ("data", "models"):
    # makedirs creates the parent project folder as well when needed
    os.makedirs(os.path.join(project_dir, subdir), exist_ok=True)


In [3]:
# Function to prepare the dataset
def prepare_dataset(source_path='Resources/SMSSpamCollection.csv', output_dir=None):
    """Load the raw SMS file, add a cleaned text column, encode labels and save a copy.

    Parameters
    ----------
    source_path : str, optional
        Comma-separated file whose two columns are read as 'label' and
        'message'. Because explicit column names are supplied, the first line
        of the file is read as data, not as a header. Defaults to the path the
        notebook has always used.
    output_dir : str, optional
        Folder under which ``data/sms_dataset.csv`` is written. When omitted,
        the notebook-level ``project_dir`` is used.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with columns 'label', 'message' and
        'processed_message'. Non-numeric labels are mapped ham -> 0 and
        spam -> 1; any other label value becomes NaN.
    """
    if output_dir is None:
        # Resolved at call time so the default follows the setup cell's value
        output_dir = project_dir

    # Load the dataset into a DataFrame
    sms_spam_df = pd.read_csv(source_path, sep=',', names=['label', 'message'], encoding='utf-8')

    # Conduct some basic Pre-Processing:
    def preprocess_text(text):
        # A missing message arrives as NaN (a float), which has no .lower();
        # treat it as empty text instead of crashing the whole load.
        if pd.isna(text):
            return ''
        text = str(text).lower()
        # Remove special characters
        text = re.sub(r'[^\w\s]', '', text)
        # Remove numbers
        text = re.sub(r'\d+', '', text)
        return text

    # Apply preprocessing to the dataset
    sms_spam_df['processed_message'] = sms_spam_df['message'].apply(preprocess_text)

    # Convert spam/ham to binary labels. Checking for "not numeric" rather than
    # "dtype == object" keeps the mapping working when pandas stores text in a
    # dedicated string dtype instead of object.
    if not pd.api.types.is_numeric_dtype(sms_spam_df['label']):
        sms_spam_df['label'] = sms_spam_df['label'].map({'ham': 0, 'spam': 1})

    # Locally save the cleaned dataset to CSV, creating the folder if needed so
    # the save does not depend on the setup cell having built the full tree.
    data_dir = os.path.join(output_dir, "data")
    os.makedirs(data_dir, exist_ok=True)
    sms_spam_df.to_csv(os.path.join(data_dir, "sms_dataset.csv"), index=False)

    return sms_spam_df
# copy_sms_spam_df = prepare_dataset()
# print(copy_sms_spam_df.info())


In [4]:
# Function to train the model
def sms_classification(sms_spam_df):
    """Train several TF-IDF text classifiers and return the most accurate one.

    Parameters
    ----------
    sms_spam_df : pandas.DataFrame
        Must contain a 'message' column (raw text) and a 'label' column.
        Rows missing either value are ignored.

    Returns
    -------
    sklearn.pipeline.Pipeline
        A fitted pipeline with steps 'tfidf' and 'classifier', so callers can
        pass raw strings straight to ``predict``. It is fitted on the training
        split only (67% of the usable rows).

    Raises
    ------
    ValueError
        If no row has both a label and a message.
    """
    # A missing message cannot be vectorised and a missing label cannot be
    # learned from, so both kinds of rows are excluded up front.
    usable_df = sms_spam_df.dropna(subset=['label', 'message'])
    if usable_df.empty:
        raise ValueError("No rows with both a label and a message are available for training.")

    # Split data into features and target
    X = usable_df['message']
    y = usable_df['label']

    # Split into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

    # Initialize the Classifiers:
    models = {
        'Naive Bayes': MultinomialNB(),
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
        'Support Vector Machines ("SVM")': LinearSVC(random_state=42, max_iter=1000)}

    # Start below any possible accuracy so a model is always selected, even if
    # every candidate scores 0 on the test split.
    best_accuracy = float('-inf')
    best_pipeline = None

    # Train and evaluate each model inside its own pipeline, so the winner can
    # be returned as-is instead of being fitted a second time.
    for name, model in models.items():
        candidate = Pipeline([
            ('tfidf', TfidfVectorizer(max_features=5000, ngram_range=(1, 2))),
            ('classifier', model)
        ])
        candidate.fit(X_train, y_train)

        # Test:
        y_pred = candidate.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        # Print results:
        print(f'{name} Accuracy: {accuracy:.2f}')

        # Keep track of the best model (strict '>' keeps the first on ties):
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_pipeline = candidate

    return best_pipeline

  
# Save the model
# joblib.dump(model, os.path.join(project_dir, "models", "sms_classifier.pkl"))
                    

In [9]:
# Function to classify SMS
def sms_prediction(text, model):
    """Classify one text message and describe the result in a sentence.

    Parameters
    ----------
    text : str
        The message to classify.
    model : object
        Anything with a ``predict`` method that accepts a list of raw strings
        and returns one label per string, where label 1 means spam.

    Returns
    -------
    str
        A sentence stating whether the message looks like spam, or a prompt to
        enter a message when ``text`` is None or blank.
    """
    # None cannot be vectorised and a blank message carries no signal, so ask
    # for input instead of crashing or reporting a meaningless verdict.
    if text is None or not str(text).strip():
        return "Please enter a text message to classify."

    # Predict if the message is spam or ham
    prediction = model.predict([text])[0]

    # Return the result
    if prediction == 1:
        return f"The text message: '{text}', is Spam (Looks Suspicious)!"
    else:
        return f"The text message: '{text}', is *NOT* Spam(Maybe OK) "

In [None]:
# Main function
def main():
    """Build the dataset, train the classifier and serve it through a Gradio UI."""
    # Step 1: load and clean the data
    print("Preparing dataset...")
    dataset = prepare_dataset()

    # Step 2: pick the best-performing classifier
    print("Training model...")
    model = sms_classification(dataset)

    def gradio_predict(message):
        # Gradio supplies only the textbox content; the trained model is
        # captured from the enclosing scope.
        return sms_prediction(message, model)

    # Clickable sample messages shown beneath the input box
    sample_messages = [
        ['Urgent: Your package is waiting for delivery. Confirm your details'],
        ['a family member has been in an accident. Please call us back here'],
        ['Our records show you overpaid for your car insurance, click here:'],
        ["You are a lucky winner of $5000!"],
        ["You won 2 free tickets to the Super Bowl."],
        ["You won 2 free tickets to the Super Bowl. Text us to claim your prize."],
        ["Thanks for registering. Text 4343 to receive free updates on medicare."],
        ["Congratulations! You've won a $1000 gift card. Call now to claim your prize!"],
        ["Hey, what time should we meet for dinner tonight?"],
        ["URGENT: Your bank account has been suspended. Click here to verify your information."],
        ["Don't forget to pick up milk on your way home."],
    ]

    # Step 3: wire a text-in / text-out interface around the predictor
    message_box = gr.Textbox(lines=5, placeholder="Enter a text message here...")
    result_box = gr.Textbox()
    app = gr.Interface(
        fn=gradio_predict,
        inputs=message_box,
        outputs=result_box,
        title="SMS Spam Classifier",
        description="Enter a text message to classify it as spam or ham (not spam).",
        examples=sample_messages,
    )

    # share=True also requests a public share link in addition to the local URL
    app.launch(share=True)


if __name__ == "__main__":
    main()


Preparing dataset...
Training model...
Naive Bayes Accuracy: 0.98
Random Forest Accuracy: 0.98
Support Vector Machines ("SVM") Accuracy: 0.99
* Running on local URL:  http://127.0.0.1:7863
* Running on public URL: https://f56266a9b9346a2bae.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
