In [1]:
import pandas as pd
import numpy as np
import gradio as gr
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.pipeline import Pipeline
import joblib
import os


In [2]:
# Create project directory structure if it doesn't exist
project_dir = "21-gradio_sms_text_classification"
if not os.path.exists(project_dir):
    os.makedirs(project_dir)
    os.makedirs(os.path.join(project_dir, "data"))
    os.makedirs(os.path.join(project_dir, "models"))


In [None]:
# Function to prepare the dataset
def prepare_dataset():
    # Load the dataset into a DataFrame
    # Assuming the dataset has columns 'label' and 'message'
    # If the columns are different, adjust the code accordingly
    sms_spam_df = pd.read_csv('Resources/SMSSpamCollection.csv', sep=',', names=['label', 'message'], encoding='utf-8')


    # Convert spam/ham to binary labels if needed
    if 'label' in sms_spam_df.columns and sms_spam_df['label'].dtype == object:
        sms_spam_df['label'] = sms_spam_df['label'].map({'ham': 0, 'spam': 1})
    
    # Save the cleaned dataset
    sms_spam_df.to_csv(os.path.join(project_dir, "data", "sms_dataset.csv"), index=False)
    
    return sms_spam_df
# copy_sms_spam_df = prepare_dataset()
# print(copy_sms_spam_df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5573 entries, 0 to 5572
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   label    5572 non-null   float64
 1   message  5573 non-null   object 
dtypes: float64(1), object(1)
memory usage: 87.2+ KB
None


In [6]:
# Function to train the model
def sms_classification(sms_spam_df):
    # Split data into features and target
    X = sms_spam_df['message']
    y = sms_spam_df['label']
    
    # Split into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    
    # Create a pipeline with TF-IDF vectorizer and Naive Bayes classifier
    model = Pipeline([
        ('tfidf', TfidfVectorizer(max_features=5000, ngram_range=(1, 2))),
        ('classifier', MultinomialNB())
    ])
    model.fit(X_train, y_train)
    return model

    # # Train the model
    # model.fit(X_train, y_train)
    
    # # Evaluate the model
    # y_pred = model.predict(X_test)
    # accuracy = accuracy_score(y_test, y_pred)
    # report = classification_report(y_test, y_pred)
    
    # # Plot confusion matrix
    # cm = confusion_matrix(y_test, y_pred)
    # plt.figure(figsize=(8, 6))
    # sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Ham', 'Spam'], yticklabels=['Ham', 'Spam'])
    # plt.xlabel('Predicted')
    # plt.ylabel('Actual')
    # plt.title('Confusion Matrix')
    # plt.savefig(os.path.join(project_dir, "data", "confusion_matrix.png"))
   
# Save the model
# joblib.dump(model, os.path.join(project_dir, "models", "sms_classifier.pkl"))
                    

In [12]:
# Function to classify SMS
def sms_prediction(text, model):
    # Predict if the message is spam or ham
    prediction = model.predict([text])[0]
    # probs = model.predict_proba([text])[0]
    
    # Return the result
    if prediction == 1:
        return f"The text message: '{text}', is Spam)"
    else:
        return f"The text message: '{text}', is *NOT* Spam)"

In [11]:
# Main function
def main():
    # Prepare the dataset
    print("Preparing dataset...")
    sms_spam_df = prepare_dataset()
    
    # Train the model
    print("Training model...")
    model, accuracy, report = sms_classification(sms_spam_df)
    print(f"Model accuracy: {accuracy:.2%}")
    print("Classification report:")
    print(report)
    
    # Define the Gradio interface
    def gradio_predict(message):
        return classify_sms(message, model)
    
    # Create Gradio interface
    demo = gr.Interface(
        fn=gradio_predict,
        inputs=gr.Textbox(lines=5, placeholder="Enter a text message here..."),
        outputs=gr.Textbox(),
        title="SMS Spam Classifier",
        description="Enter a text message to classify it as spam or ham (not spam).",
        examples=[
            ["Congratulations! You've won a $1000 gift card. Call now to claim your prize!"],
            ["Hey, what time should we meet for dinner tonight?"],
            ["URGENT: Your bank account has been suspended. Click here to verify your information."],
            ["Don't forget to pick up milk on your way home."]
        ]
    )

    demo.launch()
    if __name__ == "__main__":
        main()
