In [1]:
#Import Library
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

## Data Understanding

In [2]:
df = pd.read_csv("sms_spam_indo.csv")

In [3]:
df.head()

Unnamed: 0,Kategori,Pesan
0,spam,Plg Yth: Simcard anda mendptkan bonus poin plu...
1,ham,Iya ih ko sedih sih gtau kapan lg ke bandung :(
2,ham,Kalau mau bikin model/controller mending per a...
3,ham,Selamat nama1. Semoga selalu menempuh hidup ya...
4,spam,Tingkatkan nilai isi ulang Anda selanjutnya mi...


Dataset diatas berisi kolom-kolom sebagai berikut:
1. Kategori : kolom ini mengklasifikasikan jenis pesan, dengan dua nilai yang mungkin: "spam" dan "ham"
2. Pesan : Kolom ini berisi teks pesan sebenarnya.

## Data Preparation

In [4]:
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
import string

# Fungsi untuk membersihkan teks
def clean_text(text):
    text = text.lower()  # Ubah teks ke huruf kecil
    text = re.sub(f"[{string.punctuation}]", "", text)  # Menghapus tanda baca
    text = re.sub("\d+", "", text)  # Menghapus angka
    text = text.strip()
    return text

# Terapkan fungsi pembersihan pada kolom Pesan
df['Pesan'] = df['Pesan'].apply(clean_text)

## Modeling

In [5]:
# Memisahkan fitur dan label
X = df['Pesan']
y = df['Kategori']

# Memisahkan data menjadi training dan testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Mengubah teks menjadi vektor TF-IDF
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Melatih model Naive Bayes
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)


## Evaluation

In [6]:
# Prediksi pada data uji
y_pred = model.predict(X_test_tfidf)

# Evaluasi model
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[109   2]
 [  5 113]]
              precision    recall  f1-score   support

         ham       0.96      0.98      0.97       111
        spam       0.98      0.96      0.97       118

    accuracy                           0.97       229
   macro avg       0.97      0.97      0.97       229
weighted avg       0.97      0.97      0.97       229



## Deployment dengan Streamlit

In [7]:
import streamlit as st
import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import re

# Function to clean the text input
def clean_text(text):
    text = re.sub(r'\W', ' ', text)  # Remove non-alphanumeric characters
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = text.lower().strip()       # Convert to lowercase
    return text

# Load the trained model and TF-IDF vectorizer
model = joblib.load('spam_classifier_model.pkl')
vectorizer = joblib.load('Classification Spam or Ham.pkl')

# Streamlit interface
st.title('SMS Spam Classifier')

# Text input box
message = st.text_area('Enter the SMS message')

# Clean and vectorize the input message
if message:
    cleaned_message = clean_text(message)
    message_tfidf = vectorizer.transform([cleaned_message])

    # Make prediction
    prediction = model.predict(message_tfidf)

    # Display the result
    if prediction == 'spam':
        st.error('This message is classified as SPAM.')
    else:
        st.success('This message is classified as HAM.')

# Save the model and vectorizer as .pkl files
joblib.dump(model, 'spam_classifier_model.pkl')
joblib.dump(vectorizer, 'Classification Spam or Ham.pkl')


2024-09-30 02:10:24.181 
  command:

    streamlit run C:\Users\Redmi_PC\AppData\Roaming\Python\Python311\site-packages\ipykernel_launcher.py [ARGUMENTS]
2024-09-30 02:10:24.183 Session state does not function when running a script without `streamlit run`


['Classification Spam or Ham.pkl']

In [8]:
import joblib

# Save the trained model
joblib.dump(model, 'spam_classifier_model.pkl')

# Save the vectorizer
joblib.dump(vectorizer, 'Classification Spam or Ham.pkl')

['Classification Spam or Ham.pkl']

In [9]:
model = joblib.load('spam_classifier_model.pkl')
vectorizer = joblib.load('Classification Spam or Ham.pkl')