# Importing libraries

In [1]:
import streamlit as st
import pandas as pd
import numpy as np
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier

# Loading data

In [2]:
# Load dataset
df = pd.read_csv(r"D:\AI Engineering\Level 2\Semester 1\NLP\Section\SMSSpamCollection.csv",
                  sep='\t', header=None, names=['label', 'message'])

# Data preprocessing

### splitting data

In [3]:
# Convert labels: 'ham' -> 0, 'spam' -> 1
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

# Split data
X = df['message']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Victorizing

In [4]:
# Vectorization
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Balance data using undersampling

In [5]:
# Apply Undersampling
undersampler = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train_vectorized, y_train)

# Train model

In [6]:
# Train Base Models
gb = RandomForestClassifier(max_depth=8, random_state=42, class_weight='balanced')
rf = RandomForestClassifier(max_depth=8, random_state=42, class_weight='balanced')

gb.fit(X_train_resampled, y_train_resampled)
rf.fit(X_train_resampled, y_train_resampled)

In [7]:
# Get probability predictions for the training set
gb_train_preds = gb.predict_proba(X_train_resampled)[:, 1]
rf_train_preds = rf.predict_proba(X_train_resampled)[:, 1]

# Stack predictions for training meta model
X_meta_train = np.column_stack((gb_train_preds, rf_train_preds))

# Train meta model
meta_model = RandomForestClassifier(max_depth=8, random_state=42, class_weight='balanced')
meta_model.fit(X_meta_train, y_train_resampled)

# Save & load model

In [8]:
# Save models
with open("spam_classifier.pkl", "wb") as model_file:
    pickle.dump((vectorizer, gb, rf, meta_model), model_file)

# Load models
with open("spam_classifier.pkl", "rb") as model_file:
    vectorizer, gb, rf, meta_model = pickle.load(model_file)

# Evaluate model

In [9]:
# Generate predictions for the test set
gb_test_preds = gb.predict_proba(X_test_vectorized)[:, 1]
rf_test_preds = rf.predict_proba(X_test_vectorized)[:, 1]

# Stack predictions for test set
X_meta_test = np.column_stack((gb_test_preds, rf_test_preds))

# Final Predictions
final_predictions = meta_model.predict(X_meta_test)

# Evaluation Metrics
accuracy = accuracy_score(y_test, final_predictions)
classification_rep = classification_report(y_test, final_predictions)
conf_matrix = confusion_matrix(y_test, final_predictions)

# Print evaluation results
print(f"Model Accuracy: {accuracy:.4f}")
print("\nClassification Report:\n", classification_rep)
print("\nConfusion Matrix:\n", conf_matrix)

Model Accuracy: 0.9345

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.93      0.96       965
           1       0.68      0.95      0.79       149

    accuracy                           0.93      1114
   macro avg       0.84      0.94      0.88      1114
weighted avg       0.95      0.93      0.94      1114


Confusion Matrix:
 [[900  65]
 [  8 141]]


# Making user interface

In [10]:
# Streamlit UI
st.title("\U0001F4E9 Spam Message Classifier")
st.write("Enter a message below to check if it's Spam or Ham.")

# User input
user_input = st.text_area("Type your message here:")
if st.button("Classify Message"):
    if user_input.strip():
        # Preprocess input
        msg_vectorized = vectorizer.transform([user_input])
        
        # Get probabilities from base models
        msg_gb = gb.predict_proba(msg_vectorized)[:, 1]
        msg_rf = rf.predict_proba(msg_vectorized)[:, 1]
        
        # Stack predictions
        msg_meta = np.column_stack((msg_gb, msg_rf))
        
        # Final prediction using trained meta_model
        prediction = meta_model.predict(msg_meta)[0]
        result = "Spam" if prediction == 1 else "Ham"
        
        st.success(f"**Result:** {result}")
    else:
        st.warning("Please enter a message to classify.")

2025-02-19 19:18:53.141 
  command:

    streamlit run c:\Users\RANA\anaconda3\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
2025-02-19 19:18:53.147 Session state does not function when running a script without `streamlit run`
