In [19]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the dataset; the file is decoded as latin1
df = pd.read_csv('spam.csv', encoding='latin1')

# Step 1: Retain relevant columns, drop NaN values and rename for clarity.
# The chain builds a new frame, so the raw `df` is left untouched.
df_clean = (
    df[['v1', 'v2']]
    .dropna()
    .rename(columns={'v1': 'label', 'v2': 'message'})
)

# Step 2: Encode labels ('ham' -> 0, 'spam' -> 1)
# Series.map() turns every value that is missing from the mapping into NaN,
# and that happens *after* the dropna() above. Normalise case/whitespace
# first, then fail loudly on anything still unmapped rather than handing NaN
# targets to the classifier.
LABEL_CODES = {'ham': 0, 'spam': 1}
label_codes = (
    df_clean['label']
    .astype(str)
    .str.strip()
    .str.lower()
    .map(LABEL_CODES)
)
unmapped = label_codes.isna()
if unmapped.any():
    unexpected = sorted(df_clean.loc[unmapped, 'label'].astype(str).unique())
    raise ValueError(f'Unexpected label values in spam.csv: {unexpected}')
df_clean['label_encoded'] = label_codes.astype('int64')

# Step 3: Split the data into training and testing sets (80/20, fixed seed)
# NOTE(review): the split is not stratified; consider stratify=y if the
# classes are imbalanced -- that would change the reported metrics.
X = df_clean['message']
y = df_clean['label_encoded']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Bag-of-Words features via CountVectorizer.
# The vocabulary is learned from the training messages only; the test
# messages are then encoded against that same vocabulary.
vectorizer = CountVectorizer(stop_words='english')
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Step 5: Train the Naive Bayes classifier.
# fit() returns the fitted estimator, so construction and training are chained.
naive_bayes_model = MultinomialNB(alpha=2).fit(X_train_vectorized, y_train)

# Step 6: Predict labels for the held-out test messages
y_pred = naive_bayes_model.predict(X_test_vectorized)

# Step 7: Evaluate the model
# Compare the test-set predictions with the true labels.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Each heading is followed by its value on the next line (sep='\n').
print(f'Accuracy: {accuracy:.4f}')
print('Confusion Matrix:', conf_matrix, sep='\n')
print('Classification Report:', report, sep='\n')

# Step 8: Predict on a new email message
# The message is wrapped in a list because the vectorizer takes a collection
# of documents.
sample_message = ["New TEXTBUDDY Chat 2 horny guys in ur area 4 just 25p Free 2 receive Search postcode or at gaytextbuddy.com. "]
sample_vectorized = vectorizer.transform(sample_message)
sample_prediction = naive_bayes_model.predict(sample_vectorized)

# Output the prediction for the sample message (encoded label 1 means spam)
if sample_prediction[0] == 1:
    predicted_label = 'Spam'
else:
    predicted_label = 'Not Spam'
print(f'The message is classified as: {predicted_label}')


Accuracy: 0.9848
Confusion Matrix:
[[962   3]
 [ 14 136]]
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       965
           1       0.98      0.91      0.94       150

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.97      1115
weighted avg       0.98      0.98      0.98      1115

The message is classified as: Spam


In [20]:
import gradio as gr

# Define the function to classify text
def classify_text(message):
    """Classify a single message as 'Spam' or 'Not Spam'.

    Relies on the notebook-level ``vectorizer`` and ``naive_bayes_model``
    names, so the training cell must have been run in the same kernel first.

    Args:
        message: Text to classify. ``None`` (e.g. from a programmatic call
            with no text) is treated as an empty message; any other
            non-string value is converted with ``str()``.

    Returns:
        str: 'Spam' when the model predicts class 1, otherwise 'Not Spam'.
    """
    # The vectorizer works on string documents; coerce so a missing value
    # does not surface as an error in the UI.
    text = '' if message is None else str(message)
    features = vectorizer.transform([text])
    prediction = naive_bayes_model.predict(features)
    return 'Spam' if prediction[0] == 1 else 'Not Spam'

# Build the input widget first, then wire it into the Gradio interface
message_box = gr.Textbox(lines=2, placeholder="Enter a message...")

iface = gr.Interface(
    fn=classify_text,
    inputs=message_box,
    outputs="text",
    title="Spam Classifier",
    description="Enter a message to classify it as Spam or Not Spam.",
)

# Start the local web app for the interface
iface.launch()

Running on local URL:  http://127.0.0.1:7871

To create a public link, set `share=True` in `launch()`.




In [21]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups

# Load the dataset (subset='all' requests the whole corpus)
newsgroups = fetch_20newsgroups(subset='all')

# Lookup table from integer target id to newsgroup name
id_to_name = dict(enumerate(newsgroups.target_names))

# Build the frame and swap the numeric targets for their names in one chain
df = pd.DataFrame(
    {'text': newsgroups.data, 'target': newsgroups.target}
).assign(target=lambda frame: frame['target'].map(id_to_name))

# Save as CSV, without the index column
df.to_csv('20_newsgroups.csv', index=False)

print("Dataset saved as '20_newsgroups.csv'")


Dataset saved as '20_newsgroups.csv'
