**5. Implement the bagging method to train a machine learning model for SMS spam detection.**
```
Dataset: https://www.kaggle.com/datasets/uciml/sms-spam-collection-dataset
```



In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Location of the raw SMS spam CSV (hosted on GitHub)
github_csv_file = (
    'https://raw.githubusercontent.com/'
    'Rk-Pudasaini/Applied_Machine_Learning/main/Datasets/spam.csv'
)

# Load the remote CSV into a DataFrame; the file is decoded as latin1
df = pd.read_csv(
    github_csv_file,
    encoding='latin1',
)

# Preview the first rows
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [2]:
# Check the DataFrame's dimensions as a (rows, columns) tuple
df.shape

(5572, 5)

In [3]:
# Keep only the label ('v1') and message-text ('v2') columns.
# Selecting by name rather than by position (`iloc[:, :-3]`) makes this cell
# idempotent: re-running it returns the same two columns instead of slicing
# three more columns off an already-trimmed frame.
df = df.loc[:, ['v1', 'v2']]
df.head()


Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:

# Features are the SMS texts ('v2'); targets are the labels ('v1')
X, y = df['v2'], df['v1']

# Hold out 20% of the messages for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Turn the text into token-count features; the vocabulary is learned from
# the training split only and then reused to encode the test split
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Base learner: a single decision tree
base_classifier = DecisionTreeClassifier(random_state=42)

# Ensemble: 50 bagged copies of the base decision tree
bagging_classifier = BaggingClassifier(
    base_classifier,
    n_estimators=50,
    random_state=42,
)

# Fit the ensemble on the vectorized training messages
bagging_classifier.fit(X_train_vectorized, y_train)

# Predict labels for the held-out messages
y_pred = bagging_classifier.predict(X_test_vectorized)

# Score the predictions against the true test labels
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Report all metrics in one call; each item is printed on its own line
print(
    f"Accuracy: {accuracy:.4f}",
    "\nConfusion Matrix:",
    conf_matrix,
    "\nClassification Report:",
    class_report,
    sep="\n",
)


Accuracy: 0.9722

Confusion Matrix:
[[955  10]
 [ 21 129]]

Classification Report:
              precision    recall  f1-score   support

         ham       0.98      0.99      0.98       965
        spam       0.93      0.86      0.89       150

    accuracy                           0.97      1115
   macro avg       0.95      0.92      0.94      1115
weighted avg       0.97      0.97      0.97      1115



In [5]:
# Example of an unseen SMS message to classify
new_sms = [
    "Congratulations! You've won a special prize. Click the link to claim.",
]

# Encode the message with the already-fitted CountVectorizer
new_sms_vectorized = vectorizer.transform(new_sms)

# Ask the trained bagging ensemble for its label
prediction = bagging_classifier.predict(new_sms_vectorized)

# Show the predicted label for the single message
print("Prediction:", prediction[0])


Prediction: spam
