In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import classification_report, accuracy_score

# 1. Load the dataset
# Read the CSV from disk into a DataFrame (the file is decoded as ISO-8859-1).
df = pd.read_csv('spam.csv', encoding='ISO-8859-1')
# Strip stray whitespace from header names so the 'v1' / 'v2' lookups below match.
df.columns = df.columns.str.strip()

# 2. Text preprocessing
# Convert text to lowercase
df['v2'] = df['v2'].str.lower()

# One seed shared by every stochastic step so that Restart & Run All
# reproduces the same split, the same forest and therefore the same metrics.
RANDOM_STATE = 42

# Split the dataset into training and testing sets
# 'v2' is the text fed to the vectorizer, 'v1' is the class label.
X = df['v2']
y = df['v1']
# NOTE(review): the split is not stratified, so the class ratio in the test
# set is left to chance; consider stratify=y if the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE
)

# 3. Feature extraction using TF-IDF Vectorizer
# The vocabulary and IDF weights are learned from the training split only and
# then re-used for the test split, so no test information leaks into training.
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# 4. Create individual classifiers
nb_classifier = MultinomialNB()
# probability=True is intentionally NOT set: hard voting only calls predict(),
# and enabling probability estimates makes fit() run an extra internal
# cross-validation without changing the predicted labels.
svm_classifier = SVC()
log_reg_classifier = LogisticRegression()
# Seeded so the forest (bootstrap samples / feature sub-sampling) is repeatable.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)

# 5. Combine classifiers into an ensemble model using Voting Classifier
# Hard voting = majority vote over the predicted labels of the four models.
# NOTE(review): with an even number of voters a 2-2 tie is possible;
# scikit-learn resolves ties by ascending class-label order — confirm that
# this tie-break is acceptable for the minority class.
ensemble_model = VotingClassifier(
    estimators=[
        ('nb', nb_classifier),
        ('svm', svm_classifier),
        ('log_reg', log_reg_classifier),
        ('rf', rf_classifier),
    ],
    voting='hard',
)

# 6. Train the ensemble model
ensemble_model.fit(X_train_tfidf, y_train)

# 7. Evaluate the model on the test data
y_pred = ensemble_model.predict(X_test_tfidf)

# Print evaluation metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.9748803827751196
Classification Report:
              precision    recall  f1-score   support

         ham       0.97      1.00      0.99      1453
        spam       0.99      0.81      0.89       219

    accuracy                           0.97      1672
   macro avg       0.98      0.91      0.94      1672
weighted avg       0.98      0.97      0.97      1672

