In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import RandomOverSampler

# Step 1: Load the dataset and pull out the two columns the pipeline uses
DATASET_PATH = 'spam_ham_dataset.csv'  # Replace with your dataset path
data = pd.read_csv(DATASET_PATH)
text_data, labels = data['text'], data['label']

# Step 2: Split the RAW data first.
# Splitting must happen before any fitting or resampling; otherwise the
# vectorizer learns vocabulary/IDF statistics from the test rows, and the
# oversampler's duplicated minority rows end up in both train and test,
# which inflates every metric reported later.
y = np.array(labels)
text_train, text_test, y_train_raw, y_test = train_test_split(
    text_data,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,  # keep the class ratio the same in both splits
)

# Step 3: Text Preprocessing with TF-IDF Vectorizer (fit on training text only)
vectorizer = TfidfVectorizer(max_features=5000)  # You can adjust the number of features
X_train_tfidf = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)  # reuse the training vocabulary/IDF

# Step 4: Handle Class Imbalance using Random Oversampling (training split only)
# The test split keeps its original class distribution so the evaluation
# reflects the data the model would actually see.
oversampler = RandomOverSampler(random_state=42)
X_train, y_train = oversampler.fit_resample(X_train_tfidf, y_train_raw)

# Steps 5-6: Create a 100-tree random forest with a fixed random_state and
# fit it on the training split in one chained expression (fit() returns the
# estimator itself).
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Step 7: Predict on the test split and compute the metrics up front
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)

# Step 8: Print the overall accuracy, then the per-class report
print(f'Test Accuracy: {accuracy:.2f}')
print(report)

# Step 9: Plot the Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
plt.imshow(conf_matrix, interpolation='nearest', cmap=plt.get_cmap('Blues'))
plt.title('Confusion Matrix')
plt.colorbar()
# NOTE(review): the tick names assume two classes whose sorted order is
# ham-then-spam (true for 0/1 and for 'ham'/'spam') -- confirm against the data.
plt.xticks([0, 1], ['Ham', 'Spam'])
plt.yticks([0, 1], ['Ham', 'Spam'])
plt.ylabel('True')
plt.xlabel('Predicted')

# Annotate every cell with its count. 'Blues' maps small counts to a
# near-white background, so white text is only used on the darker cells
# (above half of the largest count) and black text everywhere else.
# Iterating over the real shape avoids an IndexError when the matrix is
# not 2x2 (e.g. only one class present in y_test and y_pred).
n_rows, n_cols = conf_matrix.shape
text_threshold = conf_matrix.max() / 2.0
for i in range(n_rows):
    for j in range(n_cols):
        cell_color = 'white' if conf_matrix[i, j] > text_threshold else 'black'
        plt.text(j, i, str(conf_matrix[i, j]), ha='center', va='center',
                 color=cell_color, fontsize=16)
plt.show()

# Step 10: Predict for Sample Text
sample_text = ["Get a free iPhone now!"]
sample_text_vectorized = vectorizer.transform(sample_text)
sample_prediction = model.predict(sample_text_vectorized)

# The model returns whatever values the 'label' column contained.
# NOTE(review): the column's dtype is not visible here -- it may be numeric
# (1 = spam) or textual ('spam'); confirm against the CSV. Comparing only
# against 1 would silently report "Ham" for every input when the labels are
# strings, so both encodings are accepted.
predicted_label = sample_prediction[0]
is_spam = (
    predicted_label == 1
    or str(predicted_label).strip().lower() == 'spam'
)

if is_spam:
    print("Predicted: Spam")
else:
    print("Predicted: Ham")


FileNotFoundError: ignored