In [34]:
import os

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [35]:
# Load the spam / not-spam email CSV into a pandas DataFrame.
# The file location can be overridden with the SPAM_DATA_PATH environment
# variable so the notebook is not tied to one machine; when the variable is
# unset, the original local path is used.
data_path = os.environ.get('SPAM_DATA_PATH', r'C:\Users\RISHABH\Desktop\spam_or_not_spam.csv')
data = pd.read_csv(data_path)

# Check the structure of the dataset (first rows of every column)
print(data.head())


                                               email  label
0   date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...      0
1  martin a posted tassos papadopoulos the greek ...      0
2  man threatens explosion in moscow thursday aug...      0
3  klez the virus that won t die already the most...      0
4   in adding cream to spaghetti carbonara which ...      0


In [40]:
# Select the 'email' (text) and 'label' (target) columns.
# Missing emails are replaced with empty strings. The result is assigned back
# to the column instead of calling fillna(..., inplace=True) on the selected
# Series: the chained in-place form operates on a temporary object under
# pandas Copy-on-Write and would leave the NaNs in `data` untouched.
data['email'] = data['email'].fillna('')
X = data['email']
y = data['label']

# Split the data into training and testing sets (test_size=0.2 holds out 20%);
# the fixed random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [43]:
# Feature extraction: turn each email into a vector of token counts.
vectorizer = CountVectorizer()

# Cast both splits to NumPy Unicode arrays before vectorizing.
train_docs = X_train.values.astype('U')
test_docs = X_test.values.astype('U')

# Fit the vocabulary on the training emails only, then encode the test emails
# with that same fitted vectorizer.
X_train_vectorized = vectorizer.fit_transform(train_docs)
X_test_vectorized = vectorizer.transform(test_docs)


In [44]:
# Train a Multinomial Naive Bayes classifier on the vectorized training emails.
model = MultinomialNB()
model.fit(X=X_train_vectorized, y=y_train)

In [47]:
# Predict labels for the vectorized test emails (scoring happens in a later cell).
y_pred = model.predict(X=X_test_vectorized)

In [48]:
# Score the test-set predictions, reporting each metric right after it is computed.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy}')

conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(f'Confusion Matrix:\n{conf_matrix}')

class_report = classification_report(y_true=y_test, y_pred=y_pred)
print(f'Classification Report:\n{class_report}')

Accuracy: 0.9916666666666667
Confusion Matrix:
[[500   0]
 [  5  95]]
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       500
           1       1.00      0.95      0.97       100

    accuracy                           0.99       600
   macro avg       1.00      0.97      0.98       600
weighted avg       0.99      0.99      0.99       600



In [52]:
# Try the trained model on a single sample message.
new_sms = ["help wanted we are a NUMBER year old fortune NUMBER company that is growing at a tremendous rate we are looking for individuals who want to work from home this is an opportunity to make an excellent income no experience is required we will train you so if you are looking to be employed from home with a career that has vast opportunities then go URL we are looking for energetic and self motivated people if that is you than click on the link and fill out the form and one of our employement specialist will contact you to be removed from our link simple go to URL "]

# Encode with the already-fitted vectorizer (transform only, no refit), then classify.
new_sms_vectorized = vectorizer.transform(raw_documents=new_sms)
prediction = model.predict(X=new_sms_vectorized)
print(f"Prediction: {prediction}")


Prediction: [1]
