<a href="https://colab.research.google.com/github/Sumaiya379/AI-and-ML/blob/main/Naive_Manual.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import csv
import string

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
# Load the tab-separated, header-less corpus into two columns: label, message.
# quoting=csv.QUOTE_NONE makes the parser treat '"' as an ordinary character.
# With the default quoting, a message containing an unbalanced double quote
# can absorb the following lines into one field, silently merging rows.
df = pd.read_table(
    'SMSSpamCollection',
    sep='\t',
    header=None,
    names=['label', 'message'],
    quoting=csv.QUOTE_NONE,
)

# Encode the target numerically: ham -> 0, spam -> 1.
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

print("\nSample before preprocessing:\n", df.head())

def preprocess_text(message):
    """Return *message* lower-cased with ASCII punctuation removed.

    Every character listed in ``string.punctuation`` is deleted outright
    (not replaced by a space), so "don't" becomes "dont".
    """
    # A translation table that only carries a delete-set drops each
    # punctuation character and leaves everything else untouched.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    return message.lower().translate(strip_punctuation)

# Normalise every message before splitting and vectorising.
df['message'] = df['message'].apply(preprocess_text)

print(f"\nSample after preprocessing:\n {df['message'].head()}")

# Hold out a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['message'],
    df['label'],
    random_state=1,
)

print(f"\nOriginal dataset contains {len(df)} messages")
print(f"Training set contains {len(X_train)} messages")
print(f"Testing set contains {len(X_test)} messages")

# Bag-of-words features: the vocabulary is fitted on the training messages
# only, and the test messages are encoded with that same vocabulary.
count_vector = CountVectorizer()
X_train_counts = count_vector.fit_transform(X_train)
X_test_counts = count_vector.transform(X_test)

# fit() returns the estimator itself, so construction and training chain.
naive_bayes = MultinomialNB().fit(X_train_counts, y_train)

# Score the held-out messages.
predictions = naive_bayes.predict(X_test_counts)
print(f"\nAccuracy: {accuracy_score(y_test, predictions)}")
print(f"Confusion Matrix:\n {confusion_matrix(y_test, predictions)}")


Sample before preprocessing:
    label                                            message
0      0  Go until jurong point, crazy.. Available only ...
1      0                      Ok lar... Joking wif u oni...
2      1  Free entry in 2 a wkly comp to win FA Cup fina...
3      0  U dun say so early hor... U c already then say...
4      0  Nah I don't think he goes to usf, he lives aro...

Sample after preprocessing:
 0    go until jurong point crazy available only in ...
1                              ok lar joking wif u oni
2    free entry in 2 a wkly comp to win fa cup fina...
3          u dun say so early hor u c already then say
4    nah i dont think he goes to usf he lives aroun...
Name: message, dtype: object

Original dataset contains 5572 messages
Training set contains 4179 messages
Testing set contains 1393 messages

Accuracy: 0.9870782483847811
Confusion Matrix:
 [[1205    3]
 [  15  170]]


Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
