In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the SMS dataset into a DataFrame and print it
df = pd.read_csv("spam.csv", encoding='latin')
print(df)

# Hold out 20% of the rows for testing: 'v2' is the message text, 'v1' the label
X_train, X_test, y_train, y_test = train_test_split(
    df['v2'],
    df['v1'],
    test_size=0.2,
    random_state=42,
)

# Bag-of-words counts: the vocabulary is learned from the training messages only,
# then reused unchanged to encode the test messages
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Fit a multinomial Naive Bayes model on the training counts
classifier = MultinomialNB().fit(X_train_vectorized, y_train)

# Predict a label for every held-out message
predictions = classifier.predict(X_test_vectorized)

# Put each test message next to its true and predicted label, then print the table
df_test = pd.DataFrame({'Text': X_test, 'Actual': y_test, 'Predicted': predictions})
print(df_test)

# Score the predictions against the true labels
accuracy = accuracy_score(y_test, predictions)
conf_matrix = confusion_matrix(y_test, predictions)
class_report = classification_report(y_test, predictions)

print(f"Accuracy: {accuracy}")
print(f"Confusion Matrix:\n{conf_matrix}")
print(f"Classification Report:\n{class_report}")

        v1                                                 v2 Unnamed: 2  \
0      ham  Go until jurong point, crazy.. Available only ...        NaN   
1      ham                      Ok lar... Joking wif u oni...        NaN   
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3      ham  U dun say so early hor... U c already then say...        NaN   
4      ham  Nah I don't think he goes to usf, he lives aro...        NaN   
...    ...                                                ...        ...   
5567  spam  This is the 2nd time we have tried 2 contact u...        NaN   
5568   ham              Will Ì_ b going to esplanade fr home?        NaN   
5569   ham  Pity, * was in mood for that. So...any other s...        NaN   
5570   ham  The guy did some bitching but I acted like i'd...        NaN   
5571   ham                         Rofl. Its true to its name        NaN   

     Unnamed: 3 Unnamed: 4  
0           NaN        NaN  
1           NaN        NaN  


In [None]:
!pip install kaggle




In [None]:
import os
from google.colab import files

# Prompt for the Kaggle API token file; upload() returns a dict keyed by file name
uploaded = files.upload()

# Fail fast with a clear message: the following cell moves "kaggle.json" by that
# exact name, so a cancelled upload or a differently named file would break it.
if "kaggle.json" not in uploaded:
    raise FileNotFoundError(
        "Expected an uploaded file named 'kaggle.json'; "
        f"got: {sorted(uploaded)}"
    )

Saving kaggle.json to kaggle.json


In [None]:
# Put the uploaded API token in ~/.kaggle/kaggle.json
# (presumably where the kaggle CLI used below looks for credentials — confirm against the Kaggle docs).
# 1) create the directory if it does not exist yet (-p: no error when it already does)
!mkdir -p ~/.kaggle
# 2) move the token out of the working directory into that directory
!mv kaggle.json ~/.kaggle/
# 3) restrict the file to owner read/write only, since it contains a secret
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the dataset archive `uciml/sms-spam-collection-dataset` with the kaggle CLI
# into the current working directory (saved as sms-spam-collection-dataset.zip).
!kaggle datasets download -d uciml/sms-spam-collection-dataset

Downloading sms-spam-collection-dataset.zip to /content
  0% 0.00/211k [00:00<?, ?B/s]
100% 211k/211k [00:00<00:00, 107MB/s]


In [None]:
!unzip sms-spam-collection-dataset.zip

Archive:  sms-spam-collection-dataset.zip
  inflating: spam.csv                


In [None]:
import pandas as pd

# List of encodings to try, in order of preference
encodings = ['utf-8', 'latin1', 'ISO-8859-1', 'cp1252']

# Try reading the CSV file using different encodings until successful
for encoding in encodings:
    try:
        # Only the read itself is guarded, so unrelated errors are not mistaken
        # for a decoding failure
        df = pd.read_csv("spam.csv", encoding=encoding)
    except UnicodeDecodeError:
        print(f"Failed to read using encoding: {encoding}. Trying next encoding...")
    else:
        print(f"Successfully read the file using encoding: {encoding}")
        break  # Stop at the first encoding that decodes the file
else:
    # No candidate worked: stop here instead of silently continuing with a
    # missing `df` (or a stale one left over from an earlier cell)
    raise ValueError(f"Could not decode spam.csv with any of: {encodings}")


Failed to read using encoding: utf-8. Trying next encoding...
Successfully read the file using encoding: latin1


In [None]:
# Count the missing values in each column
print(df.isna().sum())

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64


In [None]:
# Show the loaded DataFrame as the cell's output
df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Fetch the NLTK resources the preprocessing step relies on:
# tokenizer models for word_tokenize, the stop-word lists, and WordNet for the lemmatizer.
nltk.download('punkt')
# Newer NLTK releases (3.9 and later) load the tokenizer from 'punkt_tab' rather than
# the pickled 'punkt' models; without it word_tokenize raises a LookupError.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Text preprocessing resources, built once and reused for every message
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


def preprocess_text(text):
    """Normalize one message into a space-separated string of lemmas.

    The input is coerced to ``str`` and lower-cased, then tokenized with
    ``word_tokenize``. Tokens that are not purely alphabetic, or that appear
    in ``stop_words``, are dropped; the remaining tokens are lemmatized and
    joined with single spaces.

    Args:
        text: The raw message; any value is accepted because it is passed
            through ``str()`` first.

    Returns:
        str: The cleaned text (an empty string when no token survives).
    """
    kept = []
    for word in word_tokenize(str(text).lower()):
        # Skip punctuation, numbers, mixed tokens and stop words
        if not word.isalpha() or word in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(word))
    return ' '.join(kept)

# Apply text preprocessing to 'v2' column
df['processed_text'] = df['v2'].apply(preprocess_text)
y = df['v1']

# Split the processed text first so the TF-IDF step below never sees test messages.
# Fitting the vectorizer on the whole dataset before splitting would leak test-set
# vocabulary and document frequencies into the training features.
text_train, text_test, y_train, y_test = train_test_split(
    df['processed_text'], y, test_size=0.2, random_state=42
)

# Feature extraction using TF-IDF vectorization: fit on the training text only,
# then reuse the learned vocabulary and IDF weights to encode the test text
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train = tfidf_vectorizer.fit_transform(text_train)
X_test = tfidf_vectorizer.transform(text_test)

# Full feature matrix, kept under its original name for any later cell that uses `X`;
# it is encoded with the vocabulary learned from the training split
X = tfidf_vectorizer.transform(df['processed_text'])

# Train a linear-kernel support vector classifier
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train, y_train)

# Predict on the test set
y_pred = svm_classifier.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Display classification report and confusion matrix
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.97

Classification Report:
              precision    recall  f1-score   support

         ham       0.97      0.99      0.98       965
        spam       0.96      0.83      0.89       150

    accuracy                           0.97      1115
   macro avg       0.97      0.91      0.94      1115
weighted avg       0.97      0.97      0.97      1115


Confusion Matrix:
[[960   5]
 [ 25 125]]


In [None]:
# Show the DataFrame again as the cell's output; it now carries the 'processed_text' column
df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4,processed_text
0,ham,"Go until jurong point, crazy.. Available only ...",,,,go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,,,,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,,free entry wkly comp win fa cup final tkts may...
3,ham,U dun say so early hor... U c already then say...,,,,u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,,nah think go usf life around though
...,...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,,time tried contact u pound prize claim easy ca...
5568,ham,Will Ì_ b going to esplanade fr home?,,,,b going esplanade fr home
5569,ham,"Pity, * was in mood for that. So...any other s...",,,,pity mood suggestion
5570,ham,The guy did some bitching but I acted like i'd...,,,,guy bitching acted like interested buying some...
