<a href="https://colab.research.google.com/github/Vinithpr2004/CODSOFT/blob/main/codsoft_task4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from tqdm import tqdm
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [18]:
#load the datadset
df = pd.read_csv('/content/spam.csv', encoding='latin-1')

In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5169 entries, 0 to 5571
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5169 non-null   object
 1   v2          5169 non-null   object
 2   Unnamed: 2  43 non-null     object
 3   Unnamed: 3  10 non-null     object
 4   Unnamed: 4  5 non-null      object
 5   label       5169 non-null   object
dtypes: object(6)
memory usage: 282.7+ KB


In [33]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4,label
0,ham,"Go until jurong point, crazy.. Available only ...",,,,ham
1,ham,Ok lar... Joking wif u oni...,,,,ham
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,,spam
3,ham,U dun say so early hor... U c already then say...,,,,ham
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,,ham


In [29]:
#preprocessing step

df.drop_duplicates(inplace=True)
df['label'] = df['v1'].map({'ham': 'ham', 'spam': 'spam'})
x = df['v2']
y = df['label']

In [19]:
#splitting df into train & test datasets

x_train, x_test, y_train, y_test = train_test_split(df['v2'], df['v1'], test_size=0.2, random_state=42)


In [20]:
#TF-IDF Vectorization
vectorizer = TfidfVectorizer()
x_train_tfidf = vectorizer.fit_transform(x_train)
x_test_tfidf = vectorizer.transform(x_test)

In [21]:
#building & training SVM model

model = SVC(kernel='linear')
model.fit(x_train_tfidf, y_train)

In [22]:
#prediction
y_pred = model.predict(x_test_tfidf)

In [23]:
#evaluation

accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)


In [51]:
#Classification report for SVM
cl_report_svm = classification_report(y_test, y_pred, target_names=['Valid SMS', 'Spam SMS'])

In [52]:
# SVM Progress bar
progress_bar_svm = tqdm(total=50, position=0, leave=True)

  0%|          | 0/50 [00:00<?, ?it/s]

In [53]:
# Simulate progress updates for SVM
for i in range(10, 51, 10):
    progress_bar_svm.update(10)
    progress_bar_svm.set_description(f'SVM Progress: {i}%')

SVM Progress: 50%: 100%|██████████| 50/50 [00:18<00:00,  1.86s/it]

In [54]:
progress_bar_svm.close()

SVM Progress: 50%: 100%|██████████| 50/50 [00:32<00:00,  1.56it/s]


In [55]:
print(f"Accuracy: {accuracy}")

Accuracy: 0.9829596412556054


In [56]:
print(f"Confusion Matrix:\n{conf_matrix}")

Confusion Matrix:
[[963   2]
 [ 17 133]]


In [57]:
print(f"Classification Report:\n{classification_rep}")
print(cl_report_svm)

Classification Report:
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       965
        spam       0.99      0.89      0.93       150

    accuracy                           0.98      1115
   macro avg       0.98      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115

              precision    recall  f1-score   support

   Valid SMS       0.98      1.00      0.99       965
    Spam SMS       0.99      0.89      0.93       150

    accuracy                           0.98      1115
   macro avg       0.98      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115



Naive Bayes


In [35]:
n_b_classifier = MultinomialNB()

In [36]:
#training Naive Bayes Classifier
n_b_classifier.fit(x_train_tfidf, y_train)

In [38]:
#Naive Bayes Accuracy

accuracy_nb = accuracy_score(y_test, y_pred)


In [40]:
#Classification report for Naive Bayes
cl_report_nb = classification_report(y_test, y_pred, target_names=['Valid SMS', 'Spam SMS'])

In [45]:
#Creating progress bar

progress_bar_nb = tqdm(total=50, position=0, leave=True)

  0%|          | 0/50 [00:00<?, ?it/s]

In [46]:
# Simulate progress updates for Naive Bayes
for i in range(10, 51, 10):
    progress_bar_nb.update(10)
    progress_bar_nb.set_description(f'Naive Bayes Progress: {i}%')

Naive Bayes Progress: 50%: 100%|██████████| 50/50 [00:39<00:00,  3.99s/it]

In [47]:
progress_bar_nb.close()

Naive Bayes Progress: 50%: 100%|██████████| 50/50 [01:10<00:00,  1.40s/it]


Conclusion in Naive Bayes


In [48]:
print('Naive Bayes Classifier:')
print(f'Accuracy: {accuracy_nb:.2f}')
print('Classification Report:')
print(cl_report_nb)


Naive Bayes Classifier:
Accuracy: 0.98
Classification Report:
              precision    recall  f1-score   support

   Valid SMS       0.98      1.00      0.99       965
    Spam SMS       0.99      0.89      0.93       150

    accuracy                           0.98      1115
   macro avg       0.98      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115



Conclusion in SVM

In [50]:
print(f"Accuracy: {accuracy}")

print(f"Confusion Matrix:\n{conf_matrix}")

print(f"Classification Report:\n{classification_rep}")
print(cl_report_svm)

Accuracy: 0.9829596412556054
Confusion Matrix:
[[963   2]
 [ 17 133]]
Classification Report:
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       965
        spam       0.99      0.89      0.93       150

    accuracy                           0.98      1115
   macro avg       0.98      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115

              precision    recall  f1-score   support

   Valid SMS       0.98      1.00      0.99       965
    Spam SMS       0.99      0.89      0.93       150

    accuracy                           0.98      1115
   macro avg       0.98      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115

