<a href="https://colab.research.google.com/github/RitvikVankayala/NLP/blob/main/Spam_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [47]:
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score,f1_score,confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from wordcloud import WordCloud

In [None]:
!wget https://lazyprogrammer.me/course_files/spam.csv

In [49]:
# Read the CSV with an explicit ISO-8859-1 encoding instead of pandas' default
# UTF-8 decoding (presumably the file contains bytes that are not valid
# UTF-8 -- TODO confirm).

df=pd.read_csv('spam.csv',encoding='ISO-8859-1')

In [None]:
# Preview the first rows of the frame as loaded from spam.csv.
df.head()

In [51]:
# Discard the three "Unnamed" columns present in the CSV.

unnamed_cols=["Unnamed: 2","Unnamed: 3","Unnamed: 4"]
df=df.drop(columns=unnamed_cols)

In [None]:
# Preview again to confirm the three "Unnamed" columns are gone.
df.head()

In [53]:
# Rename the two remaining columns: the class label and the message text.
df.columns=['labels','data']

In [None]:
# Preview with the renamed columns ('labels', 'data').
df.head()

In [None]:
# Plot the label counts to see how imbalanced the classes are.
labels=df['labels']
labels.hist()

In [56]:
# Add a numeric target column (spam -> 1, ham -> 0) and keep it as a NumPy array.

label_to_int={"spam":1 , "ham":0}
df['b_labels']=labels.map(label_to_int)
Y=df['b_labels'].to_numpy()


In [57]:
# Hold out 33% of the messages for testing. A fixed random_state makes the
# split, and every metric and misclassification listing derived from it,
# reproducible across kernel restarts.
df_train,df_test,Ytrain,Ytest=train_test_split(df['data'],Y,test_size=0.33,random_state=42)

In [58]:
# Learn the vocabulary from the training messages only, then encode both
# splits with that same fitted vocabulary.

# featurizer=TfidfVectorizer(decode_error='ignore')
featurizer=CountVectorizer(decode_error='ignore')
Xtrain=featurizer.fit_transform(df_train)
Xtest=featurizer.transform(df_test)

In [None]:
# Display the repr of the training feature matrix.
Xtrain

In [None]:
# Fit a multinomial Naive Bayes classifier and report its mean accuracy on
# each split. The lines are labelled so the two numbers can be told apart.

model=MultinomialNB()
model.fit(Xtrain,Ytrain)

print("train accuracy :",model.score(Xtrain,Ytrain))
print("test accuracy :",model.score(Xtest,Ytest))

In [None]:
# Hard class predictions for both splits; they are reused by the confusion
# matrices further down.
Ptrain=model.predict(Xtrain)
Ptest=model.predict(Xtest)

# F1 for the positive class (spam = 1), labelled per split.
print("train f1_score :",f1_score(Ytrain,Ptrain))
print("test f1_score :",f1_score(Ytest,Ptest))

In [62]:
# ROC AUC needs one score per sample. predict_proba returns one column per
# class, so index its OUTPUT (not the feature matrix) and keep column 1, the
# probability of the positive class (spam = 1).

prob_train=model.predict_proba(Xtrain)[:,1]
prob_test=model.predict_proba(Xtest)[:,1]

print("train_AUC :",roc_auc_score(Ytrain,prob_train))
print("test_AUC :",roc_auc_score(Ytest,prob_test))

In [None]:
# Confusion matrix on the training split (rows: true class, columns: predicted class).
cm=confusion_matrix(Ytrain,Ptrain)
cm

In [None]:
def plot_cm(cm):
  """Draw a 2x2 confusion matrix as an annotated heatmap.

  cm is expected in the layout confusion_matrix returns for the 0/1 labels
  used here: rows are the true classes, columns the predicted classes, and
  both are ordered ham (0) then spam (1).
  """
  classes=['ham','spam']
  df_cm=pd.DataFrame(cm,index=classes,columns=classes)
  # fmt='g' prints the cell counts as plain numbers.
  ax=sn.heatmap(df_cm,annot=True,fmt='g')
  ax.set_xlabel("Predicted")
  # Rows hold the true classes; set_ylabel must actually be called for the
  # y-axis label to appear.
  ax.set_ylabel("Actual")

plot_cm(cm)

In [None]:
# Same heatmap for the held-out test split.
cm_test=confusion_matrix(Ytest,Ptest)
plot_cm(cm_test)

In [66]:
# Word cloud of all messages belonging to one class.

def visualize(label):
  """Show a word cloud built from every message whose 'labels' value equals label."""
  selected=df.loc[df['labels']==label,'data']
  # Lower-case each message and follow it with a single space.
  words=''.join(msg.lower()+' ' for msg in selected)
  cloud=WordCloud(width=600,height=400).generate(words)
  plt.imshow(cloud)
  plt.axis('off')
  plt.show()

In [None]:
# Word cloud for the messages labelled 'spam'.
visualize('spam')

In [None]:
# Word cloud for the messages labelled 'ham'.
visualize('ham')

In [69]:
# Predict on every message (training and test rows alike) and store the
# result next to the true labels so wrong predictions can be inspected.

X=featurizer.transform(df['data'])
df['predictions']=model.predict(X)

In [None]:
# Spam messages that the model predicted as ham (false negatives).
# Each comparison is parenthesised because & binds tighter than ==; without
# the parentheses the mask collapses to "predicted ham" and ignores the true label.

not_spam=df[(df['predictions']==0) & (df['b_labels']==1)]['data']
for msg in not_spam:
  print(msg)

In [None]:
# Ham messages that the model predicted as spam (false positives).
# Each comparison is parenthesised because & binds tighter than ==; without
# the parentheses the mask also lets through spam that was predicted as ham.

not_ham=df[(df['predictions']==1) & (df['b_labels']==0)]['data']
for msg in not_ham:
  print(msg)