In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import nltk
# Stopwords are very common words (e.g. "the", "is") that are usually filtered out of text
from nltk.corpus import stopwords
# The string module provides constants such as string.punctuation
import string
import warnings
warnings.filterwarnings('ignore')
import re

In [None]:
# The CSV lives on Google Drive, so the drive has to be mounted before the
# file can be read. Mounting here keeps this cell working on a fresh runtime
# (Restart & Run All) regardless of when any other mount cell is executed.
from google.colab import drive
drive.mount('/content/drive')

# Load the dataset from spam.csv and preview its first rows.
df=pd.read_csv('/content/drive/MyDrive/spam.csv')
df.head()

In [None]:
# Mount Google Drive at /content/drive (Colab only) so that files stored
# under that path can be read by the notebook.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Report the dataset size first, then show how many rows carry each label.
# The value counts are the cell's last expression so that they are actually
# rendered; a bare expression followed by another statement shows nothing.
print("There are {} rows and {} columns present in the dataset".format(df.shape[0],df.shape[1]))
df.Label.value_counts()


In [None]:
# Concise summary of the frame: column names, non-null counts and dtypes.
df.info()

In [None]:
# Data type of each column.
df.dtypes

In [None]:
# Summary statistics for the dataset's columns.
df.describe()


In [None]:
# Summary statistics computed separately for each Label value, transposed so
# that every label becomes a column.
df.groupby('Label').describe().T

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# missingno is a library for exploratory visualisation of missing data;
# the bar chart shows how complete each column of df is.
import missingno as msno
msno.bar(df)
plt.show()

In [None]:
# Bar chart of how many rows carry each label.
sns.countplot(x='Label',data = df)
# The same class counts as numbers.
from collections import Counter
counter = Counter(df['Label'])
# Print the Counter instance; printing the Counter class itself only shows
# "<class 'collections.Counter'>" and none of the counts.
print(counter)

In [None]:
# Collect every row that repeats an earlier row, then print those rows.
is_duplicate = df.duplicated()
duplicatedRow = df.loc[is_duplicate]
print(duplicatedRow)

In [None]:
# Remove the repeated rows from df itself, then report the remaining size.
df.drop_duplicates(inplace=True)
row_count, column_count = df.shape
print("There are {} rows and {} columns present in the data set".format(row_count,column_count))

In [None]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

# Loaded once: stopwords.words() re-reads the corpus on every call, and the
# tokenizer below would otherwise do that for every single word. A frozenset
# also makes each membership test constant-time.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

def pre_processing(text):
  """Tokenize a message into lower-cased, non-stopword words.

  The input is converted to ``str``, lower-cased, stripped of every
  character in ``string.punctuation``, split on whitespace, and filtered
  against the NLTK English stopword list.

  Digits and non-ASCII characters are kept, exactly as before: an earlier
  regex substitution that would have stripped them was computed but its
  result was never used, so it has been removed as dead code.

  Parameters
  ----------
  text : any
      The raw message; non-string values are passed through ``str()``.

  Returns
  -------
  list of str
      The remaining words, in their original order.
  """
  lowered = str(text).lower()
  no_punctuation = lowered.translate(_PUNCTUATION_TABLE)
  return [word for word in no_punctuation.split() if word not in _ENGLISH_STOPWORDS]


In [None]:
# Sanity-check the tokenizer on the first five EmailText entries.
df['EmailText'].head().apply(pre_processing)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words count matrix of EmailText, tokenized by pre_processing.
vectorizer = CountVectorizer(analyzer=pre_processing)
bow = vectorizer.fit_transform(df['EmailText'], df['Label'])

# One-hot encode Label; drop_first=True drops the first category's column.
df1 = pd.get_dummies(df, columns=['Label'], drop_first=True)
df1


In [None]:
# Dimensions of the bag-of-words matrix: (number of documents, vocabulary size).
bow.shape

In [50]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing.
# random_state makes the split (and every metric computed from it) repeatable
# across runs; stratify keeps the label ratio the same in both partitions.
x_train,x_test,y_train,y_test = train_test_split(
    bow,df1["Label_spam"],test_size=0.2,random_state=42,stratify=df1["Label_spam"])

In [52]:
from sklearn import svm
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyperparameter search for the SVM.
# gamma is only used by the rbf kernel, so the two kernels get separate grids;
# a single combined grid would fit every linear-kernel candidate once per
# gamma value with identical results.
tuned_parameters = [
    {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]},
    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]},
]
model = GridSearchCV(svm.SVC(),tuned_parameters)
model.fit(x_train,y_train)
print(model.best_params_)

In [None]:
# Final classifier.
# NOTE(review): the hyperparameters are hard-coded; presumably they are the
# values reported by the grid search -- confirm they still match its output.
# probability=True enables predict_proba; the probability estimates come from
# an internal cross-validation that shuffles the data, so random_state is
# fixed to keep them repeatable across runs.
model1=svm.SVC(C=100,gamma=0.001,kernel='rbf',probability=True,random_state=42)
model1.fit(x_train,y_train)


In [None]:
# Hard class predictions for the test set.
y_pred=model1.predict(x_test)
# Column 1 of predict_proba, i.e. the probability assigned to the second
# entry of model1.classes_; this is what the ROC curve is built from later.
y_pred_proba=model1.predict_proba(x_test)[:,1]
y_pred_proba

In [67]:
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import cohen_kappa_score,roc_auc_score
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.metrics import log_loss
def classification_metric(y_test,y_pred,y_prob,label,n=1,verbose = False):
  """Print and plot evaluation metrics for a binary classifier.

  Draws a confusion-matrix heatmap with row/column totals, prints the
  classification report, Cohen's kappa and the ROC AUC, and finally draws
  the ROC curve.

  Parameters
  ----------
  y_test : array-like
      True labels.
  y_pred : array-like
      Predicted labels (hard decisions).
  y_prob : array-like
      Score or probability of the positive class for every sample; used for
      both the AUC value and the ROC curve.
  label : sequence of str
      Display names of the classes, in the order used by confusion_matrix.
  n : int, default 1
      When ``verbose`` is true, every n-th ROC threshold is annotated.
  verbose : bool, default False
      Whether to mark and annotate thresholds on the ROC curve.

  Returns
  -------
  None
  """
  cm = confusion_matrix(y_test,y_pred)
  # Append the per-column totals as an extra bottom row, then the per-row
  # totals (including that new row) as an extra right-hand column.
  column_totals = cm.sum(axis=0)
  cm = np.append(cm,column_totals.reshape(1,-1),axis=0)
  row_totals = cm.sum(axis=1)
  cm = np.append(cm,row_totals.reshape(-1,1),axis=1)
  labels = list(label)+['Total']
  plt.figure(figsize=(10,6))
  # The cells are counts, so they are rendered as integers.
  sns.heatmap(cm,annot=True,cmap='summer',fmt='d',xticklabels=labels,yticklabels=labels,linewidths=3,cbar=None)
  plt.xlabel('Predicted Values')
  plt.ylabel('Actual Values')
  plt.title('Confusion Matrix')
  plt.show()

  print('*'*30+'Classification Report'+'*'*30+'\n\n')
  print(classification_report(y_test,y_pred))

  print('\n'+'*'*36+'kappa Score'+'*'*36+'\n\n')
  kappa = cohen_kappa_score(y_test,y_pred)
  print('Kappa score =',kappa)

  print('\n'+'*'*30+'Area Under Curve Score'+'*'*30+'\n\n')
  # AUC must be computed from the continuous scores; feeding it the hard 0/1
  # predictions collapses the curve to a single operating point.
  roc_a = roc_auc_score(y_test,y_prob)
  print('AUC Score=',roc_a)

  plt.figure(figsize=(8,5))
  fpr,tpr,thresh = roc_curve(y_test,y_prob)
  plt.plot(fpr,tpr,'r',label='AUC = {}'.format(roc_a))
  print('Number of probabilities to build ROC = ',len(fpr))
  if verbose:
    # Mark every n-th threshold on the curve and write its value next to it.
    for i in range(len(thresh)):
      if i%n==0:
        plt.text(fpr[i],tpr[i],'%0.2f'%thresh[i])
        plt.plot(fpr[i],tpr[i],'v')
  # Decorate and show the figure exactly once, whether or not the threshold
  # markers were drawn.
  plt.plot([0,1],[0,1],'b--',linewidth=2.0)
  plt.xlabel('False positive Rate')
  plt.ylabel('True Positive Rate')
  plt.title('Receiver Operating Characteristic')
  plt.legend()
  plt.grid()
  plt.show()


# Previously nested inside classification_metric, where nothing could reach
# it; defined at module level so that it is actually usable.
class threshold():
  """Turn a score into a 0/1 decision using a fixed cut-off of 0.5."""
  def __init__(self):
    self.th = 0.5
  def predict_threshold(self,y):
    """Return 1 when y is at or above the cut-off, otherwise 0."""
    return 1 if y>=self.th else 0

In [None]:
# Evaluate the predictions on the test set; with verbose=True and n=10 the
# ROC thresholds whose index is a multiple of 10 are annotated.
classification_metric(y_test,y_pred,y_pred_proba,['ham','spam'],n=10,verbose=True)
