In [2]:
#import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB

Loading the dataset using the pandas library

In [3]:
# Read the raw CSV; the file is decoded as ISO-8859-1 (Latin-1).
df = pd.read_csv(
    "spam.csv",
    encoding="ISO-8859-1",
)
# Preview the first rows to see the column layout.
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


Preprocessing

'info()' in pandas provides a concise summary of a DataFrame, including information on the data types, non-null values, and memory usage.

In [4]:
# Summarize each column's dtype and non-null count to see which columns are actually populated.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


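'describe()' in pandas generates descriptive statistics of a DataFrame. For numeric columns these are count, mean, standard deviation, minimum, and maximum values; for object (text) columns, as in this dataset, it instead reports count, number of unique values, the most frequent value (top), and its frequency (freq).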

In [5]:
# Every column is object dtype here, so this reports count / unique / top / freq per column.
df.describe()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
count,5572,5572,50,12,6
unique,2,5169,43,10,5
top,ham,"Sorry, I'll call later","bt not his girlfrnd... G o o d n i g h t . . .@""","MK17 92H. 450Ppw 16""","GNT:-)"""
freq,4825,30,3,2,2


'isnull().sum()' in pandas returns the count of missing values for each column in a DataFrame.

In [6]:
# Count the missing (NaN) entries in each column.
df.isnull().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [7]:
# Element-wise missing-value mask: True wherever a cell is NaN.
# NOTE(review): this displays the whole boolean frame; the per-column counts
# from df.isnull().sum() are usually the more readable summary.
df.isnull()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,False,False,True,True,True
1,False,False,True,True,True
2,False,False,True,True,True
3,False,False,True,True,True
4,False,False,True,True,True
...,...,...,...,...,...
5567,False,False,True,True,True
5568,False,False,True,True,True
5569,False,False,True,True,True
5570,False,False,True,True,True


In [8]:
# Drop the three 'Unnamed' columns, which are almost entirely null.
# The labels are passed as a plain list (not a DataFrame slice), and the result
# is reassigned instead of using inplace=True. Together with errors='ignore'
# this keeps the cell safe to re-run: a second execution is a no-op rather than
# a KeyError for columns that are already gone.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [9]:
# Preview the frame again to check the label (v1) and message text (v2) columns.
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


LabelEncoder converts categorical labels into integer codes. Classes are numbered in sorted order, so here 'ham' becomes 0 and 'spam' becomes 1.

In [10]:
from sklearn.preprocessing import LabelEncoder

# Replace the string labels in 'v1' with integer codes.
# The encoder is fitted first and then applied, which is equivalent to a single
# fit_transform call on the same column.
lab = LabelEncoder()
lab.fit(df['v1'])
df['v1'] = lab.transform(df['v1'])

# Show the distinct codes that now appear in the label column.
df['v1'].unique()

array([0, 1])

In [11]:
# Features are the message texts (v2); targets are the label column (v1).
x, y = df['v2'], df['v1']
print(x)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: v2, Length: 5572, dtype: object


In [12]:
# Split into training and test sets: 20% of the rows are held out for testing,
# and the fixed random_state makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

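TfidfVectorizer() initializes the TF-IDF vectorizer, and fit_transform(corpus) fits it to the provided corpus (a collection of text documents) and transforms the documents into a sparse matrix of TF-IDF features. The vectorizer is fitted on the training texts only; the test texts are then converted with transform(), so they reuse the vocabulary and weights learned from the training set.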

In [13]:
# Targets: cast both label vectors to integers.
y_train = y_train.astype('int')
y_test = y_test.astype('int')

# Inputs: cast every message to str before vectorizing.
x_train = x_train.astype('str')
x_test = x_test.astype('str')

# TF-IDF vectorizer that lowercases the text and removes English stop words.
feature_vectorizer = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)

# Fit on the training texts only, then apply the fitted vectorizer to the test texts.
X_train_transformed = feature_vectorizer.fit_transform(x_train)
X_test_transformed = feature_vectorizer.transform(x_test)

Model creation and evaluation

In [14]:
# Using the Gaussian Naive Bayes classifier.
# The sparse TF-IDF matrices are converted to dense arrays with .toarray()
# before being handed to the model; fit() returns the fitted estimator, so the
# construction and training are chained.
classifier = GaussianNB().fit(X_train_transformed.toarray(), y_train)
y_pred = classifier.predict(X_test_transformed.toarray())

# Share of test messages whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.884304932735426


In [17]:
# Using a Support Vector Machine.
from sklearn import svm

# Linear-kernel SVM trained directly on the sparse TF-IDF matrices. SVC accepts
# scipy sparse input, so the features are no longer densified with .toarray(),
# which avoids materializing a large, mostly-zero dense array for fit/predict.
model = svm.SVC(kernel='linear')
model.fit(X_train_transformed, y_train)
y_pred = model.predict(X_test_transformed)

# Share of test messages whose predicted label equals the true label.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.979372197309417


In [18]:
from sklearn.metrics import confusion_matrix, classification_report

# Detailed evaluation of whatever predictions are currently stored in y_pred
# (the SVM cell is the last one above to assign it): first the confusion
# matrix, then the per-class precision / recall / F1 report, one per line.
print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep="\n",
)

[[961   4]
 [ 19 131]]
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.97      0.87      0.92       150

    accuracy                           0.98      1115
   macro avg       0.98      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115

