In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [6]:
data = pd.read_csv("spam.csv", encoding='latin-1')

In [7]:
df = data.copy()

In [8]:
df.shape
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [9]:
df.drop(columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis = 1, inplace = True)
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
df.columns = ['cat', 'text']

In [11]:
df['cat'] = df['cat'].map({'ham': 0, 'spam': 1})
df.head()

Unnamed: 0,cat,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [12]:
df['cat'].value_counts()

0    4825
1     747
Name: cat, dtype: int64

In [13]:
df = df.drop_duplicates()

In [14]:
df.reset_index(inplace = True, drop = True)

In [15]:
x = df['text']
y = df['cat'].astype(int)

In [16]:
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size=0.2, random_state=21)

In [17]:
feat = TfidfVectorizer()
x_train = feat.fit_transform(x_train)
x_test = feat.transform(x_test)

In [18]:
model = LogisticRegression()
model.fit(x_train, y_train)

In [19]:
y_pred = model.predict(x_test)

In [20]:
print('Accuracy: ', round(accuracy_score(y_test, y_pred)*100, 2),'%')

print('Confusion matrix: \n', confusion_matrix(y_test, y_pred))

print('Classification report: \n', classification_report(y_test, y_pred))

Accuracy:  95.84 %
Confusion matrix: 
 [[905   3]
 [ 40  86]]
Classification report: 
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       908
           1       0.97      0.68      0.80       126

    accuracy                           0.96      1034
   macro avg       0.96      0.84      0.89      1034
weighted avg       0.96      0.96      0.96      1034

