# Naive Bayes

## Spam/ham classification

In [3]:
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt 
import numpy as np 


df = pd.read_csv("../Data/spam.csv", encoding="latin-1")
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [10]:
df_no_nan=df.dropna(axis=1)
df_no_nan.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
df.columns[2:]

Index(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [12]:
# check how many rows have values in the Unnamed columns
np.sum(df[df.columns[2:].tolist()].notna())

Unnamed: 2    50
Unnamed: 3    12
Unnamed: 4     6
dtype: int64

In [11]:
print(df.shape)
# read some of the Unnamed that have values
#df.loc[df["Unnamed: 2"].notna()].iloc[:5]

(5572, 5)


In [9]:
# as it is very few rows, we remove those columns
df_no_NaN = df.dropna(axis=1)
df_no_NaN.columns = ["class", "content"]
df_no_NaN.head()

Unnamed: 0,class,content
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [15]:
df = pd.get_dummies(df_no_NaN, columns = ["class"], drop_first=True)
df.head()

Unnamed: 0,content,class_spam
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [14]:
X, y = df["content"], df["class_spam"]
X.head()

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: content, dtype: object

In [16]:
y.head()

0    0
1    0
2    1
3    0
4    0
Name: class_spam, dtype: uint8

-----

## TF-IDF vector

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split


tfidf_vectorizer = TfidfVectorizer(stop_words="english")
print(tfidf_vectorizer.get_stop_words())


frozenset({'please', 'along', 'through', 'whence', 'again', 'whether', 'almost', 'how', 'sometimes', 'becomes', 'while', 'last', 'anyway', 'etc', 'another', 'who', 'so', 'thick', 'now', 'anyhow', 'where', 'its', 'see', 'had', 'together', 'made', 'as', 'describe', 'take', 'whither', 'him', 'seeming', 'once', 'cannot', 'do', 'wherein', 'amongst', 'found', 'of', 'name', 'across', 'nevertheless', 'latter', 'five', 'in', 'enough', 'over', 'every', 'everyone', 'seemed', 'himself', 'themselves', 'any', 'bill', 'why', 'alone', 'neither', 'must', 'something', 'all', 'somewhere', 'are', 'when', 'were', 'his', 'among', 're', 'sometime', 'nowhere', 'fifty', 'well', 'yourselves', 'and', 'around', 'bottom', 'thereby', 'such', 'have', 'should', 'more', 'became', 'about', 'with', 'becoming', 'cant', 'other', 'myself', 'than', 'am', 'herein', 'onto', 'go', 'else', 'twelve', 'because', 'each', 'might', 'less', 'couldnt', 'un', 'down', 'whatever', 'ie', 'serious', 'besides', 'whoever', 'hers', 'empty', '

In [19]:
X_tfidf = tfidf_vectorizer.fit_transform(X)
X_tfidf


<5572x8404 sparse matrix of type '<class 'numpy.float64'>'
	with 43478 stored elements in Compressed Sparse Row format>

In [None]:
print(repr(X_tfidf))
print(X_tfidf[1])
print(f"Min value: {X_tfidf.min()}, max value: {X_tfidf.max()}")

In [None]:


X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y, test_size=0.33, random_state=42)

X_train.shape, X_test.shape

In [None]:
fig, ax = plt.subplots(1,2, dpi = 100, figsize = (8,3))
sns.countplot(x = y_train, ax = ax[0])
sns.countplot(x = y_test, ax = ax[1])