In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.cluster import KMeans
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.pipeline import Pipeline
from sklearn import set_config
from sklearn.datasets import load_digits
from sklearn.datasets import load_iris

set_config(print_changed_only=False)


from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity='all'

%config InlineBackend.figure_format='svg'
plt.rcParams['figure.dpi']=120

pd.options.display.float_format='{:,.2f}'.format
pd.set_option('display.max_colwidth', None)


<h1 style='color:blue' align='center'>Naive Bayes</h1>

<img src='./data/naiveBayes.JPG' width=800 height=500>
<img src='./data/naiveBayes_1.JPG' width=800 height=500>

# Predict titanic survival using naive bayes

<img src='./data/naiveBayes_2.JPG' width=800 height=500>

In [2]:
df = pd.read_csv("./data/titanic.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.28,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.92,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
df.drop(['PassengerId','Name','SibSp','Parch','Ticket','Cabin','Embarked'],axis='columns',inplace=True)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,male,22.0,7.25
1,1,1,female,38.0,71.28
2,1,3,female,26.0,7.92
3,1,1,female,35.0,53.1
4,0,3,male,35.0,8.05


In [4]:
inputs = df.drop('Survived',axis='columns')
target = df.Survived

In [5]:
dummies=pd.get_dummies(df['Sex'])
dummies.head()

Unnamed: 0,female,male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1


In [6]:
inputs = pd.concat([inputs, dummies],axis='columns')
inputs.head(3)

Unnamed: 0,Pclass,Sex,Age,Fare,female,male
0,3,male,22.0,7.25,0,1
1,1,female,38.0,71.28,1,0
2,3,female,26.0,7.92,1,0


In [7]:
# One column is enough to repressent male vs female

inputs.drop(['Sex','male'],axis='columns',inplace=True)
inputs.head()

Unnamed: 0,Pclass,Age,Fare,female
0,3,22.0,7.25,0
1,1,38.0,71.28,1
2,3,26.0,7.92,1
3,1,35.0,53.1,1
4,3,35.0,8.05,0


In [8]:
inputs.columns[inputs.isna().any()] # inputs.isna().any() --> return column names 

Index(['Age'], dtype='object')

In [9]:
inputs.Age = inputs.Age.fillna(inputs.Age.mean())
inputs.head()

Unnamed: 0,Pclass,Age,Fare,female
0,3,22.0,7.25,0
1,1,38.0,71.28,1
2,3,26.0,7.92,1
3,1,35.0,53.1,1
4,3,35.0,8.05,0


In [10]:
X_train, X_test, y_train, y_test = train_test_split(inputs,target,test_size=0.3)

In [11]:
model = GaussianNB()

In [12]:
model.fit(X_train,y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [13]:
model.score(X_test,y_test)

0.7835820895522388

In [14]:
X_test[0:10]

Unnamed: 0,Pclass,Age,Fare,female
199,2,24.0,13.0,1
351,1,29.7,35.0,0
734,2,23.0,13.0,0
166,1,29.7,55.0,1
247,2,24.0,14.5,1
724,1,27.0,53.1,0
324,3,29.7,69.55,0
413,2,29.7,0.0,0
777,3,5.0,12.47,1
193,2,3.0,26.0,0


In [15]:
y_test[0:10]

199    0
351    0
734    0
166    1
247    1
724    1
324    0
413    0
777    1
193    1
Name: Survived, dtype: int64

In [16]:
model.predict(X_test[0:10])

array([1, 0, 0, 1, 1, 0, 0, 0, 1, 0], dtype=int64)

In [17]:
model.predict_proba(X_test[:10]) # Returns the probability of the samples for each class in the model.

array([[0.24445706, 0.75554294],
       [0.73387584, 0.26612416],
       [0.92030561, 0.07969439],
       [0.04390136, 0.95609864],
       [0.24436075, 0.75563925],
       [0.63257114, 0.36742886],
       [0.88841139, 0.11158861],
       [0.92129929, 0.07870071],
       [0.30805651, 0.69194349],
       [0.86300376, 0.13699624]])

**Calculate the score using cross validation**

In [18]:
cross_val_score(GaussianNB(), X_train, y_train, cv=5)

array([0.76      , 0.776     , 0.768     , 0.75806452, 0.79032258])

# Build email spam detector

<img src='./data/naiveBayes_3.JPG' width=800 height=500>

**NB algorithm:**

1. Gaussian NB: Because of the assumption of the normal distribution, Gaussian Naive Bayes is used in cases when all our features are continuous. For example in Iris dataset features are sepal width, petal width, sepal length, petal length. So its features can have different values in data set as width and length can vary. We can’t represent features in terms of their occurrences. This means data is continuous. Hence we use Gaussian Naive Bayes here.


2. MultiNomial NB: Its is used when we have discrete data (e.g. movie ratings ranging 1 and 5 as each rating will have certain frequency to represent). In text learning we have the count of each word to predict the class or label.


3. Bernoulli NB: It should be used for features with binary or boolean values like True/False or 0/1. It assumes that all our features are binary such that they take only two values. Means 0s can represent “word does not occur in the document” and 1s as "word occurs in the document" .

In [19]:
df = pd.read_csv("./data/spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [20]:
df.groupby('Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representative on FREEPHONE 0808 145 4742 between 9am-11pm as you have WON a guaranteed £1000 cash or £5000 prize!,4


In [21]:
df['spam']=df['Category'].apply(lambda x: 1 if x=='spam' else 0)
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives around here though",0


In [22]:
X=df['Message']
y=df['spam']

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=32)  #If train_size is also None, it will be set to 0.25.

In [24]:
from sklearn.feature_extraction.text import CountVectorizer
v = CountVectorizer()
X_train_count = v.fit_transform(X_train.values) # y values have already converted to 0 or 1

In [25]:
X_train_count

<4179x7502 sparse matrix of type '<class 'numpy.int64'>'
	with 55174 stored elements in Compressed Sparse Row format>

In [26]:
y_test

2795    0
3677    0
290     0
5169    0
1253    0
       ..
2257    0
1937    0
543     0
1233    0
1150    0
Name: spam, Length: 1393, dtype: int64

In [27]:
X_train_count.toarray()[:2]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [28]:
len(X_train_count.toarray())

4179

In [29]:
model = MultinomialNB()
model.fit(X_train_count, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [30]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!',
    'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wa'
]
emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1, 0], dtype=int64)

In [31]:
X_test_count = v.transform(X_test)
model.score(X_test_count, y_test)

0.9842067480258435

### Sklearn Pipeline

```python
from sklearn.pipeline import Pipeline
```

In [32]:
clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [33]:
clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('nb',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [34]:
clf.score(X_test, y_test)

0.9842067480258435

In [35]:
clf.predict(emails)

array([0, 1, 0], dtype=int64)

### Execise

In [36]:
from sklearn import datasets
wine = datasets.load_wine()

In [37]:
dir(wine)

['DESCR', 'data', 'feature_names', 'frame', 'target', 'target_names']

In [38]:
wine.data[:2]

array([[1.423e+01, 1.710e+00, 2.430e+00, 1.560e+01, 1.270e+02, 2.800e+00,
        3.060e+00, 2.800e-01, 2.290e+00, 5.640e+00, 1.040e+00, 3.920e+00,
        1.065e+03],
       [1.320e+01, 1.780e+00, 2.140e+00, 1.120e+01, 1.000e+02, 2.650e+00,
        2.760e+00, 2.600e-01, 1.280e+00, 4.380e+00, 1.050e+00, 3.400e+00,
        1.050e+03]])

In [39]:
wine.feature_names

['alcohol',
 'malic_acid',
 'ash',
 'alcalinity_of_ash',
 'magnesium',
 'total_phenols',
 'flavanoids',
 'nonflavanoid_phenols',
 'proanthocyanins',
 'color_intensity',
 'hue',
 'od280/od315_of_diluted_wines',
 'proline']

In [40]:
wine.target_names

array(['class_0', 'class_1', 'class_2'], dtype='<U7')

In [41]:
wine.target[0:2]

array([0, 0])

In [42]:
df = pd.DataFrame(wine.data, columns=wine.feature_names)
df.head()
df.shape

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


(178, 13)

In [43]:
df['target'] = wine.target
df[50:70]

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
50,13.05,1.73,2.04,12.4,92.0,2.72,3.27,0.17,2.91,7.2,1.12,2.91,1150.0,0
51,13.83,1.65,2.6,17.2,94.0,2.45,2.99,0.22,2.29,5.6,1.24,3.37,1265.0,0
52,13.82,1.75,2.42,14.0,111.0,3.88,3.74,0.32,1.87,7.05,1.01,3.26,1190.0,0
53,13.77,1.9,2.68,17.1,115.0,3.0,2.79,0.39,1.68,6.3,1.13,2.93,1375.0,0
54,13.74,1.67,2.25,16.4,118.0,2.6,2.9,0.21,1.62,5.85,0.92,3.2,1060.0,0
55,13.56,1.73,2.46,20.5,116.0,2.96,2.78,0.2,2.45,6.25,0.98,3.03,1120.0,0
56,14.22,1.7,2.3,16.3,118.0,3.2,3.0,0.26,2.03,6.38,0.94,3.31,970.0,0
57,13.29,1.97,2.68,16.8,102.0,3.0,3.23,0.31,1.66,6.0,1.07,2.84,1270.0,0
58,13.72,1.43,2.5,16.7,108.0,3.4,3.67,0.19,2.04,6.8,0.89,2.87,1285.0,0
59,12.37,0.94,1.36,10.6,88.0,1.98,0.57,0.28,0.42,1.95,1.05,1.82,520.0,1


In [44]:
X=wine.data
y=wine.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100)

In [45]:
model = GaussianNB() # used for continuous data with the features of normal distribution
model.fit(X_train, y_train)
model.score(X_test, y_test)

GaussianNB(priors=None, var_smoothing=1e-09)

1.0

In [46]:
mn = MultinomialNB() # used for discrete data with the features of categorial type
mn.fit(X_train, y_train)
mn.score(X_test, y_test)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

0.7777777777777778