In [1]:
import pandas as pd

In [2]:
# Load the Titanic passenger table (path is relative to the working directory)
# and preview the first rows.
df = pd.read_csv("titanic.csv")
df.head(n=5)

Unnamed: 0,PassengerId,Name,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,1,"Braund, Mr. Owen Harris",3,male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,"Heikkinen, Miss. Laina",3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,female,35.0,1,0,113803,53.1,C123,S,1
4,5,"Allen, Mr. William Henry",3,male,35.0,0,0,373450,8.05,,S,0


In [3]:
# Keep only the columns used as model inputs plus the Survived label.
# Reassigning the result (instead of inplace=True) together with
# errors='ignore' makes this cell safe to re-run: a second execution no longer
# raises KeyError for columns that were already removed.
unused_columns = ['Name', 'PassengerId', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked']
df = df.drop(columns=unused_columns, errors='ignore')
df.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Survived
0,3,male,22.0,7.25,0
1,1,female,38.0,71.2833,1
2,3,female,26.0,7.925,1
3,1,female,35.0,53.1,1
4,3,male,35.0,8.05,0


In [13]:
# Separate the label column from the feature columns.
target = df['Survived']
inputs = df.drop(columns=['Survived'])

In [14]:
# One-hot encode Sex as integer 0/1 columns, one column per category.
dummies = pd.get_dummies(inputs['Sex'], dtype=int)
dummies.head()

Unnamed: 0,female,male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1


In [15]:
# Swap the text Sex column for its one-hot columns.
# Sex and any dummy columns appended by a previous run are dropped first
# (errors='ignore' skips the ones that are absent), so the cell is idempotent:
# re-running it neither duplicates female/male nor raises KeyError on 'Sex'.
stale_columns = ['Sex', *dummies.columns]
inputs = pd.concat(
    [inputs.drop(columns=stale_columns, errors='ignore'), dummies],
    axis=1,
)
inputs.head()

Unnamed: 0,Pclass,Age,Fare,female,male
0,3,22.0,7.25,0,1
1,1,38.0,71.2833,1,0
2,3,26.0,7.925,1,0
3,1,35.0,53.1,1,0
4,3,35.0,8.05,0,1


In [16]:
# List the feature columns that contain at least one missing value.
inputs.columns[inputs.isna().any()]

Index(['Age'], dtype='object')

In [17]:
# Peek at the first ten ages to see what the missing entries look like.
inputs['Age'].head(10)

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
5     NaN
6    54.0
7     2.0
8    27.0
9    14.0
Name: Age, dtype: float64

In [22]:
# Fill missing ages with the column mean, then confirm no column has NaNs left.
# NOTE(review): the mean is taken over every row, including rows that the
# train/test split further down assigns to the test set - consider imputing
# after the split if strict separation matters.
mean_age = inputs['Age'].mean()
inputs['Age'] = inputs['Age'].fillna(mean_age)
inputs.columns[inputs.isna().any()]

Index([], dtype='object')

In [23]:
from sklearn.model_selection import train_test_split

In [24]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    inputs,
    target,
    test_size=0.2,
    random_state=37,
)

In [25]:
# Training rows: what remains after holding out test_size=0.2 in the split.
len(X_train)

712

In [26]:
# Held-out rows reserved for evaluation (test_size=0.2 in the split).
len(X_test)

179

In [27]:
from sklearn.naive_bayes import GaussianNB

In [28]:
# Gaussian Naive Bayes classifier with default settings.
model = GaussianNB()

In [29]:
# Fit on the training split only; the test split stays unseen until scoring.
model.fit(X_train, y_train)

In [30]:
# Mean accuracy on the held-out test rows.
model.score(X_test, y_test)

0.7877094972067039

In [31]:
# Predict the first ten test rows and keep the matching true labels so the
# two can be compared side by side in the next cells.
y_predicted = model.predict(X_test.iloc[:10])
y_subtest = y_test.iloc[:10]

In [32]:
# Predicted labels for the first ten test rows.
y_predicted

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0], dtype=int64)

In [35]:
# True labels for the same ten rows (the index shows the original row numbers).
y_subtest

601    0
743    0
673    1
160    0
786    1
620    0
203    0
342    0
266    0
745    0
Name: Survived, dtype: int64

In [36]:
# Class probabilities for the same ten rows; columns follow model.classes_
# (here: not survived, survived).
model.predict_proba(X_test.iloc[:10])

array([[0.99065266, 0.00934734],
       [0.98994917, 0.01005083],
       [0.9801086 , 0.0198914 ],
       [0.99052529, 0.00947471],
       [0.05825737, 0.94174263],
       [0.99048078, 0.00951922],
       [0.99012022, 0.00987978],
       [0.9794643 , 0.0205357 ],
       [0.98385621, 0.01614379],
       [0.58176056, 0.41823944]])

# Email spam detection

In [37]:
# Load the labelled messages (path is relative to the working directory).
# Note: this rebinds `df`, replacing the Titanic frame used above.
df = pd.read_csv("spam.csv")
df.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [38]:
# Per-category summary of the messages: count, number of unique texts,
# most frequent text and how often it occurs.
df.groupby('Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [39]:
# Binary label: 1 for 'spam', 0 for every other category.
# A vectorized comparison replaces the per-row Python lambda; the explicit
# int64 cast keeps the same dtype the lambda-based column had on any platform.
df['spam'] = df['Category'].eq('spam').astype('int64')
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [40]:
# Split raw message text and labels 80/20 with a fixed seed.
# Note: this rebinds X_train / X_test / y_train / y_test from the Titanic section.
X_train, X_test, y_train, y_test = train_test_split(
    df.Message,
    df.spam,
    test_size=0.2,
    random_state=37,
)

In [42]:
# convert msg into nums

from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the training messages only and turn each message
# into a row of word counts (a sparse matrix).
v = CountVectorizer()
X_train_count = v.fit_transform(X_train.values)
# Slice the sparse matrix first and densify only the five previewed rows;
# calling toarray() on the whole matrix would allocate a dense
# (n_messages x vocabulary) array just to display five of its rows.
X_train_count[:5].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [43]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes on the word-count features.
# Note: this rebinds `model`, replacing the Titanic classifier.
model = MultinomialNB()
model.fit(X=X_train_count, y=y_train)

In [44]:
# Two hand-written messages used as a quick sanity check of the classifier.
emails = [
    "We are pleased to inform you that you have been selected as the lucky winner of an all-expenses-paid vacation to a luxurious resort. To claim your prize, simply click on the link below and provide your personal information. Act now before this amazing offer expires!",
    "We hope this message finds you well. Here are the latest updates and announcements from our company",
]

# transform (not fit_transform): reuse the vocabulary learned from the training data.
emails_count = v.transform(emails)
model.predict(emails_count)

array([1, 0], dtype=int64)

In [45]:
# Vectorize the held-out messages with the already-fitted vocabulary,
# then report mean accuracy on them.
X_test_count = v.transform(X_test)
model.score(X_test_count, y_test)

0.9856502242152466

In [46]:
# using pipeline

from sklearn.pipeline import Pipeline

# Chain the vectorizer and the classifier so raw text can be passed straight
# to fit / score / predict without a separate transform step.
clf = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB()),
])

In [47]:
# Fit both pipeline steps on the raw training messages.
clf.fit(X_train, y_train)

In [48]:
# Mean accuracy on the raw held-out messages; the pipeline vectorizes them internally.
clf.score(X_test, y_test)

0.9856502242152466

In [49]:
# Classify the two sample messages directly from their text.
clf.predict(emails)

array([1, 0], dtype=int64)

# Exercise: classify the sklearn wine dataset with Naive Bayes

In [50]:
from sklearn.datasets import load_wine
# Load scikit-learn's bundled wine dataset (no file on disk needed).
wine = load_wine()

In [51]:
# List the attributes available on the loaded dataset object.
dir(wine)

['DESCR', 'data', 'feature_names', 'frame', 'target', 'target_names']

In [52]:
# Feature values of the first sample.
wine.data[0]

array([1.423e+01, 1.710e+00, 2.430e+00, 1.560e+01, 1.270e+02, 2.800e+00,
       3.060e+00, 2.800e-01, 2.290e+00, 5.640e+00, 1.040e+00, 3.920e+00,
       1.065e+03])

In [53]:
# Name of the first feature column.
wine.feature_names[0]

'alcohol'

In [73]:
# Integer class label of the first sample.
wine.target[0]

0

In [55]:
# Name of the first class in target_names.
wine.target_names[0]

'class_0'

In [71]:
# Wrap the feature matrix in a DataFrame with named columns.
# Note: this rebinds `df`, replacing the spam frame used above.
df = pd.DataFrame(data=wine.data, columns=wine.feature_names)
df

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0


In [58]:
wine.target_names
# Three classes; each sample has to be assigned to exactly one of them.

array(['class_0', 'class_1', 'class_2'], dtype='<U7')

In [74]:
# 80/20 split with a fixed seed. The labels are passed as wine.target, so
# y_train / y_test are NumPy arrays here, not pandas Series.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    wine.target,
    test_size=0.2,
    random_state=37,
)

In [93]:
# First candidate: Gaussian Naive Bayes with default settings.
model = GaussianNB()

In [94]:
# Fit on the wine training split.
model.fit(X_train, y_train)

In [95]:
# Mean accuracy of GaussianNB on the held-out wine samples.
model.score(X_test, y_test)

0.9722222222222222

In [96]:
# Predict the first ten held-out samples and keep their true labels for comparison.
# X_test is a DataFrame (positional .iloc slice); y_test is a NumPy array (plain slice).
y_predicted = model.predict(X_test.iloc[:10])
y_subtest = y_test[:10]
print(model)

GaussianNB()


In [97]:
# GaussianNB predictions for the first ten held-out samples.
y_predicted

array([1, 0, 0, 2, 0, 2, 0, 1, 1, 1])

In [98]:
# True classes for the same ten samples.
y_subtest

array([1, 0, 0, 2, 0, 2, 0, 1, 1, 1])

In [99]:
# Second candidate for comparison: Multinomial Naive Bayes (rebinds `model`).
# NOTE(review): MultinomialNB is normally used with count-like features, while
# the wine columns are continuous measurements - treat its score as a comparison point.
model = MultinomialNB()

In [100]:
# Fit the multinomial model on the same wine training split.
model.fit(X_train, y_train)

In [101]:
# Mean accuracy of MultinomialNB on the same held-out wine samples.
model.score(X_test, y_test)

0.8055555555555556

In [102]:
# Same ten-sample comparison as before, now for the multinomial model.
# X_test is a DataFrame (positional .iloc slice); y_test is a NumPy array (plain slice).
y_predicted = model.predict(X_test.iloc[:10])
y_subtest = y_test[:10]
print(model)

MultinomialNB()


In [103]:
# MultinomialNB predictions for the first ten held-out samples.
y_predicted

array([1, 0, 0, 2, 1, 2, 0, 1, 1, 1])

In [104]:
# True classes for the same ten samples, for comparison with the predictions.
y_subtest

array([1, 0, 0, 2, 0, 2, 0, 1, 1, 1])