# Classification with Naive Bayes
Create a classification model with Naive Bayes in scikit-learn on the Titanic dataset to predict whether a passenger survived the sinking, given the passenger's features.

*   **survived** column is a predicted attribute (class).
*   Remove **alive** and **deck** columns from the dataframe.
*   Remove rows where values in some columns are missing.
*   Encode **embarked** and **embark_town** with one-hot encoding.
*   Encode **class**, **sex**, **who**, **adult_male**, and **alone** with index-based (label) encoding; see [Categorical encoding using label encoding and one-hot encoder](https://towardsdatascience.com/categorical-encoding-using-label-encoding-and-one-hot-encoder-911ef77fb5bd).
* Train a Naive Bayes model with sklearn using a 70% training / 30% testing split.
* Report the performance of the model, including accuracy, recall, precision, and F1-score


In [28]:
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split

# Load seaborn's example "titanic" dataset; later cells copy from this
# frame instead of modifying it.
titanic_df = sns.load_dataset("titanic")
print(titanic_df)
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() only appended a stray "None" line.
titanic_df.info()

     survived  pclass     sex   age  sibsp  parch     fare embarked   class  \
0           0       3    male  22.0      1      0   7.2500        S   Third   
1           1       1  female  38.0      1      0  71.2833        C   First   
2           1       3  female  26.0      0      0   7.9250        S   Third   
3           1       1  female  35.0      1      0  53.1000        S   First   
4           0       3    male  35.0      0      0   8.0500        S   Third   
..        ...     ...     ...   ...    ...    ...      ...      ...     ...   
886         0       2    male  27.0      0      0  13.0000        S  Second   
887         1       1  female  19.0      0      0  30.0000        S   First   
888         0       3  female   NaN      1      2  23.4500        S   Third   
889         1       1    male  26.0      0      0  30.0000        C   First   
890         0       3    male  32.0      0      0   7.7500        Q   Third   

       who  adult_male deck  embark_town alive  alo

In [29]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# --- Preprocessing ---------------------------------------------------------
# Work on a derived frame so titanic_df stays untouched: discard the 'alive'
# and 'deck' columns, keep only rows where age and both embarkation columns
# are present, then one-hot encode the embarkation columns (first level of
# each is dropped).
df = pd.get_dummies(
    titanic_df.drop(columns=['alive', 'deck'])
              .dropna(subset=['age', 'embarked', 'embark_town']),
    columns=['embarked', 'embark_town'],
    drop_first=True,
)

# One LabelEncoder per remaining categorical/boolean column; the fitted
# encoders are kept in label_encoders, keyed by column name.
label_encoders = {
    name: LabelEncoder()
    for name in ('class', 'sex', 'who', 'adult_male', 'alone')
}
for column, le in label_encoders.items():
    df[column] = le.fit_transform(df[column])

# --- Features / target -----------------------------------------------------
y = df['survived']
X = df.drop(columns='survived')

# 70% train / 30% test, with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=77
)

# --- Model -----------------------------------------------------------------
model = GaussianNB().fit(X_train, y_train)
y_pred = model.predict(X_test)

# --- Evaluation ------------------------------------------------------------
accuracy, recall, precision, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, recall_score, precision_score, f1_score)
)

# Bare expression: shown as the notebook cell's result.
accuracy, recall, precision, f1
######


(0.7523364485981309, 0.7125, 0.6551724137931034, 0.6826347305389221)

In [30]:
from sklearn.naive_bayes import MultinomialNB
# MultinomialNB experiment on its own preprocessed copy of the data.
# The original cell built df2 but then label-encoded and split `df` from the
# previous cell, leaving df2 unused and making this cell depend on earlier
# notebook state. Everything below now uses df2.
df2 = titanic_df.copy()
df2.drop(['alive', 'deck'], axis=1, inplace=True)

# Keep only rows where age and both embarkation columns are present.
df2.dropna(subset=['age', 'embarked', 'embark_town'], inplace=True)

# One-hot encode the embarkation columns (first level of each is dropped).
df2 = pd.get_dummies(df2, columns=['embarked', 'embark_town'], drop_first=True)

# Label-encode the remaining categorical/boolean columns, keeping the fitted
# encoders by column name.
label_encoders = {}
for column in ['class', 'sex', 'who', 'adult_male', 'alone']:
    le = LabelEncoder()
    df2[column] = le.fit_transform(df2[column])
    label_encoders[column] = le

X = df2.drop('survived', axis=1)
y = df2['survived']

# 70% train / 30% test, with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=77)

# NOTE(review): MultinomialNB is documented for count-like features; the
# continuous age/fare columns are passed through as-is here. Confirm this is
# the intended comparison.
model = MultinomialNB()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Bare expression: shown as the notebook cell's result.
accuracy, recall, precision, f1

(0.6962616822429907, 0.5125, 0.6119402985074627, 0.5578231292517006)

In [31]:
from sklearn.naive_bayes import ComplementNB
# ComplementNB experiment on its own preprocessed copy of the data.
# The original cell built df3 but then label-encoded and split `df` from an
# earlier cell, leaving df3 unused and making this cell depend on earlier
# notebook state. Everything below now uses df3.
df3 = titanic_df.copy()
df3.drop(['alive', 'deck'], axis=1, inplace=True)
# Keep only rows where age and both embarkation columns are present.
df3.dropna(subset=['age', 'embarked', 'embark_town'], inplace=True)
# One-hot encode the embarkation columns (first level of each is dropped).
df3 = pd.get_dummies(df3, columns=['embarked', 'embark_town'], drop_first=True)

# Label-encode the remaining categorical/boolean columns, keeping the fitted
# encoders by column name.
label_encoders = {}
for column in ['class', 'sex', 'who', 'adult_male', 'alone']:
    le = LabelEncoder()
    df3[column] = le.fit_transform(df3[column])
    label_encoders[column] = le

X = df3.drop('survived', axis=1)
y = df3['survived']

# 70% train / 30% test, with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=77)

model = ComplementNB()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Bare expression: shown as the notebook cell's result.
accuracy, recall, precision, f1

(0.7009345794392523, 0.525, 0.6176470588235294, 0.5675675675675677)