# **NaiveBayes - Text Classification**

## 1. IMDB Dataset

In [1]:
from keras.datasets import imdb
(x_train, y_train), (x_test, y_test) = imdb.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


## 2. Pre-Processing

### 2.1. Any data cleaning

In [4]:
word_index = imdb.get_word_index()
inverted_word_index = dict((i, word) for (word, i) in word_index.items())

def decode_sequence(s):
    return " ".join(inverted_word_index.get(i, '<UNK>') for i in s)

train_data = [decode_sequence(i) for i in x_train]
test_data = [decode_sequence(i) for i in x_test]

## 3. Build Models

### 3.1. Uni-Gram

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB


unigram = CountVectorizer(ngram_range=(1, 1), stop_words='english')
unigram_train_data = unigram.fit_transform(train_data)
unigram_test_data = unigram.transform(test_data)
nb = MultinomialNB()
nb.fit(unigram_train_data, y_train)
unigram_preds = nb.predict(unigram_test_data)

### 3.2. Bi-Gram


In [7]:
bigram = CountVectorizer(ngram_range=(2, 2), stop_words='english')
bigram_train_data = bigram.fit_transform(train_data)
bigram_test_data = bigram.transform(test_data)
nb.fit(bigram_train_data, y_train)
bigram_preds = nb.predict(bigram_test_data)

### 3.3. Tri-Gram

In [9]:
trigram = CountVectorizer(ngram_range=(3, 3), stop_words='english')
trigram_train_data = trigram.fit_transform(train_data)
trigram_test_data = trigram.transform(test_data)
nb.fit(trigram_train_data, y_train)
trigram_preds = nb.predict(trigram_test_data)

## 4. Evaluate Model

In [10]:
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score

metrics = []
for preds in [unigram_preds, bigram_preds, trigram_preds]:
    metric = {'recall': recall_score(y_test, preds), 'precision': precision_score(y_test, preds), 'f1': f1_score(y_test, preds), 'accuracy': accuracy_score(y_test, preds)}
    metrics.append(metric)

print(f'unigram scores: {metrics[0]}')
print(f'bigram scores: {metrics[1]}')
print(f'trigram scores: {metrics[2]}')


unigram scores: {'recall': 0.75792, 'precision': 0.859787639531718, 'f1': 0.8056464985756198, 'accuracy': 0.81716}
bigram scores: {'recall': 0.80376, 'precision': 0.8503597122302158, 'f1': 0.8264034546576188, 'accuracy': 0.83116}
trigram scores: {'recall': 0.68016, 'precision': 0.7476914959106499, 'f1': 0.7123287671232876, 'accuracy': 0.72532}


## Good Luck!