In [1]:
import numpy as np


# Toy tweets and labels for both the training and the test split.
# Every tweet carries exactly one label from {0: 'anger', 1: 'joy', 2: 'optimism', 3: 'sadness'}
train_data = ['I am worried that he felt safe',
              'I am not happy that she is worried',
              'She is happy',
              'He is not worried',
              'He is safe']

test_data = ['I do not like being worried']

# Label i belongs to tweet i of the matching split
train_labels = np.array([3,0,1,1,2])
test_labels = np.array([0])

print(f'Training labels: {train_labels}')
print(f'Test labels: {test_labels}')

# Show the raw training tweets (this line used to be mislabelled as 'Test labels')
print(f'Training data: {train_data}')

# Semantics of these label vectors:
# There are 4 unique labels [0,1,2,3]
# Each vector holds one label per sentence: 5 for training and 1 for test

Training labels: [3 0 1 1 2]
Test labels: [0]
Test labels: ['I am worried that he felt safe', 'I am not happy that she is worried', 'She is happy', 'He is not worried', 'He is safe']


In [2]:
# scikit-learn's CountVectorizer turns the raw training tweets into a 2-D numpy array of word counts.
# Minimal usage example:
from sklearn.feature_extraction.text import CountVectorizer

# In the assignment code, keep this object as self.counter
counter = CountVectorizer()

# counter.fit_transform() counts the words of the training data;
# .toarray() converts the result into a dense numpy array
train_counts_sparse = counter.fit_transform(train_data)
train_feats = train_counts_sparse.toarray()
print(f'Training features: \n{train_feats}')

# The array has one row per tweet and one column per counted word
# (5x10 for this example), i.e. one count vector per tweet

# For Task 1 in the Assignment, CountVectorizer can be used for the test data as well
# Be careful: ***the same counter*** must be used for both training and test data
# For the test data, call counter.transform() on the counter created above.

Training features: 
[[1 1 0 1 0 0 1 0 1 1]
 [1 0 1 0 1 1 0 1 1 1]
 [0 0 1 0 1 0 0 1 0 0]
 [0 0 0 1 1 1 0 0 0 1]
 [0 0 0 1 1 0 1 0 0 0]]


In [3]:
# Expected feature array for the test data, i.e. what get_features is supposed to return
test_feats = np.zeros((1, 10), dtype=int)
test_feats[0, [5, 9]] = 1  # only columns 5 and 9 hold a count of one

print(f'Test features: \n{test_feats}')

# Semantics of this features array:
# There is 1 tweet in the test data, hence a single row
# Each entry is the count, within the test tweet, of one of the words counted on the training tweets

Test features: 
[[0 0 0 0 0 1 0 0 0 1]]


In [4]:
# Naive Bayes, step 1: class priors estimated from the training data
# class prior = number of tweets for a class / total number of tweets

# np.bincount gives the number of tweets per class (position = class id)
no_tweets_by_class = np.bincount(train_labels)

# the training feature array has one row per tweet, so its length is the tweet total
no_of_training_sentences = len(train_feats)

print(f'Total number of tweets: {no_of_training_sentences}')
print(f'Number of tweets per class: \n{no_tweets_by_class}')

# To get the priors, divide the per-class tweet counts by the total number of tweets !!!
# Save the result as self.priors in the assignment code
# Shape of priors should be (4,): one probability value for each class

Total number of tweets: 5
Number of tweets per class: 
[1 2 1 1]


In [5]:
# Naive Bayes, step 2: word likelihoods per class
# Word likelihood = count of a word within a class / total word count of that class

# The training features have to be grouped by class.
# Start from the label set (unique labels found in the training labels)
unique_labels = np.unique(train_labels)
print(f'Unique class labels: \n{unique_labels}')

# One empty list per unique label (a list of lists, positioned by label value)
features_by_label = [[] for _ in unique_labels]

# Put every feature vector into the list of its label.
# The label is used directly as the list position.
for row, label in enumerate(train_labels):
    features_by_label[label].append(train_feats[row])
print(f'Training features per class: \n{features_by_label}')

# Add up the word counts of all tweets sharing a label
feature_sum_by_label = [np.sum(rows, axis=0) for rows in features_by_label]
print(f'Word counts per class: \n{feature_sum_by_label}')

# Remember to add 1 to every word count so that no count is 0 (alpha smoothing)

# Afterwards, divide the word counts by the total number of words of each class !!!
# Save the result as self.word_likelihoods in the assignment code

# Shape of the resulting word_likelihoods should be (4, 10) in this example:
# one probability value for each word, in each class

Unique class labels: 
[0 1 2 3]
Training features per class: 
[[array([1, 0, 1, 0, 1, 1, 0, 1, 1, 1], dtype=int64)], [array([0, 0, 1, 0, 1, 0, 0, 1, 0, 0], dtype=int64), array([0, 0, 0, 1, 1, 1, 0, 0, 0, 1], dtype=int64)], [array([0, 0, 0, 1, 1, 0, 1, 0, 0, 0], dtype=int64)], [array([1, 1, 0, 1, 0, 0, 1, 0, 1, 1], dtype=int64)]]
Word counts per class: 
[array([1, 0, 1, 0, 1, 1, 0, 1, 1, 1], dtype=int64), array([0, 0, 1, 1, 2, 1, 0, 1, 0, 1], dtype=int64), array([0, 0, 0, 1, 1, 0, 1, 0, 0, 0], dtype=int64), array([1, 1, 0, 1, 0, 0, 1, 0, 1, 1], dtype=int64)]


In [6]:
# Simulate trained word likelihoods with random values: one vector per class,
# with one entry per feature column. A seeded generator keeps the numbers
# identical on every Restart & Run All.
rng = np.random.default_rng(42)
n_words = test_feats.shape[1]
word_likelihoods = [rng.random(n_words) for _ in unique_labels]
print(f'Word likelihood per each class: \n {word_likelihoods}')

# Once class priors and word likelihoods are available, training is done !!!
# Next step: predict test labels from the already calculated class priors and word likelihoods

# Loop over the test tweets (this example has only 1 test tweet):
for test_tweet in test_feats:

    # Not every word occurs in a given test tweet;
    # np.flatnonzero returns the indexes of the words with a non-zero count
    word_exists_idx = np.flatnonzero(test_tweet)
    print(f'word indexes exist (non-zero count) in your test tweet: \n {word_exists_idx}')

    # With these word indexes, build the tweet likelihood per class,
    # starting each product at the neutral value 1
    likelihoods_of_tweet_by_label = [1 for _ in unique_labels]
    for label in unique_labels:

        # Multiply the likelihoods of the words present in the tweet
        for idx in word_exists_idx:
            # The exponent accounts for words that occur more than once
            likelihoods_of_tweet_by_label[label] *= (word_likelihoods[label][idx] ** test_tweet[idx])

    print(f'Total likelihood probabilities of this test tweet: \n {likelihoods_of_tweet_by_label}')

    # Next, multiply these tweet likelihoods by the class prior of each class to obtain conditional probabilities
    # As the final step, select the class with the maximum conditional probability
    # np.argmax can be used for this operation !!!

    # Do it for each test tweet to obtain the predicted classes !!!

Word likelihood per each class: 
 [array([0.45975667, 0.4624187 , 0.73438367, 0.27105114, 0.33603853,
       0.12564712, 0.86881108, 0.65509601, 0.9937313 , 0.09298962]), array([0.09347038, 0.46538082, 0.13814393, 0.3324499 , 0.21272205,
       0.31374194, 0.39387099, 0.9546204 , 0.51821175, 0.19964157]), array([0.5502145 , 0.62462802, 0.18761569, 0.31628705, 0.94929205,
       0.4244766 , 0.71161382, 0.85834696, 0.36983051, 0.84902922]), array([0.31516285, 0.84915602, 0.88114888, 0.55011002, 0.54256614,
       0.23719129, 0.13993389, 0.06873989, 0.50092118, 0.61709189])]
word indexes exist (non-zero count) in your test tweet: 
 [5 9]
Total likelihood probabilities of this test tweet: 
 [0.01168387721270875, 0.06263593364827179, 0.3603930382497615, 0.14636882043044644]
