# Naive Bayes Classifier NB-BOW-OV

In [1]:
import numpy as np
import csv
import math

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

#*** TRAINING ***
train_file = open('covid_training.tsv', 'r', encoding='utf-8')
tweets = {}
vocabulary = []

next(train_file) # skip header

for line in train_file:
    # split tsv lines
    split_line = line.split("\t")
    
    # get tweet data and fold to lower case
    tweet_id = split_line[0]
    tweet_text = split_line[1].lower()
    q_label = split_line[2]
    
    # get tweet text
    split_text = tweet_text.split(" ")
    tweet_vector = {}
    for token in split_text:
        # add term to tweet_vector and count frequency
        if token not in tweet_vector.keys():
            vocabulary.append(token)
            tweet_vector[token] = 1
        else:
            tweet_vector[token] += 1    
            
    # add tweet vector and label to tweets dictionary
    tweets[tweet_id] = (tweet_vector, q_label)
    
# no duplicates
vocabulary = sorted(set(vocabulary))
#print(len(vocabulary))
#print(vocabulary)


In [2]:
# *** Compute Probabilities and TESTING for NB-BOW-OV ***

#0. Count the unique set of words present in each class
count_set_c1 = 0 # count set of words present when class is 'yes' - will be used in the conditional probabilities
words_frequency_c1 = [] # words in class c1 with frequencies (from all tweets)
words_c1 = [] # only words with no frequencies (from all tweets)
data_instances_c1 = 0 # number of data instances with class 'yes' (from all tweets)

count_set_c2 = 0 # count set of words present when class is 'no' - will be used in the conditional probabilities
words_frequency_c2 = []  # words in class c2 with frequencies (from all tweets)
words_c2 = [] # only words with no frequencies (from all tweets)
data_instances_c2 = 0 # number of data instances with class 'no' (from all tweets)

for i in tweets:
    # c1: Check each tweet where q1 label=yes
    if tweets[i][1] == "yes":
        data_instances_c1+=1
        words = list(tweets[i][0].keys())
        values = list(tweets[i][0].values())
        pairs = zip(words,values)
        for entry in pairs:
            words_frequency_c1.append(entry)
            words_c1.append(entry[0])
            
    # c2: Check each tweet where q1 label=no        
    elif tweets[i][1] == "no":
        data_instances_c2+=1
        words = list(tweets[i][0].keys())
        values = list(tweets[i][0].values())
        pairs = zip(words,values)
        for entry in pairs:
            words_frequency_c2.append(entry)   
            words_c2.append(entry[0])
        
count_set_c1 = len(set(words_c1)) # c1 set - no duplicates even from other tweets
#print(words_frequency_c1)
#print(len(words_c1))
#print(count_set_c1)
#print(data_instances_c1)
count_set_c2 = len(set(words_c2)) # c2 set - no duplicates even from other tweets
#print(words_frequency_c2)
#print(len(words_c2))
#print(count_set_c2)
#print(data_instances_c2)

#1. Prior Probabilities
# How many instances of that class, over all the tweets
number_tweets = len(tweets)
priorprob_c1 = data_instances_c1/number_tweets
priorprob_c2 = data_instances_c2/number_tweets

#2. TEST data, Conditional Probabilities and SCORE(class)
# notice that some words in the tweet may not be part of the vocabulary
# TEST DATA
test_file = open('covid_test_public.tsv', 'r', encoding='utf-8')
test_tweets = {}
for line in test_file:
    # split tsv lines
    split_line = line.split("\t")
    # get tweet data and fold to lower case
    tweet_id = split_line[0]
    tweet_text = split_line[1].lower()
    expected_q_label = split_line[2]
    test_tweets[tweet_id] = (tweet_text, expected_q_label)

print(len(test_tweets))

# For Overall Evaluation
expected_labels_test = [] 
predicted_labels = []
# TESTING TWEETS
for key in test_tweets.keys():
    test_example = test_tweets[key][0]
    expected_class = test_tweets[key][1]
    
    #test_example = 'Panic buying and stockpiling of toilet roll continues. These are the scenes at Costco in Farnborough in Hampshire today. #coronavirus #toiletpaperpanic #panickbuying https://t.co/JLlZVVS7EH'.lower()
    print(test_example)
    splited_text = test_example.split(" ")


    # We don't compute all the conditional probabilities... only do so when determining score() for new tweet
    # ADD smoothing and log10
    delta = 0.01
    
    # *** SCORE(c1) - class 'yes' ***
    conditionalprob_c1 = []
    for word in splited_text:
        word_count = 0 # count frequency for each word in the class c1
        if word in vocabulary: #find the conditional probability of this word and add it to the list
            for entry in words_frequency_c1: # find the frequency of the word in all tweets with class c1
                if (entry[0] == word) or (entry[0] == word+'.') or (entry[0] == word+',') or (entry[0] == word+';'):
                    word_count+=entry[1]
            #add the conditional probability
            conditionalprob_c1.append( (word_count+delta) / (count_set_c1+(len(vocabulary)*delta)) )
            #print(word_count)
            #print((word_count+delta / (count_set_c1+(len(vocabulary)*delta))), '\n')
    #SCORE(c1)
    score_c1 = 0
    for probability in conditionalprob_c1:
         score_c1 = score_c1 + math.log10(probability) 
    # Finally, multiply by the Prior probability
    score_c1 = score_c1 + math.log10(priorprob_c1)
    print("score(c1): ", score_c1)

    # *** SCORE(c2) - class 'no' ***
    conditionalprob_c2 = []
    for word in splited_text:
        word_count = 0 # count frequency for each word in the class c2
        if word in vocabulary: #find the conditional probability of this word and add it to the list
            for entry in words_frequency_c2: # find the frequency of the word in all tweets with class c2
                if (entry[0] == word) or (entry[0] == word+'.') or (entry[0] == word+',') or (entry[0] == word+';'):
                    word_count+=entry[1]
            #add the conditional probability
            conditionalprob_c2.append( (word_count+delta) / (count_set_c2+(len(vocabulary)*delta)) )
            #print(word_count)
            #print((word_count+delta / (count_set_c2+(len(vocabulary)*delta))), '\n')
    #SCORE(c2)
    score_c2 = 0
    for probability in conditionalprob_c2:
         score_c2 = score_c2 + math.log10(probability) 
    # Finally, multiply by the Prior probability
    score_c2 = score_c2 + math.log10(priorprob_c2)
    print("score(c2): ", score_c2)

    # argmax(score_c1, score_c2)
    if max(score_c1, score_c2) == score_c1:
        print("Prediction: the class of that tweet is c1: yes ")
        prediction = "yes"
        score = score_c1
    else:
        print("Prediction: the class of that tweet is c2: no")
        prediction = "no"
        score = score_c2
        
    # WHAT WAS THE EXPECTED CLASS?
    print("Expected class: ", expected_class)
    
    # Correct or Wrong prediction?
    if prediction == expected_class:
        result = "correct"
    else:
        result = "wrong"
    
    #4. WRITE TRACE FILES
    outFile = open('trace_NB-BOW-OV.txt', "a")
    outFile.write(key)
    outFile.write(", ")
    outFile.write(prediction)
    outFile.write(", ")
    outFile.write(str(score))
    outFile.write(", ")
    outFile.write(expected_class)
    outFile.write(", ")
    outFile.write(result)
    outFile.write("\n")
    #close()
    outFile.close()
    
    #5. FOR THE OVERALL EVALUATION FILES
    expected_labels_test.append(expected_class)
    predicted_labels.append(prediction)
    print("\n")
    

# WRITE OVERALL EVALUATION FILES
# sprecision, recall, f1-score, accuracy, macro avg, weighted average, and support
print("\n" + "Classification report:")
NB_report = classification_report(expected_labels_test, predicted_labels)
print(NB_report)

outputFile = open('eval_NB-BOW-OV.txt', "a")
outputFile.write(NB_report)
outputFile.close()



55
1/ many of you ask me why i take the covid-19 outbreak so seriously. current numbers of cases and deaths are *not* why. ð a thread on why iâm worried and what i do personally in this situation.
score(c1):  -78.97564227154574
score(c2):  -81.27271139396795
Prediction: the class of that tweet is c1: yes 
Expected class:  no


panic buying and stockpiling of toilet roll continues. these are the scenes at costco in farnborough in hampshire today. #coronavirus #toiletpaperpanic #panickbuying https://t.co/jllzvvs7eh
score(c1):  -26.97884906993295
score(c2):  -32.613164900885295
Prediction: the class of that tweet is c1: yes 
Expected class:  yes


everyone can help prevent the spread of #covid19. call your doctor if you develop symptoms, have been in close contact with a person known to have covid-19, or have recently traveled from an area with widespread or ongoing community spread of covid-19. https://t.co/ehl8kmrhan. https://t.co/kwrko7vnub
score(c1):  -89.17172918102771
score(c2)

score(c1):  -79.72820137802172
score(c2):  -96.03774668872032
Prediction: the class of that tweet is c1: yes 
Expected class:  yes


we donât yet have all the tools we need to fight covid-19. this is an important step toward having treatments, while we also explore vaccines and diagnostics. thanks to @wellcometrust and @mastercard for launching this effort with us. https://t.co/m8aj3083zk
score(c1):  -68.65121342145002
score(c2):  -77.97268037572728
Prediction: the class of that tweet is c1: yes 
Expected class:  yes


how should we approach potential covid-19 cases? here's a infographic i made based on the 09 mar 2020 guidelines of the doh-psmid taskforce. hopefully this helps us hcps as we face the growing threat. stay safe everyone. https://t.co/cjvvm52yx0
score(c1):  -57.37116613406181
score(c2):  -59.92869500414514
Prediction: the class of that tweet is c1: yes 
Expected class:  no


the media is celebrating conservatives getting the coronavirus. over 5,000 articles have been wr

score(c1):  -79.26658984614191
score(c2):  -91.40717769998597
Prediction: the class of that tweet is c1: yes 
Expected class:  yes


the world health organisation want you to know some dos and donts about #wuhanvirus #coronavirus luckily hopkins is here to help watch and learn https://t.co/1ecjd6znu9
score(c1):  -42.765532442960144
score(c2):  -40.92363786383935
Prediction: the class of that tweet is c2: no
Expected class:  no


â ï¸doctors in #italy warn europe to âget readyâ for #coronavirus, saying ~10% of #covid19 patients need icu care, and hospitals are overwhelmed. âincrease your total icu capacity...prepare icu areas (for) patients â in every hospital if necessary.â https://t.co/vohqj1ifxf
score(c1):  -55.72416802449047
score(c2):  -65.04317980710094
Prediction: the class of that tweet is c1: yes 
Expected class:  yes



Classification report:
              precision    recall  f1-score   support

          no       0.71      0.23      0.34        22
         yes   

# NB-BOW-FV  (Filtered Vocabulary)

In [3]:
#*** TRAINING ***
train_file = open('covid_training.tsv', 'r', encoding='utf-8')
tweets = {}
vocabulary = [] # will only have words that appear at least twice
word_bag = {} # will be used to filter the vocabulary

next(train_file) # skip header

for line in train_file:
    # split tsv lines
    split_line = line.split("\t")
    
    # get tweet data and fold to lower case
    tweet_id = split_line[0]
    tweet_text = split_line[1].lower()
    q_label = split_line[2]
    
    # get tweet text
    split_text = tweet_text.split(" ")
    tweet_vector = {}
    for token in split_text:
        # add term to tweet_vector and count frequency
        if token not in tweet_vector.keys():
            tweet_vector[token] = 1
        else:
            tweet_vector[token] += 1
        
        # let's keep track of the total count of words
        if token not in word_bag.keys():
            word_bag[token] = 1
        else:
            word_bag[token] += 1
            
    # add tweet vector and label to tweets dictionary
    tweets[tweet_id] = (tweet_vector, q_label)

# word_bag is essentially the original vocabulary before being filtered
#print(len(set(word_bag))) 
#print(word_bag)   
   
for word in word_bag:
    #print(word, word_bag[word])
    if word_bag[word] >= 2: 
        vocabulary.append(word)

vocabulary = sorted(set(vocabulary))
#print(len(vocabulary))
#print(vocabulary)


In [4]:
# *** Compute Probabilities and TESTING for NB-BOW-FV ***

#0. Count the unique set of words present in each class
count_set_c1 = 0 # count set of words present when class is 'yes' - will be used in the conditional probabilities
words_frequency_c1 = [] # words in class c1 with frequencies (from all tweets)
words_c1 = [] # only words with no frequencies (from all tweets)
data_instances_c1 = 0 # number of data instances with class 'yes' (from all tweets)

count_set_c2 = 0 # count set of words present when class is 'no' - will be used in the conditional probabilities
words_frequency_c2 = []  # words in class c2 with frequencies (from all tweets)
words_c2 = [] # only words with no frequencies (from all tweets)
data_instances_c2 = 0 # number of data instances with class 'no' (from all tweets)

for i in tweets:
    # c1: Check each tweet where q1 label=yes
    if tweets[i][1] == "yes":
        data_instances_c1+=1
        words = list(tweets[i][0].keys())
        values = list(tweets[i][0].values())
        pairs = zip(words,values)
        for entry in pairs:
            words_frequency_c1.append(entry)
            words_c1.append(entry[0])
            
    # c2: Check each tweet where q1 label=no        
    elif tweets[i][1] == "no":
        data_instances_c2+=1
        words = list(tweets[i][0].keys())
        values = list(tweets[i][0].values())
        pairs = zip(words,values)
        for entry in pairs:
            words_frequency_c2.append(entry)   
            words_c2.append(entry[0])
        
count_set_c1 = len(set(words_c1)) # c1 set - no duplicates even from other tweets
#print(words_frequency_c1)
#print(len(words_c1))
#print(count_set_c1)
#print(data_instances_c1)
count_set_c2 = len(set(words_c2)) # c2 set - no duplicates even from other tweets
#print(words_frequency_c2)
#print(len(words_c2))
#print(count_set_c2)
#print(data_instances_c2)

#1. Prior Probabilities
# How many instances of that class, over all the tweets
number_tweets = len(tweets)
priorprob_c1 = data_instances_c1/number_tweets
priorprob_c2 = data_instances_c2/number_tweets

#2. TEST data, Conditional Probabilities and SCORE(class)
# notice that some words in the tweet may not be part of the vocabulary
# TEST DATA
test_file = open('covid_test_public.tsv', 'r', encoding='utf-8')
test_tweets = {}
for line in test_file:
    # split tsv lines
    split_line = line.split("\t")
    # get tweet data and fold to lower case
    tweet_id = split_line[0]
    tweet_text = split_line[1].lower()
    expected_q_label = split_line[2]
    test_tweets[tweet_id] = (tweet_text, expected_q_label)

print(len(test_tweets))

# For Overall Evaluation
expected_labels_test = [] 
predicted_labels = []
# TESTING TWEETS
for key in test_tweets.keys():
    test_example = test_tweets[key][0]
    expected_class = test_tweets[key][1]
    
    #test_example = 'Panic buying and stockpiling of toilet roll continues. These are the scenes at Costco in Farnborough in Hampshire today. #coronavirus #toiletpaperpanic #panickbuying https://t.co/JLlZVVS7EH'.lower()
    print(test_example)
    splited_text = test_example.split(" ")


    # We don't compute all the conditional probabilities... only do so when determining score() for new tweet
    # ADD smoothing and log10
    delta = 0.01
    
    # *** SCORE(c1) - class 'yes' ***
    conditionalprob_c1 = []
    for word in splited_text:
        word_count = 0 # count frequency for each word in the class c1
        if word in vocabulary: #find the conditional probability of this word and add it to the list
            for entry in words_frequency_c1: # find the frequency of the word in all tweets with class c1
                if (entry[0] == word) or (entry[0] == word+'.') or (entry[0] == word+',') or (entry[0] == word+';'):
                    word_count+=entry[1]
            #add the conditional probability
            conditionalprob_c1.append( (word_count+delta) / (count_set_c1+(len(vocabulary)*delta)) )
            #print(word_count)
            #print((word_count+delta / (count_set_c1+(len(vocabulary)*delta))), '\n')
    #SCORE(c1)
    score_c1 = 0
    for probability in conditionalprob_c1:
         score_c1 = score_c1 + math.log10(probability) 
    # Finally, multiply by the Prior probability
    score_c1 = score_c1 + math.log10(priorprob_c1)
    print("score(c1): ", score_c1)

    # *** SCORE(c2) - class 'no' ***
    conditionalprob_c2 = []
    for word in splited_text:
        word_count = 0 # count frequency for each word in the class c2
        if word in vocabulary: #find the conditional probability of this word and add it to the list
            for entry in words_frequency_c2: # find the frequency of the word in all tweets with class c2
                if (entry[0] == word) or (entry[0] == word+'.') or (entry[0] == word+',') or (entry[0] == word+';'):
                    word_count+=entry[1]
            #add the conditional probability
            conditionalprob_c2.append( (word_count+delta) / (count_set_c2+(len(vocabulary)*delta)) )
            #print(word_count)
            #print((word_count+delta / (count_set_c2+(len(vocabulary)*delta))), '\n')
    #SCORE(c2)
    score_c2 = 0
    for probability in conditionalprob_c2:
         score_c2 = score_c2 + math.log10(probability) 
    # Finally, multiply by the Prior probability
    score_c2 = score_c2 + math.log10(priorprob_c2)
    print("score(c2): ", score_c2)

    # argmax(score_c1, score_c2)
    if max(score_c1, score_c2) == score_c1:
        print("Prediction: the class of that tweet is c1: yes ")
        prediction = "yes"
        score = score_c1
    else:
        print("Prediction: the class of that tweet is c2: no")
        prediction = "no"
        score = score_c2
        
    # WHAT WAS THE EXPECTED CLASS?
    print("Expected class: ", expected_class)
    
    # Correct or Wrong prediction?
    if prediction == expected_class:
        result = "correct"
    else:
        result = "wrong"
    
    #4. WRITE TRACE FILES
    outFile = open('trace_NB-BOW-FV.txt', "a")
    outFile.write(key)
    outFile.write(", ")
    outFile.write(prediction)
    outFile.write(", ")
    outFile.write(str(score))
    outFile.write(", ")
    outFile.write(expected_class)
    outFile.write(", ")
    outFile.write(result)
    outFile.write("\n")
    #close()
    outFile.close()
    
    #5. FOR THE OVERALL EVALUATION FILES
    expected_labels_test.append(expected_class)
    predicted_labels.append(prediction)
    print("\n")
    

# WRITE OVERALL EVALUATION FILES
# sprecision, recall, f1-score, accuracy, macro avg, weighted average, and support
print("\n" + "Classification report:")
NB_report = classification_report(expected_labels_test, predicted_labels)
print(NB_report)

outputFile = open('eval_NB-BOW-FV.txt', "a")
outputFile.write(NB_report)
outputFile.close()

55
1/ many of you ask me why i take the covid-19 outbreak so seriously. current numbers of cases and deaths are *not* why. ð a thread on why iâm worried and what i do personally in this situation.
score(c1):  -60.758438210315376
score(c2):  -64.13758436312457
Prediction: the class of that tweet is c1: yes 
Expected class:  no


panic buying and stockpiling of toilet roll continues. these are the scenes at costco in farnborough in hampshire today. #coronavirus #toiletpaperpanic #panickbuying https://t.co/jllzvvs7eh
score(c1):  -23.706367527769743
score(c2):  -27.288704003628677
Prediction: the class of that tweet is c1: yes 
Expected class:  yes


everyone can help prevent the spread of #covid19. call your doctor if you develop symptoms, have been in close contact with a person known to have covid-19, or have recently traveled from an area with widespread or ongoing community spread of covid-19. https://t.co/ehl8kmrhan. https://t.co/kwrko7vnub
score(c1):  -77.97587400896919
score(c

score(c1):  -72.54503440560951
score(c2):  -85.30773341493459
Prediction: the class of that tweet is c1: yes 
Expected class:  yes


we donât yet have all the tools we need to fight covid-19. this is an important step toward having treatments, while we also explore vaccines and diagnostics. thanks to @wellcometrust and @mastercard for launching this effort with us. https://t.co/m8aj3083zk
score(c1):  -52.45451254232635
score(c2):  -58.86566856281028
Prediction: the class of that tweet is c1: yes 
Expected class:  yes


how should we approach potential covid-19 cases? here's a infographic i made based on the 09 mar 2020 guidelines of the doh-psmid taskforce. hopefully this helps us hcps as we face the growing threat. stay safe everyone. https://t.co/cjvvm52yx0
score(c1):  -46.717779237873756
score(c2):  -44.076949531284065
Prediction: the class of that tweet is c2: no
Expected class:  no


the media is celebrating conservatives getting the coronavirus. over 5,000 articles have been wr

score(c1):  -66.57247250744214
score(c2):  -77.4867712695623
Prediction: the class of that tweet is c1: yes 
Expected class:  yes


the world health organisation want you to know some dos and donts about #wuhanvirus #coronavirus luckily hopkins is here to help watch and learn https://t.co/1ecjd6znu9
score(c1):  -37.1655821308904
score(c2):  -37.55484345280185
Prediction: the class of that tweet is c1: yes 
Expected class:  no


â ï¸doctors in #italy warn europe to âget readyâ for #coronavirus, saying ~10% of #covid19 patients need icu care, and hospitals are overwhelmed. âincrease your total icu capacity...prepare icu areas (for) patients â in every hospital if necessary.â https://t.co/vohqj1ifxf
score(c1):  -38.635045250163216
score(c2):  -44.77442365511045
Prediction: the class of that tweet is c1: yes 
Expected class:  yes



Classification report:
              precision    recall  f1-score   support

          no       0.89      0.36      0.52        22
         yes   