In [444]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer

In [445]:
def word_tokenization(text):
    vectorizer = CountVectorizer()
    tokenized_text = (vectorizer.fit_transform(text)).toarray()
    tokenized_names = vectorizer.get_feature_names()
    row, column = tokenized_text.shape
    features_text = np.zeros((row, column), dtype=object)
    for r in range(row):
        for c in range(column):
            features_text[r][c] = (tokenized_names[c], tokenized_text[r][c])
    return features_text

In [446]:
def prior_probabilities(q1_label):
    number_of_yes, number_of_no = 0, 0
    for i in range(len(q1_label)):
        if (q1_label[i] == "yes"):
            number_of_yes = number_of_yes + 1
        else:
            number_of_no = number_of_no + 1
    probability_of_yes, probability_of_no = (number_of_yes/len(q1_label)), (number_of_no/len(q1_label)) 
    return probability_of_yes, probability_of_no

In [447]:
def conditional_probabilities(text, q1_label):
    row = len(set(q1_label))
    column = text.shape[1]
    probabilities = np.zeros((row, column), dtype=object)
    
    total_row, total_column = text.shape
    total_words_in_yes, total_words_in_no = 0, 0
    for r in range(total_row):
        if (q1_label[r] == "yes"):
            for c in range(total_column):
                total_words_in_yes = total_words_in_yes + text[r][c][1]
        else: 
            for c in range(total_column):
                total_words_in_no = total_words_in_no + text[r][c][1]
    
    for c in range(column):
        number_of_word_in_yes, number_of_word_in_no = 0, 0
        for r in range(row):
            if (q1_label[r] == "yes"):
                number_of_word_in_yes = number_of_word_in_yes + text[r][c][1]
            else:
                number_of_word_in_no = number_of_word_in_no + text[r][c][1]
        
        #smoothing is necessary below
        #row 1 = yes
        probabilities[0][c] = number_of_word_in_yes/total_words_in_yes
        #row 2 = no
        probabilities[1][c] = number_of_word_in_no/total_words_in_no
    
    return probabilities

In [460]:
def data_processing(file):
    # hypothesis and evidence
    dataset = (pd.read_csv(file, sep='\t')).to_numpy()
    tweet_id = dataset[:,0]
    text = word_tokenization(dataset[:,1])
    q1_label = dataset[:,2]
    # prior probabilities
    yes, no = prior_probabilities(q1_label)
    conditionals = conditional_probabilities(text, q1_label)

In [461]:
# processing training set
training = '../data/covid_training.tsv'
data_processing(training)

(2, 3350)
