In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

### 1. Read data CSV and split into train and test sets

In [2]:
path = f"{os.getcwd()}/data/training-data.csv"
data = pd.read_csv(path, on_bad_lines="skip", delimiter=';')

# Split the data 80/20 for training and testing
train_data, test_data = train_test_split(
  data,
  train_size=0.8,
  test_size=0.2,
  random_state=42
)

### 2. Calculate initial probability for each label

In [3]:
labels = train_data.label
total_data_frequency = len(labels)
label_frequencies = labels.value_counts().to_numpy()
initial_probability = np.array([frequency / total_data_frequency for frequency in label_frequencies])

### 3. Count how many times word X appears in label 0, 1, 2

In [4]:
token_freq_map = {}
token_freq_per_label = [0, 0 ,0]
for i, row in train_data.iterrows():
  label = row.label
  tokens = str(row.text).split()

  # Add the tokens count for the label
  token_freq_per_label[label] += len(tokens)

  for token in tokens:
    # label 0, 1, 2 is labeled by the array's index, whilst the element is the count. e.g. map["I"][0] -> how many times token "I" appear  in label 0
    if token not in token_freq_map:
      # Initialize empty list for non existing tokens
      token_freq_map[token] = [0, 0, 0]

    token_freq_map[token][label] += 1


### 4. Calculate the probability of word X in label 0, 1, 2

In [5]:
token_probabilities = {}
token_length = len(token_freq_map)
SMOOTHING = 1
for token, occurences in token_freq_map.items():
  token_probabilities[token] = []
  for label, occurence in enumerate(occurences):
    # p(w | L) = (Count(w,L) + a) / TotalWords(L) + α·V
    probability = (occurence + SMOOTHING) / (token_freq_per_label[label] + SMOOTHING * token_length)
    token_probabilities[token].append(probability)

### 5. Test the model

In [6]:
def classify(text):
  scores = initial_probability.copy()
  tokens = text.split()

  # Calculate probability for each label
  for label in range(len(scores)):
    for token in tokens:
      if token in token_probabilities:
        scores[label] *= token_probabilities[token][label]

  # Retrieve the largest guess score's index (AKA. the label)
  predicted_label = np.argmax(scores)
  return predicted_label

# Go through the test data
predictions = [classify(data.text) for _, data in test_data.iterrows()]
actual = [data.label for _, data in test_data.iterrows()]

print("PREDICTION =>", predictions)
print("ACTUAL => ",actual)
print(classification_report(actual, predictions, labels=[0,1,2]))


PREDICTION => [1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 0, 2, 1, 1, 1, 1, 1, 0, 0, 1, 2, 1, 1, 1, 1, 2, 1, 2, 1, 1, 2, 1, 2, 1, 2, 1, 0, 1, 2, 1, 2, 2, 1, 1, 2, 2, 2, 1, 1, 0, 1, 2, 1, 1, 2, 1, 1, 1, 1, 2, 0, 2, 1, 0, 1, 1, 1, 2, 2, 2, 2, 2, 1, 2, 2, 1, 0, 1, 0, 2, 2, 1, 2, 1, 1, 1, 2, 1, 1, 2, 2, 2, 0, 1, 2, 1, 1, 2, 0, 2, 1, 1, 2, 1, 2, 2, 1, 1, 1, 2, 0, 1, 1, 0, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 2, 1, 1, 1, 2, 0, 2, 1, 2, 1, 0, 0, 1, 1, 1, 2, 1, 2, 2, 1, 1, 1, 2, 2, 1, 2, 1, 2, 1, 1, 2, 1, 1, 2, 1, 2, 0, 0, 1, 1, 1, 1, 2, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 0, 1, 1, 2, 1, 2, 1, 2, 2, 1, 0, 1, 1, 0, 1, 2, 1, 2, 2, 1, 0, 1, 1, 1, 1, 2, 2, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 2, 1, 0, 1, 0, 2, 1, 1, 2, 1, 0, 2, 1, 1, 1, 0, 2, 1, 1, 1, 1, 1, 2, 0, 1, 1, 2, 2, 2, 1, 2, 1, 1, 0, 1, 0, 1, 2, 0, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 2, 1, 0, 0, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 2, 2, 2, 2, 2, 2, 0, 1, 1, 1, 2, 1, 1, 1, 2, 1, 0, 2, 0, 0, 0, 2, 2, 1, 2, 2, 0, 2, 2, 1, 2, 1