-
Notifications
You must be signed in to change notification settings - Fork 0
/
train_naive_bayes.py
112 lines (90 loc) · 4.69 KB
/
train_naive_bayes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
# -*- coding: utf-8 -*-
""" TrainNaiveBayes Class
Trains the Sentiment Analysis model.
Author: Samiha Fansur, The University Of Sheffield, 2023
"""
class TrainNaiveBayes:
    """Collect the counts used by a Naive Bayes sentiment model.

    Training runs once, inside the constructor; the resulting counts are
    exposed as instance attributes.

    Attributes:
        sentiment_class_list: The value given to the constructor, unchanged.
        number_of_words: ``len(words)`` of the training data.
        sentence_count_in_sentiment: Per class, how many training items
            carried that sentiment.
        number_token_sentiments: Per class, total number of tokens seen.
        token_counts_in_sentiment: Per class, a dict mapping each token to
            the number of times it occurred in that class.
        magnitude_of_vocab: Number of distinct tokens over all training items.
    """

    def __init__(self, words, sentiment_class_list):
        """
        Train on ``words`` and store the computed counts on the instance.

        Args:
            words (list): Training items. Each item must expose ``sentiment``
                (an index into the per-class counters) and ``tokens``
                (a sized iterable of hashable tokens).
            sentiment_class_list (list[int] | int): The sentiment classes,
                or just how many classes there are.
        """
        self.sentiment_class_list = sentiment_class_list
        self.number_of_words = len(words)

        trained = self.train(words, sentiment_class_list)

        # Two of the keys below end with a space; that is how train() returns
        # them, so the lookups have to match exactly.
        self.sentence_count_in_sentiment = trained["sentence_count_in_sentiment "]
        self.number_token_sentiments = trained["number_token_sentiments"]
        self.token_counts_in_sentiment = trained["token_counts_in_sentiment "]
        self.magnitude_of_vocab = trained["magnitude_of_vocab"]

    def initialize_counters(self, sentiment_class_list):
        """
        Create zeroed counters, one slot per sentiment class.

        Args:
            sentiment_class_list (list[int] | int): The sentiment classes
                (only their number matters here), or that number directly.

        Returns:
            tuple: ``(word_count, token_count, rel_token_count_dict)`` - two
            lists of zeros and a list of empty dicts, each with one entry
            per class.

        Raises:
            ValueError: If an integer class count is negative.
        """
        # train() documents its argument as an int count while __init__
        # documents a list, so both forms are accepted.
        if isinstance(sentiment_class_list, int):
            class_total = sentiment_class_list
            if class_total < 0:
                raise ValueError("number of sentiment classes cannot be negative")
        else:
            class_total = len(sentiment_class_list)

        word_count = [0] * class_total   # training items seen per class
        token_count = [0] * class_total  # total tokens seen per class
        # One independent dict per class to store per-token occurrence counts.
        rel_token_count_dict = [{} for _ in range(class_total)]
        return word_count, token_count, rel_token_count_dict

    def process_sentiment_tokens(self, sentiment, tokens, word_count, token_count, rel_token_count_dict):
        """
        Fold one training item into the counters (all three are mutated in place).

        Args:
            sentiment (int): Sentiment class of the item; used as a list index.
            tokens (list): Tokens of the item.
            word_count (list): Per-class count of training items.
            token_count (list): Per-class total token count.
            rel_token_count_dict (list of dicts): Per-class token -> occurrence count.
        """
        word_count[sentiment] += 1
        token_count[sentiment] += len(tokens)

        # Look up the class's dict once instead of on every token.
        counts_for_class = rel_token_count_dict[sentiment]
        for token in tokens:
            counts_for_class[token] = counts_for_class.get(token, 0) + 1

    def calculate_vocab_size(self, words):
        """
        Count the distinct tokens across all training items.

        Args:
            words (list): Training items, each exposing ``tokens``.

        Returns:
            int: Vocabulary size.
        """
        # A set keeps one copy of each token, so its size is the vocabulary size.
        unique_tokens = {token for word in words for token in word.tokens}
        return len(unique_tokens)

    def train(self, words, sentiment_scale):
        """
        Compute all training counts for the given items.

        Args:
            words (list): Training items, each exposing ``sentiment`` and ``tokens``.
            sentiment_scale (list[int] | int): The sentiment classes, or the
                number of classes.

        Returns:
            dict: The computed counts. Note that the keys
            ``"sentence_count_in_sentiment "`` and ``"token_counts_in_sentiment "``
            end with a space; the other keys are ``"number_token_sentiments"``
            and ``"magnitude_of_vocab"``.
        """
        word_count, token_count, rel_token_count_dict = self.initialize_counters(sentiment_scale)

        # Accumulate per-class counts item by item.
        for word in words:
            self.process_sentiment_tokens(
                word.sentiment, word.tokens, word_count, token_count, rel_token_count_dict
            )

        magnitude_of_vocab = self.calculate_vocab_size(words)

        # Keys are kept exactly as before (trailing spaces included) because
        # callers index the result by these strings.
        return {
            "sentence_count_in_sentiment ": word_count,
            "number_token_sentiments": token_count,
            "token_counts_in_sentiment ": rel_token_count_dict,
            "magnitude_of_vocab": magnitude_of_vocab,
        }