In [8]:
from nltk.stem import PorterStemmer
import re
import math
import csv
from collections import defaultdict

In [9]:
# Stopword sets already read from disk, keyed by file path.
_STOPWORDS_CACHE = {}


def _load_stopwords(path='./stopwords.txt'):
    """Return the stopwords stored at ``path`` (one per line), reading the file only once.

    The result is cached for the lifetime of this definition, so edits made to
    the file afterwards are not picked up until the cell is re-run.
    """
    if path not in _STOPWORDS_CACHE:
        with open(path, 'r', encoding='utf-8') as stopword_file:
            _STOPWORDS_CACHE[path] = frozenset(stopword_file.read().splitlines())
    return _STOPWORDS_CACHE[path]


def preprocessing(document):
    """Turn raw text into a list of normalized, stemmed terms.

    Pipeline: whitespace tokenization -> lowercasing -> stopword removal ->
    punctuation stripping -> stopword removal -> Porter stemming ->
    stopword removal -> removal of digit-only and single-character tokens.

    Args:
        document: Raw document text.

    Returns:
        list[str]: Surviving terms in document order; duplicates are kept.

    Raises:
        OSError: If ``./stopwords.txt`` cannot be opened.
    """
    stopwords = _load_stopwords()

    lowercase_tokens = [token.lower() for token in document.split()]

    # First pass runs on the raw tokens, so a stopword entry that itself
    # contains one of the stripped characters (presumably e.g. an apostrophe —
    # depends on the stopword file) can still match.
    filtered_tokens = [token for token in lowercase_tokens if token not in stopwords]

    # Removes the listed characters anywhere in the token, not only at its ends.
    stripped_tokens = [re.sub(r'[,.!?"@()%`\':;{}$&*-]+', '', token) for token in filtered_tokens]

    # Second pass: a token such as "because," is only recognizable as a
    # stopword once the comma is gone. Without this pass it would be stemmed
    # to "becaus" and slip through the final stopword check below.
    stripped_tokens = [token for token in stripped_tokens if token != '' and token not in stopwords]

    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in stripped_tokens]
    # Stemming can map a non-stopword onto a stopword, so filter once more.
    stemmed_tokens = [token for token in stemmed_tokens if token not in stopwords]

    words = [token for token in stemmed_tokens if not token.isdigit() and len(token) > 1]

    return words

In [10]:
def calculate_word_frequencies(training_data):
    """Collect document frequencies of terms over the training set.

    Every document ``./data/<doc_id>.txt`` listed in ``training_data`` is read
    and run through ``preprocessing``; each distinct term of a document is
    counted once for that document.

    Args:
        training_data: Mapping of category -> iterable of document ids.

    Returns:
        tuple: ``(term_category_freq, term_present_num)`` where
        ``term_category_freq[category][term]`` is the number of documents of
        ``category`` containing ``term`` and ``term_present_num[term]`` is the
        number of training documents containing ``term``. Both are
        ``defaultdict`` objects that yield 0 for unseen keys.
    """
    per_category_doc_freq = defaultdict(lambda: defaultdict(int))
    overall_doc_freq = defaultdict(int)

    for category, doc_ids in training_data.items():
        for doc_id in doc_ids:
            with open(f'./data/{doc_id}.txt', "r", encoding="utf-8") as handle:
                distinct_terms = set(preprocessing(handle.read()))

            # Presence, not occurrence count: one increment per document.
            for term in distinct_terms:
                per_category_doc_freq[category][term] += 1
                overall_doc_freq[term] += 1

    return per_category_doc_freq, overall_doc_freq

In [11]:
def _mle_binomial_log_likelihood(hits, misses):
    """Return log(p**hits * (1 - p)**misses) at p = hits / (hits + misses).

    Terms with a zero count contribute nothing (0 * log 0 is taken as 0), so
    the function is defined for every pair of non-negative counts.
    """
    trials = hits + misses
    log_likelihood = 0.0
    if hits:
        log_likelihood += hits * math.log(hits / trials)
    if misses:
        log_likelihood += misses * math.log(misses / trials)
    return log_likelihood


def calculate_likelihood_ratios(term_category_freq, term_present_num,
                                total_doc_num=195, category_doc_num=15,
                                select_feature_num=30):
    """Select features by the log-likelihood-ratio statistic (-2 log lambda).

    For every (category, term) pair a 2x2 document-count table is built:
    n11 = in category, has term; n10 = in category, lacks term;
    n01 = other categories, has term; n00 = other categories, lacks term.
    The score compares "one term rate for all documents" against "separate
    term rates inside and outside the category". It is evaluated in log space
    so that large document counts cannot underflow to zero.

    Args:
        term_category_freq: ``{category: {term: docs of category with term}}``.
        term_present_num: ``{term: docs with term}`` over all categories.
        total_doc_num: Number of training documents.
        category_doc_num: Number of training documents per category (every
            category is assumed to have this same size).
        select_feature_num: How many top-scoring terms to keep per category.

    Returns:
        list: Selected terms without duplicates, ordered by category (in the
        iteration order of ``term_category_freq``) and then by descending
        score. Equal scores are ordered by term so the result is reproducible.

    Raises:
        ValueError: If the counts are inconsistent with ``total_doc_num`` /
            ``category_doc_num`` (a table cell would be negative).
    """
    features = []
    already_selected = set()

    for category, category_freq in term_category_freq.items():
        scored_terms = []
        for term, freq in category_freq.items():
            n11 = freq
            n01 = term_present_num[term] - freq
            n10 = category_doc_num - freq
            n00 = total_doc_num - term_present_num[term] - category_doc_num + freq
            if min(n11, n01, n10, n00) < 0:
                raise ValueError(
                    f"inconsistent document counts for term {term!r} in category {category!r}: "
                    f"n11={n11}, n01={n01}, n10={n10}, n00={n00}"
                )

            # -2 * log(L(pooled rate) / L(separate rates))
            pooled = _mle_binomial_log_likelihood(n11 + n01, n10 + n00)
            separate = (_mle_binomial_log_likelihood(n11, n10)
                        + _mle_binomial_log_likelihood(n01, n00))
            scored_terms.append((term, -2 * (pooled - separate)))

        # The secondary key makes the top-N cut independent of dict insertion order.
        scored_terms.sort(key=lambda item: (-item[1], item[0]))

        for term, _score in scored_terms[:select_feature_num]:
            if term not in already_selected:
                already_selected.add(term)
                features.append(term)

    return features

In [12]:
def train_classifier(training_data, features, total_docs=None):
    """Estimate Naive Bayes priors and add-one-smoothed term probabilities.

    Every document ``./data/<doc>.txt`` listed in ``training_data`` is read
    and run through ``preprocessing``.

    Args:
        training_data: Mapping of category -> list of document ids.
        features: Selected vocabulary; only these terms get a probability.
        total_docs: Number of training documents used for the priors.
            Defaults to the number of ids found in ``training_data``.

    Returns:
        tuple: ``(prior_probabilities, conditional_probabilities)`` where
        ``prior_probabilities[category]`` is ``len(docs) / total_docs`` and
        ``conditional_probabilities[category][word]`` is
        ``(count of word in category + 1) / (tokens in category + len(features))``.

    Raises:
        ValueError: If ``training_data`` has categories but no documents and
            ``total_docs`` was not supplied.
    """
    if total_docs is None:
        total_docs = sum(len(docs) for docs in training_data.values())
    if training_data and total_docs <= 0:
        raise ValueError("training_data contains no documents")

    prior_probabilities = {category: len(docs) / total_docs for category, docs in training_data.items()}

    feature_set = set(features)  # constant-time membership tests
    vocabulary_size = len(features)

    conditional_probabilities = defaultdict(lambda: defaultdict(lambda: 0))
    for category, docs in training_data.items():
        category_word_freq = defaultdict(int)
        total_words = 0
        for doc in docs:
            with open(f'./data/{doc}.txt', "r", encoding="utf-8") as file:
                document = file.read()
            tokens = preprocessing(document)
            # NOTE(review): every token counts towards the denominator, not
            # only feature tokens (kept from the original), so the smoothed
            # probabilities over `features` sum to less than 1.
            total_words += len(tokens)
            for word in tokens:
                if word in feature_set:
                    category_word_freq[word] += 1

        # Add-one smoothing: the vocabulary size belongs in the denominator.
        denominator = total_words + vocabulary_size
        for word in features:
            conditional_probabilities[category][word] = (category_word_freq[word] + 1) / denominator

    return prior_probabilities, conditional_probabilities

In [13]:
def predict(document, prior_probabilities, conditional_probabilities, features):
    """Return the category with the highest Naive Bayes log-score for ``document``.

    The score of a category is the log of its prior plus, for every token of
    the preprocessed document that is in ``features`` (repeats included), the
    log of that token's conditional probability in the category.

    Args:
        document: Raw document text.
        prior_probabilities: Mapping of category -> prior probability.
        conditional_probabilities: ``[category][word]`` -> probability.
        features: Collection of terms allowed to contribute to the score.

    Returns:
        The best-scoring category; on equal scores the category that comes
        first in ``prior_probabilities`` wins.

    Raises:
        ValueError: From ``math.log`` when a probability is not positive, or
            from ``max`` when ``prior_probabilities`` is empty.
    """
    # Which tokens count does not depend on the category, so filter once.
    scoring_terms = [term for term in preprocessing(document) if term in features]

    log_scores = {}
    for label, prior in prior_probabilities.items():
        running_total = math.log(prior)
        for term in scoring_terms:
            running_total += math.log(conditional_probabilities[label][term])
        log_scores[label] = running_total

    return max(log_scores, key=lambda label: log_scores[label])

In [14]:
# Document ids run from 1 to TOTAL_DOCUMENTS inclusive.
TOTAL_DOCUMENTS = 1095

# Each non-empty line of training.txt is parsed as whitespace-separated
# integers: the category first, then the ids of its training documents.
training_data = {}
with open("./training.txt", "r", encoding="utf-8") as file:
    for line in file:
        parts = line.split()
        if not parts:
            # Tolerate blank lines instead of failing on parts[0].
            continue
        category = int(parts[0])
        document_ids = [int(part) for part in parts[1:]]
        training_data[category] = document_ids

term_category_freq, term_present_num = calculate_word_frequencies(training_data)
features = calculate_likelihood_ratios(term_category_freq, term_present_num)
prior_probabilities, conditional_probabilities = train_classifier(training_data, features)

# One set lookup per document instead of rescanning every category's id list.
training_doc_ids = {doc_id for docs in training_data.values() for doc_id in docs}

# Classify every document that was not used for training.
predictions = []
for doc_id in range(1, TOTAL_DOCUMENTS + 1):
    if doc_id in training_doc_ids:
        continue
    with open(f'./data/{doc_id}.txt', 'r', encoding='utf-8') as file:
        document = file.read()
    predicted_class = predict(document, prior_probabilities, conditional_probabilities, features)
    predictions.append((doc_id, predicted_class))

with open('./predict.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['Id', 'Value'])
    writer.writerows(predictions)