In [None]:
import os
import re
import numpy as np
from glob import glob
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score

In [None]:
# Fetch the positive and negative example files into the current working
# directory; the data-loading cell reads them as ./positive.txt and ./negative.txt.
!wget https://raw.githubusercontent.com/Tou7and/slp-nutshell/main/text_classification/data/facebook_tsai/positive.txt
!wget https://raw.githubusercontent.com/Tou7and/slp-nutshell/main/text_classification/data/facebook_tsai/negative.txt

In [None]:
def tokenize_unigram(text):
    """Split ``text`` into a list of its individual characters (character unigrams)."""
    return [char for char in text]

def keep_mandarin(sent):
    """Return ``sent`` reduced to the characters matched by the CJK character class.

    Matching characters are kept in their original order; everything the
    pattern does not match (e.g. ASCII letters, digits, whitespace, newlines)
    is dropped. A string without any matching character yields ``""``.
    """
    zh_matcher = re.compile(u'[⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]')
    # The pattern has no capture groups, so findall yields each matched
    # character as a plain string, ready to be joined back together.
    return "".join(zh_matcher.findall(sent))


def load_data_from_file(text_file, encoding="utf-8"):
    """Read a text file and return one filtered string per line.

    Args:
        text_file: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding
            (assumes the data files are UTF-8 — pass another value otherwise).

    Returns:
        A list with one entry per line of the file, each line passed through
        ``keep_mandarin``. Lines without any matching character become ``""``,
        so the list length equals the number of lines in the file.
    """
    # Decode explicitly: with the locale default, Chinese text is mis-decoded
    # or raises UnicodeDecodeError on platforms whose default is not UTF-8.
    with open(text_file, 'r', encoding=encoding) as reader:
        return [keep_mandarin(line) for line in reader]


def _split_head_tail(samples, n_train, n_test):
    """Split one class into (leading training items, trailing test items) without overlap."""
    # Index where the held-out tail starts. Computed explicitly because
    # samples[-0:] would return the whole list instead of an empty one.
    test_start = max(len(samples) - n_test, 0)
    # Never let the training slice run into the held-out tail.
    train = samples[:min(n_train, test_start)]
    test = samples[test_start:]
    return train, test


def load_sentiment_data_from_file(pos_file, neg_file, n_train=150, n_test=100):
    """Load both sentiment files and build labelled train/test splits.

    For each class the last ``n_test`` lines are held out for testing and the
    first ``n_train`` lines are used for training. If a file is too short for
    both, the training part is shortened so that no sentence appears in both
    splits (the previous fixed slices silently overlapped in that case).

    Args:
        pos_file: Path of the file with positive examples, one per line.
        neg_file: Path of the file with negative examples, one per line.
        n_train: Maximum number of training examples taken per class.
        n_test: Number of test examples taken per class.

    Returns:
        A dict with keys ``"train"`` and ``"test"``; each value is a
        ``(corpus, labels)`` tuple of equal-length lists, where labels are
        ``"pos"`` or ``"neg"`` and positive examples come first.

    Raises:
        ValueError: If ``n_train`` or ``n_test`` is negative.
    """
    if n_train < 0 or n_test < 0:
        raise ValueError("n_train and n_test must be non-negative")

    pos_train, pos_test = _split_head_tail(load_data_from_file(pos_file), n_train, n_test)
    neg_train, neg_test = _split_head_tail(load_data_from_file(neg_file), n_train, n_test)

    corpus_train = pos_train + neg_train
    labels_train = ["pos"] * len(pos_train) + ["neg"] * len(neg_train)
    corpus_test = pos_test + neg_test
    labels_test = ["pos"] * len(pos_test) + ["neg"] * len(neg_test)

    return {
        "train": (corpus_train, labels_train),
        "test": (corpus_test, labels_test),
    }



In [None]:

# Load Data!

# Minimum document frequency for a character to enter the vocabulary
# (1 is CountVectorizer's own default, i.e. keep every character seen).
MIN_DF = 1
# Set to a value > 0 to also print the gold labels and the predictions.
VERBOSE = 0

dataset = load_sentiment_data_from_file(
    "./positive.txt",
    "./negative.txt"
)

train_corpus, train_labels = dataset["train"]
test_corpus, test_labels = dataset["test"]

# Character-unigram bag-of-words features.
vectorizer = CountVectorizer(tokenizer=tokenize_unigram, stop_words=["，","。", "\n", " "], min_df=MIN_DF)

counts = vectorizer.fit_transform(train_corpus).toarray()

clf = DecisionTreeClassifier(random_state=0)
clf.fit(counts, train_labels)

# Vectorize and classify the whole test set in one batch instead of
# re-running the vectorizer once per sentence.
test_counts = vectorizer.transform(test_corpus).toarray()
y_pred = clf.predict(test_counts).tolist()

# accuracy_score takes (y_true, y_pred), in that order.
print(accuracy_score(test_labels, y_pred))
if VERBOSE > 0:
    print(test_labels)
    print(y_pred)

In [None]:
samples = [
    "政府實在過於無能",
    "政府很有效率",
    "阿不就好棒棒",
    "索尼罪大惡極 百姓怨聲載道",
]

# NOTE(review): `train_bow` is not defined in any cell visible here — confirm
# it is provided before this cell runs. From its use below it is expected to
# return a (vectorizer, classifier) pair.
# Train one decision-tree model per feature type and classify every sample.
for feat in ("unigram", "bigram"):
    vectorizer, clf = train_bow(feat=feat, cla="dctree")
    for sample in samples:
        sample_counts = vectorizer.transform([sample]).toarray()
        print(sample, clf.predict(sample_counts))