In [None]:
import os
import re
import numpy as np
from glob import glob
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score

In [None]:
!wget https://raw.githubusercontent.com/Tou7and/slp-nutshell/main/text_classification/data/facebook_tsai/positive.txt
!wget https://raw.githubusercontent.com/Tou7and/slp-nutshell/main/text_classification/data/facebook_tsai/negative.txt

In [None]:
def tokenize_unigram(text):
    """Split ``text`` into a list of its individual characters (unigrams)."""
    return [char for char in text]

def keep_mandarin(sent):
    """Return ``sent`` with every character outside the pattern's CJK ranges removed.

    The surviving characters keep their original order and are concatenated
    without separators; an input with no matching character yields "".
    """
    han_pattern = re.compile(u'[⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]')
    # The pattern has no capture groups, so findall yields the matched characters.
    return "".join(han_pattern.findall(sent))


def load_data_from_file(text_file, encoding="utf-8"):
    """Read a text file and reduce every line to its Mandarin characters.

    Args:
        text_file: Path of the file to read, one example per line.
        encoding: Text encoding used to decode the file. Passed explicitly so
            the result does not depend on the platform's default locale.

    Returns:
        list of str, one entry per line of the file, each filtered through
        ``keep_mandarin``. A line without any matching character is kept as an
        empty string, so the list length equals the number of lines.
    """
    with open(text_file, 'r', encoding=encoding) as reader:
        return [keep_mandarin(line) for line in reader]

def load_sentiment_data_from_file(pos_file, neg_file, n_train=150, n_test=100):
    """Build labelled train/test splits from a positive and a negative text file.

    Each file is read with ``load_data_from_file``. Per class, the last
    ``n_test`` lines form the test set and the first ``n_train`` of the
    remaining lines form the training set, so the two splits never share a
    line even when a file has fewer than ``n_train + n_test`` lines.

    Args:
        pos_file: Path of the file holding the positive examples.
        neg_file: Path of the file holding the negative examples.
        n_train: Maximum number of training examples taken per class.
        n_test: Number of examples held out per class for testing.

    Returns:
        dict with the keys "train" and "test". Each value is a
        ``(corpus, labels)`` tuple: ``corpus`` lists the positive texts
        followed by the negative ones, ``labels`` holds the matching
        "pos" / "neg" strings.

    Raises:
        ValueError: If ``n_train`` or ``n_test`` is negative.
    """
    if n_train < 0 or n_test < 0:
        raise ValueError("n_train and n_test must be non-negative")

    def _split(samples):
        # Cut the test tail off first, then take the training head from what
        # is left; this keeps the splits disjoint for short files and also
        # handles n_test == 0 (a plain samples[-0:] would return everything).
        test_start = max(len(samples) - n_test, 0)
        return samples[:test_start][:n_train], samples[test_start:]

    pos_train, pos_test = _split(load_data_from_file(pos_file))
    neg_train, neg_test = _split(load_data_from_file(neg_file))

    corpus_train = pos_train + neg_train
    labels_train = ["pos"] * len(pos_train) + ["neg"] * len(neg_train)
    corpus_test = pos_test + neg_test
    labels_test = ["pos"] * len(pos_test) + ["neg"] * len(neg_test)

    return {
        "train": (corpus_train, labels_train),
        "test": (corpus_test, labels_test),
    }

In [None]:
# Read both downloaded files and build the labelled splits
dataset = load_sentiment_data_from_file(pos_file="./positive.txt", neg_file="./negative.txt")

# Unpack the (corpus, labels) pair of each split
train_corpus, train_labels = dataset["train"]
test_corpus, test_labels = dataset["test"]

# Peek at the first and last five training texts, then at their labels
for column in (train_corpus, train_labels):
    print(column[:5], "\n", column[-5:])

In [None]:
# Features: bag-of-words counts from scikit-learn's CountVectorizer
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

# Initialise. token_pattern=None because a custom tokenizer is supplied: the
# default pattern would never be used and scikit-learn warns about it.
# min_df=2 drops characters that occur in fewer than two training documents.
vectorizer = CountVectorizer(tokenizer=tokenize_unigram, token_pattern=None, min_df=2)

# Learn the vocabulary from the training corpus only
vectorizer.fit(train_corpus)

# Inspect the learned features
print(vectorizer.get_feature_names_out().tolist())

In [None]:
# Train a DecisionTree
# https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

# Extract features with the vectorizer fitted above
training_feats = vectorizer.transform(train_corpus)

# random_state is fixed so that re-running the cell gives the same tree
clf = DecisionTreeClassifier(random_state=0)

clf.fit(training_feats, train_labels)

# Vectorize the whole test set once and classify it in a single batch call,
# instead of transforming and predicting one document at a time.
test_counts = vectorizer.transform(test_corpus).toarray()
y_pred = clf.predict(test_counts)

# Compare the predictions with the gold labels (accuracy_score takes y_true first)
print("Accuracy on testing set: ", accuracy_score(test_labels, y_pred))

In [None]:
# Run the trained classifier on a few hand-written sentences
samples = [
    "政府實在過於無能",
    "政府很有效率",
    "阿不就好棒棒",
    "索尼罪大惡極 百姓怨聲載道",
]

for text in samples:
    # transform expects an iterable of documents, hence the one-element list
    text_counts = vectorizer.transform([text]).toarray()
    print(text, clf.predict(text_counts))

In [None]:
# Practice: Train a better classifier

# Hint 1: try new features, e.g. bag-of-bigrams,
# or any other features you can think of or find.
def tokenize_bigram(text):
    """Split ``text`` into overlapping character bigrams.

    Reference solution for the exercise: slides a two-character window over
    the text, e.g. "abcd" -> ["ab", "bc", "cd"]. Texts shorter than two
    characters produce an empty list.

    An always-empty token list would leave CountVectorizer with no vocabulary
    to fit, which makes ``fit`` raise a ValueError.
    """
    return [text[i:i + 2] for i in range(len(text) - 1)]

# token_pattern=None because a custom tokenizer is supplied: the default
# pattern would never be used and scikit-learn warns about it.
vectorizer2 = CountVectorizer(tokenizer=tokenize_bigram, token_pattern=None, min_df=2)

vectorizer2.fit(train_corpus)

training_feats = vectorizer2.transform(train_corpus)
# print(vectorizer2.get_feature_names_out().tolist())

# Hint 2: try new classifiers
# You can replace the Decision Tree with any other classifiers on scikit learn
# https://scikit-learn.org/stable/auto_examples/calibration/plot_compare_calibration.html
clf2 = DecisionTreeClassifier(random_state=0)

clf2.fit(training_feats, train_labels)

# Vectorize the whole test set once and classify it in a single batch call,
# instead of transforming and predicting one document at a time.
test_counts = vectorizer2.transform(test_corpus).toarray()
y_pred = clf2.predict(test_counts)

# accuracy_score takes the gold labels first, then the predictions
print("Accuracy on testing set: ", accuracy_score(test_labels, y_pred))

samples = [
    "政府實在過於無能",
    "政府很有效率",
    "阿不就好棒棒",
    "索尼罪大惡極 百姓怨聲載道",
]

print()
for sample in samples:
    print(sample, clf2.predict(vectorizer2.transform([sample]).toarray()))