In [1]:
import os
import re
import numpy as np
from glob import glob
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score

In [None]:
# Download the positive / negative comment files read by the cells below.
# -nc (--no-clobber) skips the download when the file already exists, so
# re-running this cell does not pile up positive.txt.1, negative.txt.1, ... copies.
!wget -nc https://raw.githubusercontent.com/Tou7and/slp-nutshell/main/text_classification/data/facebook_tsai/positive.txt
!wget -nc https://raw.githubusercontent.com/Tou7and/slp-nutshell/main/text_classification/data/facebook_tsai/negative.txt

In [2]:
def tokenize_unigram(text):
    """Split ``text`` into a list holding each of its characters in order."""
    return [char for char in text]

def keep_mandarin(sent):
    """Return ``sent`` with every character outside the Chinese character class removed.

    Characters matched by the class below are kept in their original order and
    concatenated; everything else (Latin letters, digits, punctuation,
    whitespace, ...) is dropped. An input with no matching character yields "".
    """
    zh_char = re.compile(u'[⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]')
    # The pattern has no capture groups, so findall yields the matched characters themselves.
    return "".join(zh_char.findall(sent))


def load_data_from_file(text_file):
    """Read a text file and return one cleaned string per line.

    Args:
        text_file: Path of the text file to read, one sample per line.
            The file is decoded as UTF-8.

    Returns:
        list[str]: For every line, in file order, the result of
        ``keep_mandarin(line)``. Lines without any kept character produce an
        empty string, so the list has exactly as many entries as the file has
        lines.
    """
    # Decode explicitly as UTF-8: without it open() falls back to the platform
    # default encoding, which on non-UTF-8 locales fails on (or garbles) the
    # Chinese text this loader is used for.
    with open(text_file, 'r', encoding='utf-8') as reader:
        return [keep_mandarin(line) for line in reader]

def load_sentiment_data_from_file(pos_file, neg_file, n_train=150, n_test=100):
    """Build labelled train/test splits from a positive and a negative text file.

    Both files are read with ``load_data_from_file``. For each class the first
    samples form the training part and the last ``n_test`` samples form the
    test part.

    Args:
        pos_file: Path of the file holding the positive samples.
        neg_file: Path of the file holding the negative samples.
        n_train: Number of leading samples per class used for training
            (default 150). Pass ``None`` to train on everything except the
            last ``n_test`` samples of each class.
        n_test: Number of trailing samples per class held out for testing
            (default 100).

    Returns:
        dict: ``{"train": (corpus, labels), "test": (corpus, labels)}`` where
        each corpus lists the positive samples followed by the negative ones
        and labels holds the matching ``"pos"`` / ``"neg"`` strings.

    Note:
        With an explicit ``n_train`` the two slices are taken independently,
        so they overlap when a file has fewer than ``n_train + n_test`` lines.
    """
    def _split(samples):
        """Return the (train, test) slices of one class's samples."""
        # samples[-0:] would return the whole list, so treat n_test == 0 explicitly.
        held_out = samples[-n_test:] if n_test > 0 else []
        if n_train is None:
            # Everything that is not held out; never a negative slice bound.
            train_end = max(len(samples) - n_test, 0)
        else:
            train_end = n_train
        return samples[:train_end], held_out

    pos_train, pos_test = _split(load_data_from_file(pos_file))
    neg_train, neg_test = _split(load_data_from_file(neg_file))

    corpus_train = pos_train + neg_train
    labels_train = ["pos"] * len(pos_train) + ["neg"] * len(neg_train)
    corpus_test = pos_test + neg_test
    labels_test = ["pos"] * len(pos_test) + ["neg"] * len(neg_test)

    return {
        "train": (corpus_train, labels_train),
        "test": (corpus_test, labels_test),
    }

In [3]:
# Read both comment files and build the labelled train/test splits
dataset = load_sentiment_data_from_file("./positive.txt", "./negative.txt")

# Unpack the (corpus, labels) pair of each split
(train_corpus, train_labels), (test_corpus, test_labels) = dataset["train"], dataset["test"]

# Peek at both ends of the training data: the head holds "pos" samples, the tail "neg" samples
for preview in (train_corpus, train_labels):
    print(preview[:5], "\n", preview[-5:])

['哈哈小英總統真的好帥', '辛苦了', '小英總統加油', '基隆的資深優質立委', '智仁勇'] 
 ['你當總統候讓我很後悔投你一票當初所言的你做到了什麼還不是跟國民黨一樣肥缺都是身邊的親戚來當到現在沒看過你做的有當擔的事情太平島你卻是軟弱的人', '原來童軍也是藍綠戰場', '懸賞一億檢舉國民黨黨產都上任了還把黨派鬥爭放在第一位', '您好不要怪人民是自己沒有把事情做好都已經是大人了是非善惡因該要知道政府要対抗的不是人民是外來的國家侵略等自己要怎庅領導國家也要懂感恩阿彌陀佛', '當然新政府不一樣不護漁不護國土也不願自己少領']
['pos', 'pos', 'pos', 'pos', 'pos'] 
 ['neg', 'neg', 'neg', 'neg', 'neg']


In [4]:
# Features: bag-of-words counts from scikit-learn's CountVectorizer
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

# Initialise the vectorizer: every character is a token (tokenize_unigram) and
# tokens seen in fewer than 2 training documents are dropped (min_df=2).
# token_pattern=None because the default pattern is unused once a custom
# tokenizer is given; leaving it set makes scikit-learn emit a UserWarning.
vectorizer = CountVectorizer(tokenizer=tokenize_unigram, token_pattern=None, min_df=2)

# Learn the vocabulary from the training corpus only
vectorizer.fit(train_corpus)

# Inspect the learned features
print(vectorizer.get_feature_names_out().tolist())

['一', '丁', '七', '三', '上', '下', '不', '且', '世', '並', '中', '主', '之', '九', '也', '亂', '了', '事', '二', '五', '些', '亡', '交', '享', '亮', '人', '什', '仁', '今', '介', '他', '付', '代', '令', '以', '件', '任', '份', '企', '伉', '伐', '休', '但', '佈', '位', '低', '住', '佑', '何', '佛', '作', '你', '佳', '來', '例', '依', '便', '係', '促', '保', '信', '個', '們', '倒', '候', '借', '假', '做', '停', '健', '偵', '備', '傲', '傳', '傷', '傻', '像', '億', '優', '儷', '元', '先', '光', '克', '免', '兒', '兔', '入', '內', '全', '兩', '八', '公', '六', '共', '兵', '其', '兼', '再', '凱', '出', '分', '切', '初', '判', '別', '利', '到', '制', '刻', '則', '前', '副', '劃', '劇', '力', '功', '加', '助', '努', '勇', '動', '務', '勞', '勢', '勾', '包', '化', '區', '千', '升', '午', '半', '協', '南', '博', '危', '即', '卻', '原', '厭', '厲', '去', '參', '又', '及', '友', '反', '取', '受', '口', '句', '另', '只', '叫', '可', '史', '司', '吃', '各', '合', '同', '名', '吐', '向', '否', '吧', '呀', '告', '呢', '周', '呵', '命', '和', '哀', '品', '哈', '員', '哥', '哦', '哪', '哲', '唱', '商', '啊', '問', '啦', '善', '喔', '單', '嗎', '嘴', '嚴', '囂', '四', '回', '因', '困', '國', '圖',



In [5]:
# Train a DecisionTree
# https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

# Extract features with the vectorizer fitted above
training_feats = vectorizer.transform(train_corpus)

# random_state fixed so the fitted tree is reproducible across runs
clf = DecisionTreeClassifier(random_state=0)
clf.fit(training_feats, train_labels)

# Vectorise and classify the whole test set in one call instead of
# re-transforming and predicting sample by sample.
test_counts = vectorizer.transform(test_corpus).toarray()
y_pred = clf.predict(test_counts).tolist()

# Compare the predictions with the reference labels
# (scikit-learn's signature is accuracy_score(y_true, y_pred))
print("Accuracy on testing set: ", accuracy_score(test_labels, y_pred))

Accuracy on testing set:  0.73


In [6]:
# Run the trained classifier on a few hand-written sentences
samples = [
    "政府實在過於無能",
    "政府很有效率",
    "阿不就好棒棒",
    "索尼罪大惡極 百姓怨聲載道",
]

# Vectorise and classify all sentences at once
sample_preds = clf.predict(vectorizer.transform(samples).toarray())

for idx, sample in enumerate(samples):
    # Slice rather than index so each prediction prints as a one-element array, e.g. ['pos']
    print(sample, sample_preds[idx:idx + 1])

政府實在過於無能 ['pos']
政府很有效率 ['pos']
阿不就好棒棒 ['neg']
索尼罪大惡極 百姓怨聲載道 ['pos']


In [None]:
# Practice: Train a better classifier

# Hint 1: try new features, e.g. a bag of bigrams,
# or any other features you can think of or find.
# If you have no idea:
# https://github.com/Tou7and/slp-nutshell/blob/main/text_classification/bag_of_ngrams.py

def tokenize_bigram(text):
    """Split ``text`` into overlapping character bigrams.

    Args:
        text: The string to tokenize.

    Returns:
        list[str]: Every pair of adjacent characters, in order, e.g.
        ``"abc" -> ["ab", "bc"]``. Strings shorter than two characters yield
        an empty list.
    """
    # Slide a window of width 2 over the text; range(len - 1) is empty for
    # inputs of length 0 or 1, so no partial bigram is produced.
    return [text[i:i + 2] for i in range(len(text) - 1)]

# token_pattern=None because the default pattern is unused once a custom
# tokenizer is given; leaving it set makes scikit-learn emit a UserWarning.
vectorizer2 = CountVectorizer(tokenizer=tokenize_bigram, token_pattern=None, min_df=2)

# Learn the vocabulary from the training corpus, then extract its features
vectorizer2.fit(train_corpus)

training_feats = vectorizer2.transform(train_corpus)


# Hint 2: try new classifiers
# You can replace the Decision Tree with any other classifier you have learned before.
# If you have no idea:
# https://scikit-learn.org/stable/auto_examples/calibration/plot_compare_calibration.html
clf2 = DecisionTreeClassifier(random_state=0) # replace with other classifier

clf2.fit(training_feats, train_labels)

# Vectorise and classify the whole test set in one call instead of
# re-transforming and predicting sample by sample.
test_counts = vectorizer2.transform(test_corpus).toarray()
y_pred = clf2.predict(test_counts).tolist()

# scikit-learn's signature is accuracy_score(y_true, y_pred)
print("Accuracy on testing set: ", accuracy_score(test_labels, y_pred))

# Hand-written sentences to sanity-check the practice classifier
samples = [
    "政府實在過於無能",
    "政府很有效率",
    "阿不就好棒棒",
    "索尼罪大惡極 百姓怨聲載道",
]

# Vectorise and classify all sentences at once
sample_preds = clf2.predict(vectorizer2.transform(samples).toarray())

# Blank line separating the accuracy report from the per-sample predictions
print()
for idx, sample in enumerate(samples):
    # Slice rather than index so each prediction prints as a one-element array, e.g. ['pos']
    print(sample, sample_preds[idx:idx + 1])