In [1]:
import sys
from collections import Counter

In [2]:
# Path of the training corpus; this cell reads the file and cleans its text.
train_text_file = "./train.txt"

def read_and_clean(text_file, encoding="utf-8"):
    """ Read and clean Mandarin data.
        Treat each character as a word.
        Remove unwanted symbols (newlines and ASCII spaces) using replace.

    Args:
        text_file(str): the file path of the text file.
        encoding(str): codec used to decode the file. Defaults to "utf-8"
            so that decoding does not depend on the platform's locale.

    Returns:
        words(str): the file content as one string, with every newline and
            space character removed.
    """
    # An explicit encoding keeps non-ASCII text from being mis-decoded (or
    # raising UnicodeDecodeError) where the locale default is not UTF-8.
    with open(text_file, 'r', encoding=encoding) as reader:
        words = reader.read()
    words = words.replace("\n", "")
    words = words.replace(" ", "")
    return words

words = read_and_clean(train_text_file)
# Preview the first 60 characters of the cleaned corpus, 20 per line.
for start in (0, 20, 40):
    print(words[start:start + 20])

自從俄羅斯入侵烏克蘭開始台灣海峽的安全問
題再次成為外界關注焦點烏克蘭危機加劇之際
台灣社會及輿論也就台灣的軍事戰略及訓練方


In [3]:
def get_unigram(words, norm=True):
    """ Count how often each word occurs, optionally as relative frequencies.

    Args:
        words(str): sequence to count; iterating it yields one word at a time.
        norm(bool): when true, divide every count by len(words).

    Returns:
        Counter: maps each word to its count (norm false) or to
            count / len(words) (norm true).
    """
    total = len(words)
    frequencies = Counter(token for token in words)

    if not norm:
        return frequencies

    # Only the values of existing keys are rewritten, so key order is kept.
    for token, count in list(frequencies.items()):
        frequencies[token] = count / total
    return frequencies

In [28]:
unigrams = get_unigram(words)

# Show the ten most frequent characters with their relative frequencies.
# (Iterate `unigrams`, the name assigned above; `unigram` was never defined.)
for k, w in unigrams.most_common(10):
    print(k, w)

台 0.034139402560455195
灣 0.034139402560455195
的 0.029871977240398292
國 0.02702702702702703
戰 0.02275960170697013
軍 0.01991465149359886
不 0.01849217638691323
對 0.017069701280227598
稱 0.01422475106685633
防 0.012802275960170697


In [4]:
def get_bigram(words, norm=True):
    """ Count adjacent word pairs, optionally as relative frequencies.

    Args:
        words(str): sequence to scan; each slice words[i:i+2] taken at
            i = 0 .. len(words)-2 is one bigram.
        norm(bool): when true, divide every count by the number of bigrams.

    Returns:
        Counter: maps each bigram to its count (norm false) or to
            count / number-of-bigrams (norm true).
    """
    pairs = [words[start:start + 2] for start in range(len(words) - 1)]
    total_pairs = len(pairs)
    pair_counts = Counter(pairs)

    if norm:
        # Only the values of existing keys are rewritten, so key order is kept.
        for pair, count in list(pair_counts.items()):
            pair_counts[pair] = count / total_pairs
    return pair_counts

In [9]:
bigrams = get_bigram(words, norm=False)

# Ten most frequent bigrams with their raw counts.
for k, w in bigrams.most_common(10):
    print(k, w)

# Every bigram whose first character is `context`, most frequent first.
context = "軍"
print()
# Print the variable itself so the header always matches the filter below.
print("Context: {}".format(context))
for k, w in bigrams.most_common():
    if k[0] == context:
        print(k, w)

台灣 24
不對 9
對稱 9
烏克 6
克蘭 6
軍事 6
稱作 6
作戰 6
國防 5
成為 4

Context: 軍
軍事 6
軍力 1
軍新 1
軍艦 1
軍之 1
軍售 1
軍構 1
軍隊 1
軍思 1


In [8]:
# Trigram model. To print the top 10 trigrams of the training corpus, call
# get_trigram(words).most_common(10) in a separate cell.
def get_trigram(words, norm=True):
    """ Use a Counter to store the trigram model.

    Args:
        words(str): sequence to scan; each slice words[i:i+3] taken at
            i = 0 .. len(words)-3 is one trigram.
        norm(bool): when true, divide every count by the number of trigrams.

    Returns:
        Counter: maps each trigram to its count (norm false) or to
            count / number-of-trigrams (norm true). Empty when words has
            fewer than three items.
    """
    # range() is empty for inputs shorter than 3, so no trigram is produced
    # and the normalization loop below never divides by zero.
    trigram_list = [words[idx:idx + 3] for idx in range(len(words) - 3 + 1)]
    n_trigram = len(trigram_list)
    trigram_counts = Counter(trigram_list)

    if norm:
        for k, w in trigram_counts.most_common():
            trigram_counts[k] = float(w / n_trigram)
    return trigram_counts