In [1]:
import pandas as pd
import os

In [2]:
# Root folder of the project data, followed by the names of the sub-folders
# that hold the input documents and the generated output.
basePath = '/Volumes/backup_128G/z_repository/Yang_data/學測指考英文'
dataPath = 'data_docx'
outputPath = 'output'

# Full source and destination directories, built from the pieces above.
from_dataPath = '/'.join([basePath, dataPath])
to_dataPath = '/'.join([basePath, outputPath])

# Read the dictionary that gives the level of each word

In [3]:
# Workbook with the word list; each sheet is read without a header row, so the
# columns are addressed by position (0, 1, 2) in the rest of the notebook.
file_dic = 'permantData/7000Words_20190512_v2.xlsx'
read_dic = '{0}/{1}'.format(basePath, file_dic)

# Collect every sheet first and concatenate once. DataFrame.append was
# deprecated in pandas 1.4 and removed in pandas 2.0, and appending inside the
# loop copies the accumulated frame again on every iteration.
sheetFrames = []
with pd.ExcelFile(read_dic) as reader:
    # read sheet by sheet
    for sheet in reader.sheet_names:
        sheetDf = pd.read_excel(reader, sheet_name=sheet, header=None)
        # Empty cells become 0 so later comparisons never meet NaN.
        sheetFrames.append(sheetDf.fillna(0))

dicDf = pd.concat(sheetFrames, ignore_index=True)

# change to lowercase
dicDf[0] = dicDf[0].str.lower()
len(dicDf.index)

6813

In [4]:
# Preview the first rows of the merged dictionary (columns are the positional
# labels 0, 1 and 2 because the sheets were read with header=None).
dicDf.head()

Unnamed: 0,0,1,2
0,a,art.,1
1,an,art.,1
2,abandon,v.,4
3,abbreviate,v.,6
4,abbreviation,n.,6


# Treat levels 2 through 6 as high level

In [5]:
# Levels that the rest of the notebook regards as "high level": 2, 3, 4, 5, 6.
highlevels = list(range(2, 7))
# Boolean mask over the dictionary rows whose level (column 2) is in that list.
dicHighLevel_TF = dicDf[2].isin(highlevels)
dicHighLevel = dicDf.loc[dicHighLevel_TF]
dicHighLevel.head()

Unnamed: 0,0,1,2
2,abandon,v.,4
3,abbreviate,v.,6
4,abbreviation,n.,6
5,abdomen,n.,4
6,abide,v.,5


# 將英文字 lemmatize（詞形還原）

In [6]:
# Lemmatization: reduce inflected words to their base (dictionary) form
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
# Module-level WordNet lemmatizer instance; lemmatizer() below calls it.
wtlem = WordNetLemmatizer()

def get_wordnet_pos(treebank_tag):
    """Translate a tag into the matching ``wordnet`` part-of-speech constant.

    The decision is made on the first character of ``treebank_tag`` only:
    'J' -> wordnet.ADJ, 'V' -> wordnet.VERB, 'N' -> wordnet.NOUN,
    'R' -> wordnet.ADV. Any other tag (including an empty one) gives None.
    """
    # First character of the tag -> name of the attribute to read from wordnet.
    attrByPrefix = {'J': 'ADJ', 'V': 'VERB', 'N': 'NOUN', 'R': 'ADV'}
    attrName = attrByPrefix.get(treebank_tag[:1])
    if attrName is None:
        return None
    return getattr(wordnet, attrName)
    
def findWord(word, dicDataF):
    """Look ``word`` up in column 0 of ``dicDataF``.

    Returns the column-2 value of the first matching row, or None when no row
    matches.
    """
    matches = dicDataF[dicDataF[0] == word]
    if len(matches.index) == 0:
        return None
    return matches.iloc[0][2]
    
def returnWord(word, level):
    """Bundle a word and its level into a dict with keys 'lemma' and 'level'."""
    return dict(lemma=word, level=level)

def lemmatizer(word, dicDataF):
    """Find the dictionary entry for ``word`` and return it with its level.

    The lower-cased word is looked up first; on a hit the result carries
    ``word`` exactly as it was passed in. Otherwise the word is lemmatized as
    an adjective, verb, noun and adverb (in that order) and the first lemma
    that differs from the lower-cased word and exists in ``dicDataF`` is
    returned. When nothing matches, ``word`` is returned with level 9.

    Returns a dict built by returnWord(): {'lemma': ..., 'level': ...}.
    """
    lowered = word.lower()

    # Direct hit on the surface form.
    level = findWord(lowered, dicDataF)
    if level is not None:
        return returnWord(word, level)

    # Try each part of speech in the fixed priority order.
    for pos in (wordnet.ADJ, wordnet.VERB, wordnet.NOUN, wordnet.ADV):
        candidate = wtlem.lemmatize(lowered, pos)
        if candidate == lowered:
            # Lemmatizing changed nothing, and that form was already looked up.
            continue
        level = findWord(candidate, dicDataF)
        if level is not None:
            return returnWord(candidate, level)

    return returnWord(word, 9)

# 讀檔；去掉非英文字，回傳 word 字串，包含引用次數

In [7]:
# pip install python-docx 
import docx

def isAlpha(word):
    """Return True only for a non-empty string made of ASCII letters."""
    try:
        asciiBytes = word.encode('ascii')
    except UnicodeEncodeError:
        # At least one character is outside ASCII.
        return False
    return asciiBytes.isalpha()
    
def isEnglish(s):
    """Return True only when ``s`` is non-empty and made of ASCII letters.

    ``str.isalpha()`` alone is also true for letters of other scripts (for
    example CJK characters), which is not what the name promises. Encoding to
    ASCII first restricts the check to A-Z / a-z.

    Returns False for an empty string, digits, punctuation and any string
    containing a non-ASCII character.
    """
    try:
        return s.encode('ascii').isalpha()
    except UnicodeEncodeError:
        # Contains a character outside ASCII, so it is not an English letter.
        return False
    
def wordCount(wordList):
    """Count how often each item occurs in ``wordList``.

    Returns a dict mapping item -> number of occurrences, with keys in order
    of first appearance.
    """
    tally = {}
    for token in wordList:
        tally[token] = tally.get(token, 0) + 1
    return tally
    
def split2Words(txt):
    """Split ``txt`` on whitespace and count the purely alphabetic tokens.

    Tokens rejected by isAlpha() are reported with print() and left out.
    Returns the dict produced by wordCount() for the accepted tokens.
    """
    acceptedWords = []
    for token in txt.split():
        if isAlpha(token):
            acceptedWords.append(token)
            continue
        if token != '':
            print(token, ' includes un-alpha characters.')

    return wordCount(acceptedWords)
    
def replaceMultiple(mainString, toBeReplaces, newString):
    """Replace every occurrence of each item of ``toBeReplaces`` by ``newString``.

    The items are applied one after another, in the given order, each on the
    result of the previous replacement. Returns the resulting string.
    """
    result = mainString
    for target in toBeReplaces:
        # str.replace leaves the text untouched when target does not occur.
        result = result.replace(target, newString)
    return result

def readTxt(filename):
    """Read a .docx file and return its word count, paragraphs and word tally.

    Paragraphs that are empty or consist only of spaces are skipped.

    Parameters:
        filename: path of the document, passed to docx.Document().

    Returns a tuple (wordCnt, plainTxt, wordCounts):
        wordCnt    -- number of whitespace-separated tokens in the kept
                      paragraphs (punctuation still attached).
        plainTxt   -- list with the original text of every kept paragraph.
        wordCounts -- result of split2Words() on the paragraphs after the
                      punctuation listed below has been replaced by spaces.
    """
    # Characters replaced by a space before the text is split into words.
    # NOTE(review): ':' is listed twice; one entry may have been meant as a
    # different colon character -- confirm against the source documents.
    separators = ['…', '’', '‘', '\'', '-', '—', '－', '”', '“', '/',
                  ';', '!', '.', '?', ',', ':', ':', '(', ')', '–']

    fullText = []
    # #VINCENT#, 20190917, To highlight the high level words
    plainTxt = []
    wordCnt = 0
    doc = docx.Document(filename)
    for paragraph in doc.paragraphs:
        paraText = paragraph.text
        if len(paraText.replace(' ', '')) == 0:
            continue

        # split() without an argument collapses runs of whitespace, whereas
        # split(' ') yields an empty string for every extra space and would
        # inflate the count.
        wordCnt += len(paraText.split())

        # #VINCENT#, 20190917, To highlight the high level words
        plainTxt.append(paraText)
        # A kept paragraph holds at least one non-space character and every
        # replacement inserts a space, so the processed text is never empty.
        fullText.append(replaceMultiple(paraText, separators, ' '))

    # #VINCENT#, 20190917, To highlight the high level words
    return wordCnt, plainTxt, split2Words(' '.join(fullText))

# 讀檔列表

In [8]:
def processDoc(wordList):
    """Keep the words of ``wordList`` whose dictionary level is a high level.

    Each word is lower-cased and passed to lemmatizer() with the global
    dictionary ``dicDf``; it is kept when the returned level is in the global
    ``highlevels``. Words are returned in their original spelling, without
    duplicates, in order of first appearance.
    """
    selected = []
    for token in wordList:
        info = lemmatizer(token.lower(), dicDf)
        # only care of high level words
        if info['level'] in highlevels and token not in selected:
            selected.append(token)
    return selected

# Parse Directory

In [18]:
from docx import Document
from docx.shared import Inches
from docx.enum.text import WD_COLOR_INDEX
from docx.enum.dml import MSO_THEME_COLOR
from docx.shared import RGBColor

# Seven highlight-colour choices. This list is not referenced by the functions
# in the cells shown here (highlight_run uses WD_COLOR_INDEX.YELLOW directly).
fontColorMap = [WD_COLOR_INDEX.YELLOW, WD_COLOR_INDEX.YELLOW, WD_COLOR_INDEX.RED, 
            WD_COLOR_INDEX.GREEN, WD_COLOR_INDEX.BLUE, WD_COLOR_INDEX.VIOLET, 
            WD_COLOR_INDEX.DARK_RED]

# Earlier theme-colour variant of colorMap, kept for reference only.
# colorMap = [MSO_THEME_COLOR.ACCENT_6, MSO_THEME_COLOR.ACCENT_4, MSO_THEME_COLOR.ACCENT_5, 
#             MSO_THEME_COLOR.ACCENT_6, MSO_THEME_COLOR.ACCENT_3, MSO_THEME_COLOR.ACCENT_2, 
#             MSO_THEME_COLOR.ACCENT_1]

# Font colours, one per list position 0..6. highlight_run indexes this list
# with a word's level, so a level outside 0..6 would raise IndexError there.
colorMap = [RGBColor(0xff, 0x00, 0x00), RGBColor(0xff, 0xcc, 0x99), RGBColor(0xf0, 0x00, 0xf0), 
            RGBColor(0x00, 0x33, 0xff), RGBColor(0xff, 0xa5, 0x00), RGBColor(0xff, 0x00, 0x00),
            RGBColor(0x40, 0xf0, 0x40)]

def splitTxt(arrTxt, key):
    """Cut every text piece of ``arrTxt`` around the occurrences of ``key``.

    Pieces that do not contain ``key`` are copied unchanged. For the others:

    * When ``key`` occurs once, with text in front of it, and a letter touches
      it on either side, the occurrence is part of a longer word and the
      piece is kept whole.
    * Otherwise the piece is cut at every occurrence. An occurrence directly
      followed by a letter stays glued to the text after it (``key + rest``);
      any other occurrence becomes a piece of its own, followed by the text
      after it. That text, like the leading piece, may be an empty string.

    Parameters:
        arrTxt: list of text pieces.
        key: non-empty string to isolate.

    Returns a new list of pieces; joining them gives back the joined input.
    """
    newArrTxt = []
    for sent in arrTxt:
        if key not in sent:
            newArrTxt.append(sent)
            continue

        arrSplit = sent.split(key)

        # The case: kidnap maps in [kidnap]ping and [kidnap]pers.
        # Slices ([-1:] and [:1]) are used instead of indexes so that an empty
        # neighbour simply counts as "no letter". A key at the very end of a
        # piece is therefore still isolated instead of being skipped.
        if arrSplit[0] != '' and len(arrSplit) == 2:
            if isEnglish(arrSplit[0][-1:]) or isEnglish(arrSplit[1][:1]):
                newArrTxt.append(sent)
                continue

        newArrTxt.append(arrSplit[0])
        for rest in arrSplit[1:]:
            # rest[:1] is '' when the key ends the piece; indexing with [0]
            # would raise IndexError in that case.
            if isEnglish(rest[:1]):
                newArrTxt.append(key + rest)
                continue
            newArrTxt.append(key)
            newArrTxt.append(rest)
    return newArrTxt
    
def highlight_run(plainTxt, wordList, outputFile):
    """Write the text to a new .docx with the listed words coloured.

    Every entry of ``plainTxt`` becomes one paragraph. Pieces of text that are
    equal to an entry of ``wordList`` are written in the colour that
    ``colorMap`` holds for their level, with a yellow highlight when the level
    is above 4. A four-column table (index, vocab, level, Chinese) listing the
    lemmas found, sorted by descending level, is added after the text.

    Parameters:
        plainTxt: list of paragraph strings.
        wordList: words to colour.
        outputFile: destination passed to Document.save().

    Side effects: prints the highlight counter and saves the document.
    """
    # Cut each paragraph into pieces so every listed word stands alone.
    segmentedParas = []
    for paraTxt in plainTxt:
        pieces = [paraTxt]
        for hword in wordList:
            pieces = splitTxt(pieces, hword)
        segmentedParas.append(pieces)

    document = Document()

    # lemma -> level, in order of first appearance in the text
    lemmaLevels = {}
    # NOTE(review): the counter starts at 1, so the printed value is one more
    # than the number of coloured runs -- confirm this is intended.
    highlightCnt = 1
    for pieces in segmentedParas:
        docPara = document.add_paragraph("")

        for piece in pieces:
            if piece not in wordList:
                docPara.add_run(piece)
                continue

            # get the level of the word
            worddic = lemmatizer(piece.lower(), dicDf)
            level = worddic['level']
            lemmaLevels.setdefault(worddic['lemma'], level)

            highlightCnt += 1

            run = docPara.add_run(piece)
            run.font.color.rgb = colorMap[level]
            if level > 4:
                run.font.highlight_color = WD_COLOR_INDEX.YELLOW

    # add table ------------------
    table = document.add_table(1, 4)

    # populate header row --------
    for cell, title in zip(table.rows[0].cells, ('', 'vocab', 'level', 'Chinese')):
        cell.text = title

    # sort table items: highest level first
    ranked = sorted(lemmaLevels.items(), key=lambda kv: kv[1], reverse=True)

    # add a data row for each item
    for idx, (vocab, level) in enumerate(ranked, 1):
        cells = table.add_row().cells
        cells[0].text = str(idx)

        vocabRun = cells[1].paragraphs[0].add_run(vocab)
        vocabRun.font.color.rgb = colorMap[level]
        if level > 4:
            vocabRun.font.highlight_color = WD_COLOR_INDEX.YELLOW

        levelRun = cells[2].paragraphs[0].add_run(str(level))
        levelRun.font.color.rgb = colorMap[level]
        # The Chinese column is left empty.
        cells[3].text = ''
    table.style = 'Table Grid'

    print(highlightCnt)
    document.save(outputFile)

In [19]:
def parseDir(readDir, writeDir):
    """Process every .docx file below ``readDir`` and write results under ``writeDir``.

    For each source folder the output goes to ``writeDir/<folder name>``, where
    the folder name is only the last path component of the source folder. Each
    document is read with readTxt(), filtered with processDoc() and written
    with highlight_run() under its original file name.

    Files are skipped when their name does not end in '.docx' or starts with
    '.' or '~' (temporary / lock files).

    Parameters:
        readDir: root directory that is walked recursively.
        writeDir: directory that receives the output folders.
    """
    for root, _dirs, files in os.walk(readDir):
        # basename() replaces the manual rfind('/') slicing, so the folder
        # name is also found when the path uses another separator.
        createDir = os.path.join(writeDir, os.path.basename(root))

        for file in sorted(files):
            # '.filename.docx': temporary file in Mac
            # '~': The file could be temporary opened.
            # endswith() instead of a substring test, so names such as
            # 'report.docx.bak' are no longer handed to the docx reader.
            if not file.endswith('.docx') or file.startswith(('.', '~')):
                continue

            # create new folder if not exist; exist_ok avoids the gap between
            # a separate existence check and the creation.
            os.makedirs(createDir, exist_ok=True)

            readFile = os.path.join(root, file)
            print(readFile)
            writeFile = os.path.join(createDir, file)

            # #VINCENT#, 20190917, To highlight the high level words
            wordCnt, plainTxt, wordList = readTxt(readFile)
            processedList = processDoc(wordList)
            highlight_run(plainTxt, processedList, writeFile)

In [20]:
# Run the pipeline on the small test folder that lives next to the data.
testReadDir = '/'.join([basePath, 'test'])
testWriteDir = '/'.join([basePath, 'testOutput'])
parseDir(testReadDir, testWriteDir)

# parseDir(from_dataPath, to_dataPath)

/Volumes/backup_128G/z_repository/Yang_data/學測指考英文/test/aaa/A-099-Q16to20-M清.docx
88%  includes un-alpha characters.
35
