# 讀取字典

In [1]:
import pandas as pd
import os

In [2]:
filepath = '/Volumes/backup_128G/z_repository/Yang_data'
read_experiment_path = '{0}/experiment-20190511T105119Z-001'.format(filepath)
read_comparison_path = '{0}/對照組_前測_逐字稿-20190511T105456Z-001'.format(filepath)

file_dic = '7000Words_20190512_v2.xlsx'
to_exp_doc = 'experiment_20190512_v1.xlsx'
to_com_doc = 'comparison_20190512_v1.xlsx'
to_level_doc = 'level_20190514_v1.xlsx'

write_exp_doc = '{0}/{1}'.format(filepath, to_exp_doc)
write_com_doc = '{0}/{1}'.format(filepath, to_com_doc)
write_level_doc = '{0}/{1}'.format(filepath, to_level_doc)
read_dic = '{0}/{1}'.format(filepath, file_dic)

In [3]:
dicDf = pd.DataFrame()

with pd.ExcelFile(read_dic) as reader:
    # read sheet by sheet
    for sheet in reader.sheet_names:
#         print(sheet)
        sheetDf = pd.read_excel(reader, sheet, header=None)
        sheetDf = sheetDf.fillna(0)

        dicDf = dicDf.append(sheetDf, ignore_index=True)

# change to lowercase
dicDf[0] = dicDf[0].str.lower()
len(dicDf.index)

6813

In [4]:
dicDf.head()

Unnamed: 0,0,1,2
0,a,art.,1
1,an,art.,1
2,abandon,v.,4
3,abbreviate,v.,6
4,abbreviation,n.,6


# 將英文字 lemmatize（詞形還原）

In [5]:
# 詞性還原 
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
wtlem = WordNetLemmatizer()

def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None
    
def findWord(word, dicDataF):
    found = dicDataF.loc[dicDataF[0] == word]
    if found.empty == False:
        return found.iloc[0][2]
    return None
    
def returnWord(word, level):
    return {'lemma':word, 'level':level}

def lemmatizer(word, dicDataF):
    lowerWord = word.lower()
    
    found = findWord(lowerWord, dicDataF)
    if found != None:
        return returnWord(word, found)
    
    lemmaWord = wtlem.lemmatize(lowerWord, wordnet.ADJ)
    if lemmaWord != lowerWord:
        found = findWord(lemmaWord, dicDataF)
        if found != None:
            return returnWord(lemmaWord, found)

    lemmaWord = wtlem.lemmatize(lowerWord, wordnet.VERB)
    if lemmaWord != lowerWord:
        found = findWord(lemmaWord, dicDataF)
        if found != None:
            return returnWord(lemmaWord, found)
        
    lemmaWord = wtlem.lemmatize(lowerWord, wordnet.NOUN)
    if lemmaWord != lowerWord:
        found = findWord(lemmaWord, dicDataF)
        if found != None:
            return returnWord(lemmaWord, found)

    lemmaWord = wtlem.lemmatize(lowerWord, wordnet.ADV)
    if lemmaWord != lowerWord:
        found = findWord(lemmaWord, dicDataF)
        if found != None:
            return returnWord(lemmaWord, found)

    return returnWord(word, 9)

# 讀檔；去掉非英文字，回傳 word 字串，包含引用次數

In [6]:
# pip install python-docx 
import docx

def isAlpha(word):
    try:
        return word.encode('ascii').isalpha()
    except UnicodeEncodeError:
        return False
    
def wordCount(wordList):
    wCount = {}
    for word in wordList:
        if word not in wCount:
            wCount[word] = 1
        else:
            wCount[word] += 1
    return wCount
    
def split2Words(txt):
    wordList = []
    cleanTxt = txt.replace(',', '').replace('.', '')
    splitWords = cleanTxt.split(' ')
    for word in splitWords:
        if '’' in word or '-' in word:
            wordList.append(word)
            continue

        if isAlpha(word) == True:
            wordList.append(word)
            
    return wordCount(wordList)
    
def replaceMultiple(mainString, toBeReplaces, newString):
#     outString = copy.copy(mainString)
    # Iterate over the strings to be replaced
    for elem in toBeReplaces :
        # Check if string is in the main string
        if elem in mainString :
            # Replace the string
            mainString = mainString.replace(elem, newString)
    
    return mainString

def readtxt(filename):
    fullText = []
    doc = docx.Document(filename)
    tables = doc.tables
    for table in tables:
        for row in table.rows:
            for cell in row.cells:
                for paragraph in cell.paragraphs:
                    processedPara = replaceMultiple(paragraph.text, ['…', '’', '‘', '-'] , ' ')
#                     print(processedPara)
                    fullText.append(processedPara)
#                     print(paragraph.text)

    fullText = ' '.join(fullText)
    return split2Words(fullText)

# 讀檔列表

In [7]:
def processDoc(wordList):
    resWordList = {}
    for word in wordList:
        lowerWord = word.lower()
        worddic = lemmatizer(lowerWord, dicDf)
        if worddic['lemma'] not in resWordList:
            resWordList[worddic['lemma']] = {'words':word, 'count':wordList[word], 'level':worddic['level']}
        else:
            resWordList[worddic['lemma']]['words'] += ';{0}'.format(word)
            resWordList[worddic['lemma']]['count'] += wordList[word]
            if resWordList[worddic['lemma']]['level'] != worddic['level']:
                print("ERROR!!", word, worddic)

    #     print(word, lemmatizer(word, dicDf))
    return resWordList

# Experiment

In [8]:
filename = "{0}/EXP05-post.docx".format(read_experiment_path)
print(filename)
wordList = readtxt(filename)
processedList = processDoc(wordList)

# output
outDf = pd.DataFrame(processedList).T
outDf.sort_values(by=['level', 'words'], inplace=True)

# list level words
levelDf = pd.DataFrame(columns=list('123456'))
numWords = {}
# levelWords = {}
for level in range(1, 7):
    words = outDf.loc[outDf['level'] == level].index
    numWords[level] = len(words)
#     levelWords[level] = ';'.join(words)
    
levelDf = pd.DataFrame(numWords, index=['EXP05-post-num'])
# levelDf = pd.DataFrame(levelWords, index=['EXP05-post'])
levelDf

/Volumes/backup_128G/z_repository/Yang_data/experiment-20190511T105119Z-001/EXP05-post.docx


Unnamed: 0,1,2,3,4,5,6
EXP05-post-num,140,26,5,1,0,1


In [9]:
levelColumns = []
levelNums = []
levelWords = []

with pd.ExcelWriter(write_exp_doc) as writer:
    for r, d, f in os.walk(read_experiment_path):
        for file in f:
            if '.docx' in file and file[0] != '.':
                sheetName = file.replace('.docx', '')

                filename = os.path.join(r, file)
                print(filename)
                wordList = readtxt(filename)
                processedList = processDoc(wordList)

                # output
                outDf = pd.DataFrame(processedList).T
                outDf.sort_values(by=['level', 'words'], inplace=True)
    #             outDf.head()
    
                # list level words
                levelColumns.append(sheetName)
        
                numWords = {}
                wordWords = {}
                for level in range(1, 7):
                    words = outDf.loc[outDf['level'] == level].index
                    numWords[level] = len(words)
                    wordWords[level] = ';'.join(words)
                
                levelNums.append(numWords)
                levelWords.append(wordWords)

                # write file
                outDf.to_excel(writer, sheetName)
                writer.save()

expLevelNumDf = pd.DataFrame(levelNums, index=levelColumns)
expLevelWordsDf = pd.DataFrame(levelWords, index=levelColumns)


/Volumes/backup_128G/z_repository/Yang_data/experiment-20190511T105119Z-001/EXP01-post.docx
/Volumes/backup_128G/z_repository/Yang_data/experiment-20190511T105119Z-001/EXP02-post.docx
/Volumes/backup_128G/z_repository/Yang_data/experiment-20190511T105119Z-001/EXP03-post.docx
/Volumes/backup_128G/z_repository/Yang_data/experiment-20190511T105119Z-001/EXP04-post.docx
/Volumes/backup_128G/z_repository/Yang_data/experiment-20190511T105119Z-001/EXP05-post.docx
/Volumes/backup_128G/z_repository/Yang_data/experiment-20190511T105119Z-001/EXP06-post.docx
/Volumes/backup_128G/z_repository/Yang_data/experiment-20190511T105119Z-001/EXP07-post.docx
/Volumes/backup_128G/z_repository/Yang_data/experiment-20190511T105119Z-001/EXP08-post.docx
/Volumes/backup_128G/z_repository/Yang_data/experiment-20190511T105119Z-001/EXP09-post.docx
/Volumes/backup_128G/z_repository/Yang_data/experiment-20190511T105119Z-001/EXP10-post.docx
/Volumes/backup_128G/z_repository/Yang_data/experiment-20190511T105119Z-001/EXP1

# Comparison

In [10]:
levelColumns = []
levelNums = []
levelWords = []

with pd.ExcelWriter(write_com_doc) as writer:
    for r, d, f in os.walk(read_comparison_path):
        for file in f:
            if '.docx' in file and file[0] != '.':
                sheetName = file.replace('.docx', '')

                filename = os.path.join(r, file)
                print(filename)
                wordList = readtxt(filename)
                processedList = processDoc(wordList)

                # output
                outDf = pd.DataFrame(processedList).T
                outDf.sort_values(by=['level', 'words'], inplace=True)
    #             outDf.head()
    
                # list level words
                levelColumns.append(sheetName)
            
                numWords = {}
                wordWords = {}
                for level in range(1, 7):
                    words = outDf.loc[outDf['level'] == level].index
                    numWords[level] = len(words)
                    wordWords[level] = ';'.join(words)
                
                levelNums.append(numWords)
                levelWords.append(wordWords)
                
                # write file
                outDf.to_excel(writer, sheetName)
                writer.save()
                
compLevelNumDf = pd.DataFrame(levelNums, index=levelColumns)
comLevelWordsDf = pd.DataFrame(levelWords, index=levelColumns)

/Volumes/backup_128G/z_repository/Yang_data/對照組_前測_逐字稿-20190511T105456Z-001/01余貞儀_陽翟(前).docx
/Volumes/backup_128G/z_repository/Yang_data/對照組_前測_逐字稿-20190511T105456Z-001/02李晟焱_陳景蘭(前).docx
/Volumes/backup_128G/z_repository/Yang_data/對照組_前測_逐字稿-20190511T105456Z-001/03汪鈺翔_慈湖(前).docx
/Volumes/backup_128G/z_repository/Yang_data/對照組_前測_逐字稿-20190511T105456Z-001/04施丞壕_小金門(前).docx
/Volumes/backup_128G/z_repository/Yang_data/對照組_前測_逐字稿-20190511T105456Z-001/05洪小童_慈湖(前).docx
/Volumes/backup_128G/z_repository/Yang_data/對照組_前測_逐字稿-20190511T105456Z-001/06張韻廷_山后(前).docx
/Volumes/backup_128G/z_repository/Yang_data/對照組_前測_逐字稿-20190511T105456Z-001/07曹祐瑞_山后(前).docx
/Volumes/backup_128G/z_repository/Yang_data/對照組_前測_逐字稿-20190511T105456Z-001/09陳怡君_陳景蘭(前).docx
/Volumes/backup_128G/z_repository/Yang_data/對照組_前測_逐字稿-20190511T105456Z-001/11陳羚婷_馬山(前).docx
/Volumes/backup_128G/z_repository/Yang_data/對照組_前測_逐字稿-20190511T105456Z-001/12陳琬鈞_馬山(前).docx
/Volumes/backup_128G/z_repository/Yang_data/對照組_前測_逐字稿-20190511T105

# Write level words/nums

In [11]:
# expLevelNumDf
# expLevelWordsDf
# compLevelNumDf
# comLevelWordsDf

In [12]:
with pd.ExcelWriter(write_level_doc) as writer:
    # write file
    expLevelNumDf.to_excel(writer, "EXP Num")
    expLevelWordsDf.to_excel(writer, "EXP Words")
    compLevelNumDf.to_excel(writer, "COMP Num")
    comLevelWordsDf.to_excel(writer, "COMP Words")
    
    writer.save()