In [71]:
import pdfplumber
import os
import re
import pandas as pd
import sys
import datetime
import calendar
from tqdm import tqdm
import win32com
import win32com.client
import docx
import numpy as np

In [56]:
# Folder (relative to the current working directory) that holds the note files.
# Built with os.path.join so the separators match the host OS instead of being
# hard-coded backslashes; the final '' component keeps a trailing separator
# because paths are later formed as `dirPath + file`.
dirPath = os.path.join(os.getcwd(), 'announcements', 'wens', 'notes', '')
fileList = os.listdir(dirPath)

In [15]:
def doc2docx(path):
    """Convert a Word ``.doc`` file to ``.docx`` through Word COM automation.

    A hidden Word instance (alerts disabled) opens ``path``, saves it next to
    the original with a ``.docx`` extension, and the original file is then
    deleted.

    Args:
        path: Path of the document to convert.
            NOTE(review): Word presumably needs an absolute path here -- confirm.

    Returns:
        The path of the newly written ``.docx`` file.

    Raises:
        Whatever the COM calls or ``os.remove`` raise. The original file is
        only removed when the conversion succeeded.
    """
    newpath = os.path.splitext(path)[0] + '.docx'
    w = win32com.client.Dispatch('Word.Application')
    try:
        w.Visible = 0
        w.DisplayAlerts = 0
        doc = w.Documents.Open(path)
        try:
            # 12 is wdFormatXMLDocument (.docx) in Word's WdSaveFormat enum;
            # the remaining positional arguments are passed through as before.
            doc.SaveAs(newpath, 12, False, "", True, "", False, False, False, False)
        finally:
            doc.Close()
    finally:
        # Always shut Word down, otherwise a failed conversion leaves an
        # invisible Word process running in the background.
        w.Quit()
    os.remove(path)
    return newpath

In [18]:
# Convert every legacy .doc file in the notes folder to .docx.
# NOTE: doc2docx deletes the source .doc once the conversion has succeeded.
for file in tqdm(fileList):
    # Compare the real extension case-insensitively so '.Doc' is converted too
    # and names that merely end in the letters "doc" are left alone.
    if os.path.splitext(file)[1].lower() == '.doc':
        doc2docx(dirPath+file)
# Re-list the folder so fileList reflects the converted file names.
fileList = os.listdir(dirPath)

100%|████████████████████████████████████████████████████████████████████████████████| 198/198 [02:45<00:00,  1.20it/s]


In [20]:
def to_dateTime(dateText):
    """Parse a date written as ``<year>年<month>月<day>日`` into a ``datetime``.

    The first such date found in ``dateText`` is used; text before or after it
    is ignored, and whitespace around the numbers is tolerated.

    Args:
        dateText: String containing the date.

    Returns:
        ``datetime.datetime`` for that day (time components left at zero).

    Raises:
        ValueError: If no date of that form is present, or the numbers do not
            form a valid calendar date.
    """
    # Digit-only groups: the year group must not swallow any prefix text,
    # otherwise int() fails on inputs like '时间：2020年1月2日'.
    found = re.search(r'(\d+)\s*年\s*(\d+)\s*月\s*(\d+)\s*日', dateText)
    if found is None:
        raise ValueError('No date of the form YYYY年M月D日 found in %r' % (dateText,))
    year, month, day = (int(part) for part in found.groups())
    return datetime.datetime(year, month, day)

In [21]:
def get_noteData(fullText, noteDate):
    """Split note text into ``[noteDate, question, answer]`` rows.

    A question is a whole line that ends with a full-width question mark
    ('？') and is followed by a newline. Its answer is all the text up to the
    next question line, or to the end of the text.

    Args:
        fullText: The note body.
        noteDate: Value stored as the first item of every row.

    Returns:
        A list of ``[noteDate, question, answer]`` lists, in document order.
        Empty when no question line is found.
    """
    # A leading newline lets a question on the very first line match the
    # '\n...？\n' pattern as well.
    parts = re.split('\n(.*?？)\n', '\n' + fullText)
    # With one capture group re.split alternates: text, question, text, ...
    # Pairing by position (instead of re-testing each piece) keeps a one-line
    # answer that itself ends in '？' from being mistaken for a question.
    questions = parts[1::2]
    answers = parts[2::2]
    return [[noteDate, question, answer] for question, answer in zip(questions, answers)]

In [22]:
def read_note_docx(filePath, threshold=100):
    """Read the question/answer rows from a ``.docx`` note.

    Every cell of every table in the document is scanned. A cell whose text is
    longer than ``threshold`` characters is taken as the note body, and a cell
    whose text is exactly a ``<digits>年<digits>月<digits>日`` date is taken as
    the note date. If several cells qualify, the last one scanned wins.

    Args:
        filePath: Path of the ``.docx`` file.
        threshold: Minimum length (exclusive) for a cell to count as the body.

    Returns:
        The rows produced by ``get_noteData`` for the body and date found.

    Raises:
        ValueError: If no body cell or no date cell is found.
    """
    doc = docx.Document(filePath)

    noteText = None
    noteDate = None
    for table in doc.tables:
        for i in range(len(table.rows)):
            for j in range(len(table.columns)):
                cellText = table.cell(i, j).text
                if len(cellText) > threshold:
                    noteText = cellText
                date = re.match(r'\d+年\d+月\d+日$', cellText)
                if date:
                    noteDate = to_dateTime(date.group())

    # Fail with a clear message instead of an UnboundLocalError.
    if noteText is None:
        raise ValueError('No table cell longer than %d characters in %s' % (threshold, filePath))
    if noteDate is None:
        raise ValueError('No date cell found in %s' % (filePath,))
    return get_noteData(noteText, noteDate)

In [45]:
def read_note_pdf(filePath):
    """Read the question/answer rows from a PDF note.

    The text of all pages is concatenated. The note date is the first
    ``<digits>年<digits>月<digits>日`` found in ``filePath`` itself, and the note
    body is the text after the line containing '姓名' up to '附件清单' (the body
    must contain at least one '？').

    Args:
        filePath: Path of the PDF; it must contain the date.

    Returns:
        The rows produced by ``get_noteData`` for the body and date found.

    Raises:
        ValueError: If the path holds no date or the body markers are missing.
    """
    with pdfplumber.open(filePath) as pdf:
        # Some pdfplumber versions return None for a page with no extractable
        # text; treat that as an empty page instead of failing on str + None.
        # NOTE(review): pages are joined without a separator, so a question on
        # the first line of a page is not preceded by a newline -- confirm
        # whether a '\n' between pages is wanted.
        fullText = ''.join(page.extract_text() or '' for page in pdf.pages)

    dates = re.findall(r'\d+年\d+月\d+日', filePath)
    if not dates:
        raise ValueError('No date found in file path %s' % (filePath,))
    noteDate = to_dateTime(dates[0])

    bodies = re.findall(r'姓名.*?\n(.*?？.*?)附件清单', fullText, re.S)
    if not bodies:
        raise ValueError('Note body not found in %s' % (filePath,))

    return get_noteData(bodies[0], noteDate)

In [24]:
def read_note_pdf_p5w(filePath):
    """Read question/answer rows from a PDF whose content is laid out in tables.

    The first row of every extracted table is skipped, and the remaining rows
    of all pages are processed in order. Columns 1 and 2 of each row become
    the question and the answer. A row whose first cell is the empty string is
    treated as a continuation: its question and answer text is appended to the
    record before it.

    Args:
        filePath: Path of the PDF; it must contain a
            ``<digits>年<digits>月<digits>日`` date, which becomes the note date.

    Returns:
        A list of ``[noteDate, question, answer]`` lists.

    Raises:
        ValueError: If the path holds no date.
    """
    dates = re.findall(r'\d+年\d+月\d+日', filePath)
    if not dates:
        raise ValueError('No date found in file path %s' % (filePath,))
    noteDate = to_dateTime(dates[0])

    noteDataRaw = []
    with pdfplumber.open(filePath) as pdf:
        for page in pdf.pages:
            for table in page.extract_tables():
                # Row 0 of each table is dropped (presumably a header row --
                # TODO confirm this also holds for tables continued on a new page).
                noteDataRaw.extend(table[1:])

    noteData = []
    for row in noteDataRaw:
        if row[0] != '' or not noteData:
            # New record. A continuation row with nothing before it is kept as
            # its own record rather than failing on an empty list.
            noteData.append([noteDate, row[1], row[2]])
        else:
            # Append to the record accumulated so far, so that a record
            # spanning three or more rows keeps all of its text. Missing
            # (None) cells count as empty text.
            previous = noteData[-1]
            previous[1] = (previous[1] or '') + (row[1] or '')
            previous[2] = (previous[2] or '') + (row[2] or '')

    return noteData

In [61]:
def read_note(filePath):
    """Dispatch a note file to the reader that matches its type.

    * ``.doc`` / ``.docx``  -> ``read_note_docx``
    * ``.pdf`` with '路演' in the path -> ``read_note_pdf_p5w``
    * any other ``.pdf`` -> ``read_note_pdf``

    The extension is compared case-insensitively.

    Args:
        filePath: Path of the note file.

    Returns:
        The rows returned by the selected reader. For an unsupported file type
        a message is printed and an empty list is returned, so callers that
        concatenate the result add nothing.
    """
    lowered = filePath.lower()
    # Test the actual suffix: a substring test on the tail of the path would
    # send a name such as 'mydocs.pdf' to the docx reader.
    if lowered.endswith(('.doc', '.docx')):
        return read_note_docx(filePath)
    if lowered.endswith('.pdf'):
        if '路演' in filePath:
            return read_note_pdf_p5w(filePath)
        return read_note_pdf(filePath)
    print('File type error: Should be either docx or pdf.')
    return []

In [57]:
# Parse every file in the notes folder and collect the rows in noteData.
noteData = []
for file in tqdm(fileList):
    filePath = dirPath + file
    try:
        # extend() grows the list in place instead of copying it on every file.
        noteData.extend(read_note(filePath))
    except Exception as err:
        # Best effort: skip the file, but report why it failed. Catching
        # Exception (not a bare except) lets Ctrl-C / kernel interrupt through.
        print(filePath, repr(err))

 68%|██████████████████████████████████████████████████████                          | 133/197 [00:03<00:01, 37.42it/s]

File type error: Should be either docx or pdf.


100%|████████████████████████████████████████████████████████████████████████████████| 197/197 [01:12<00:00,  2.71it/s]


In [73]:
tag = ['date', 'question', 'answer']

# Build cleaned rows without mutating noteData, so this cell can be re-run.
noteRows = []
for note in noteData:
    # Keep only 3-item rows. Rebinding the loop variable would not change
    # noteData, so malformed entries (e.g. stray '' strings) are skipped here.
    if isinstance(note, str) or len(note) != 3:
        continue
    noteRows.append([
        note[0],
        # Strip line breaks from the text; non-string cells (e.g. None) are
        # left as they are and removed by dropna() below.
        note[1].replace('\n', '') if isinstance(note[1], str) else note[1],
        note[2].replace('\n', '') if isinstance(note[2], str) else note[2],
    ])

# Index by date, sort chronologically, then drop rows with missing text
# (cells holding the literal string 'None' count as missing).
noteDf = (
    pd.DataFrame(noteRows, columns=tag)
    .set_index('date')
    .sort_index()
    .replace(to_replace='None', value=np.nan)
    .dropna()
)

In [75]:
# Write the table to a CSV file in the current working directory.
# 'utf_8_sig' is UTF-8 with a leading byte-order mark.
outputPath = 'wens_notes.csv'
noteDf.to_csv(
    outputPath,
    encoding='utf_8_sig',
    sep=',',
    header=True,
    index=True,
)