In [1]:
import pandas as pd
from xlrd import XLRDError
from tqdm import tqdm
from w3lib.html import remove_tags
import xlrd
import re
import os

zhPattern = re.compile(u'[\u4e00-\u9fa5]+')
casePattern = re.compile(u'[A-Z]')

# 读取文件的规则，如果xlrd能打开就优先使用xlrd读取，如果xlrd读取失败，则说明是文本文件，优先用csv读取
def open_file(filename):
    try:
        data = xlrd.open_workbook(filename)
        return data
    except XLRDError:
        return html_csv_reader(filename)
    
def clean_str(text):
    reg = "[^A-Za-z\u4e00-\u9fa5]"
    return re.sub(reg, '', text)
            
def html_csv_reader(filename):
    try:
        with open(filename, encoding="utf-8") as fin:
            html = ""
            for i in range(5):
                html += fin.readline().strip()
            html = remove_tags(html)
            return clean_str(html)
    except UnicodeError:
        print("文件无法打开："+filename)
    
def data_handler(data, truncate_length=500):
    if isinstance(data, xlrd.book.Book):
        return xls_data_reader(data)[:truncate_length]
    elif isinstance(data, pd.DataFrame):
        ans = (data.iloc[:5]).values.tolist()
        res = []
        for x in ans:
            res.append(" ".join([str(_) for _ in x]))
        return " ".join(res)
    else:
        return data[:truncate_length]
        
def xls_data_reader(data):
    # 首先判断要读取哪个表格，优先级：中文名>默认名(Sheet1)>第一个索引
    sheet_name = ""
    sheet_index = 0
    for name in data.sheet_names():
        if zhPattern.search(name):
            sheet_name = name
    if sheet_name == "" and "Sheet1" in data.sheet_names():
        sheet_name = "Sheet1"
    if sheet_name != "":
        table = data.sheet_by_name(sheet_name)
    else:
        table = data.sheet_by_index(sheet_index)
    # 如果有效行数和列数为0，就换一张表，直到表中有数据为止，如果表读完了发现都没有数据，那么就返回空
    while table.nrows == 0:
        try:
            table = data.sheet_by_index(sheet_index+1)
        except IndexError:
            return ''
    # 取前面五行数据
    res = []
    for i in range(min(table.nrows, 5)):
        content = " ".join([str(_) for _ in table.row_values(i)])
        res.append(content)

    return clean_str(" ".join(res))

def get_file_content(path, filename):
    return data_handler(open_file(os.path.join(path, filename)))

In [7]:
class Config:
    def __init__(self):
        self.train_path = "data/train"
        self.test_path = "data/test2"
        self.save_train = "model/train.csv"
        self.save_test = "model/test.csv"
        self.train_filename = os.listdir(self.train_path)
        self.test_filename = os.listdir(self.test_path)
        self.train = []
        self.test = []
        
    def get_data(self, is_train = True):
        if is_train:
            P, F = self.train_path, self.train_filename
        else:
            P, F = self.test_path, self.test_filename
        with tqdm(total = len(F), ncols=80) as pbar:
            for i, filename in enumerate(F):
                pbar.update(1)
                content = get_file_content(P, filename)
                self.train.append([str(i), filename, content])
                
    def save_file(self):
        for S, C in zip([self.save_train, self.save_test], [self.train, self.test]):
            tqdm.write("保存文件到:{}".format(S))
            with open(S, 'w', encoding="utf-8") as fin:
                for content in tqdm(C):
                    try:
                        fin.write("\t".join(content))
                    except TypeError:
                        print(content)
                    fin.write("\n")



In [8]:
config = Config()
config.get_data()
config.get_data(is_train=False)

100%|█████████████████████████████████████| 60000/60000 [12:34<00:00, 79.48it/s]
100%|█████████████████████████████████████| 25459/25459 [04:48<00:00, 88.33it/s]


In [9]:
# 代码写错了，保存到同一个文件里面了，都到train.csv里了
config.save_file()

 23%|█████████████████████████████████▋                                                                                                                 | 19593/85459 [00:00<00:00, 171451.28it/s]

保存文件到:model/train.csv


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 85459/85459 [00:00<00:00, 177072.71it/s]
0it [00:00, ?it/s]

保存文件到:model/test.csv



