In [1]:
"""将原始的340个csv随机分成100份,每一份里面样本类别各异,方便后续模型读取"""
%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import ast
import os
import datetime as dt
from tqdm import tqdm
import pandas as pd
import numpy as np

# Number of output CSV files the training samples are distributed over.
NCSVS = 100
# Root data directory; per-category CSVs are read from its 'train_csv' subfolder
# and the split files are written to its 'shuffled_csv' subfolder.
INPUT_PATH = '../input'

In [2]:
def f2cat(filename: str) -> str:
    """Return the category name encoded in a CSV file name or path.

    The category is the file's base name with its final extension removed.
    Any leading directory components are ignored, and only the last
    extension is stripped, so a name that itself contains a dot is not
    truncated at the first dot.

    Args:
        filename: A file name or path such as ``'cat.csv'``.

    Returns:
        The category name as a string, e.g. ``'cat'``.
    """
    return os.path.splitext(os.path.basename(filename))[0]

class Simplified():
    """Reader for the per-category CSV files stored under ``<input_path>/train_csv``."""

    def __init__(self, input_path=INPUT_PATH):
        # Root data directory that contains the 'train_csv' folder.
        self.input_path = input_path

    def list_all_categories(self):
        """Return the category names sorted case-insensitively.

        The order is fixed so that a category's position in the returned list
        can serve as its integer label. Only ``*.csv`` entries are considered,
        so stray directory entries (notebook checkpoints, OS metadata files)
        cannot shift the label indices.
        """
        files = os.listdir(os.path.join(self.input_path, 'train_csv'))
        csv_files = [f for f in files if f.endswith('.csv')]
        return sorted([f2cat(f) for f in csv_files], key=str.lower)

    @staticmethod
    def _selects_timestamp(usecols):
        """Tell whether a ``usecols`` selection keeps the 'timestamp' column."""
        if usecols is None:
            return True
        if callable(usecols):
            return bool(usecols('timestamp'))
        cols = list(usecols)
        if all(isinstance(c, str) for c in cols):
            return 'timestamp' in cols
        # Positional (integer) selection: column names are unknown here, so
        # keep the original behaviour of always asking for date parsing.
        return True

    def read_training_csv(self, category, nrows=None, usecols=None, drawing_transform=False, drop_unrecognized=False):
        """Read the CSV file of a single category.

        Args:
            category: Category name; ``<category>.csv`` is read from 'train_csv'.
            nrows: Forwarded to ``pd.read_csv`` (maximum number of rows to read).
            usecols: Forwarded to ``pd.read_csv`` (subset of columns to read).
            drawing_transform: If True, convert the string values of the
                'drawing' column to Python objects with ``ast.literal_eval``.
            drop_unrecognized: If True, keep only rows whose 'recognized'
                value is True. Recommended to stay False.

        Returns:
            The loaded ``pandas.DataFrame``.
        """
        read_kwargs = {}
        # Requesting date parsing for a column that usecols filtered out makes
        # pandas raise, so only ask for it when 'timestamp' is actually read.
        if self._selects_timestamp(usecols):
            read_kwargs['parse_dates'] = ['timestamp']
        df = pd.read_csv(os.path.join(self.input_path, 'train_csv', category + '.csv'),
                         nrows=nrows, usecols=usecols, **read_kwargs)
        # Original author's note: experiments showed that dropping the
        # unrecognized samples lowers accuracy, hence the False default.
        if drop_unrecognized:
            print("Drop unrecognized samples!!!")
            df = df[df['recognized'] == True]
        if drawing_transform:
            df['drawing'] = df['drawing'].apply(ast.literal_eval)
        return df

In [3]:
# Start of the wall-clock timer; read again when the total run time is printed.
start = dt.datetime.now()
# Reader rooted at INPUT_PATH.
s = Simplified(INPUT_PATH)
# Sorted category names; a category's position in this list is used as its integer label.
categories = s.list_all_categories()
print(len(categories))

340


In [4]:
# to_csv does not create missing directories, so make sure the output folder exists.
os.makedirs(os.path.join(INPUT_PATH, 'shuffled_csv'), exist_ok=True)

# enumerate() hides the sequence length from tqdm; passing total restores the
# percentage / ETA display instead of a bare iteration counter.
for y, cat in tqdm(enumerate(categories), total=len(categories)):
    df = s.read_training_csv(cat)  # all samples of the current category
    df['y'] = y  # position in the sorted category list, used as the integer label
    # Bucket index in [0, NCSVS) derived from key_id; it decides which output
    # file each sample is written to.
    df['cv'] = (df.key_id // 10 ** 7) % NCSVS
    # key_id is only needed to compute cv; drop it once here rather than once per bucket.
    df = df.drop(['key_id'], axis=1)
    for k in range(NCSVS):
        filename = INPUT_PATH + '/shuffled_csv/train_%d_%d.csv'%(k+1, NCSVS)
        chunk = df[df.cv == k]  # samples of this category that belong to bucket k
        if y == 0:
            # First category: create (or overwrite) the file and write the header row.
            chunk.to_csv(filename, index=False)
        else:
            # Later categories: append rows without repeating the header.
            chunk.to_csv(filename, mode='a', header=False, index=False)

340it [30:14,  5.34s/it]


In [5]:
# Arbitrary fixed seed so the shuffle is reproducible. It is offset by the file
# index below so each file gets its own permutation, and so a partially
# completed run that is resumed yields the same result as a full run.
SHUFFLE_SEED = 42

last_shape = None  # shape of the most recently processed file, if any
for k in tqdm(range(NCSVS)):
    filename = INPUT_PATH + '/shuffled_csv/train_%d_%d.csv'%(k+1, NCSVS)
    # Files that are already compressed (and therefore removed) are skipped,
    # which keeps the cell safe to re-run.
    if os.path.exists(filename):
        df = pd.read_csv(filename)
        # Shuffle the rows: sample(frac=1) returns every row in random order.
        df = df.sample(frac=1, random_state=SHUFFLE_SEED + k)
        df.to_csv(filename + '.gz', compression='gzip', index=False)  # store gzip-compressed
        os.remove(filename)
        last_shape = df.shape

# Only report a shape that was produced by this loop; printing df
# unconditionally would show a stale frame when no file was processed.
if last_shape is not None:
    print(last_shape)
else:
    print('No uncompressed shuffled CSV files found; nothing to do.')

100%|██████████| 100/100 [4:12:05<00:00, 151.25s/it] 

(497740, 7)





In [6]:
end = dt.datetime.now()
# timedelta.seconds is only the seconds component (whole days are dropped), so
# runs of 24 hours or more would be under-reported; total_seconds() covers the
# full duration. Truncated to int to keep the whole-second output format.
print('Latest run {}.\nTotal time {}s'.format(end, int((end - start).total_seconds())))

Latest run 2018-10-21 20:37:03.221214.
Total time 16939s
