# PIQ2: 分割数据集

使用sklearn的[ShuffleSplit](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ShuffleSplit.html#sklearn.model_selection.ShuffleSplit):

- `n_splits` : 重新洗牌并分割的次数（每次都会独立生成一组训练/测试划分）
- `test_size`: 测试集大小。取 0~1.0 之间的小数时，表示测试集占整个数据集的比例；取整数时，表示测试集样本的绝对数量

In [1]:
from sklearn.model_selection import ShuffleSplit, train_test_split
import pandas as pd
import numpy as np

In [2]:
def dataset_split(ds, n=1, test_size=0.1, random_state=1):
    """Generate shuffled train/test index splits for ``ds``.

    Args:
        ds: The data to split; it is passed straight to ``ShuffleSplit.split``.
        n: Number of shuffle-and-split iterations (``n_splits``).
        test_size: Forwarded to ``ShuffleSplit``. A float between 0 and 1 is
            the fraction of samples placed in the test set; an int is the
            absolute number of test samples.
        random_state: Seed forwarded to ``ShuffleSplit``. The default of 1
            reproduces the split this function produced when the seed was
            hard-coded.

    Returns:
        If exactly one split was produced, a two-item list
        ``[train_indices, test_indices]``. Otherwise a list of such pairs,
        one per split. The indices are the positional indices yielded by
        ``ShuffleSplit.split``, converted to plain lists.
    """
    splitter = ShuffleSplit(n_splits=n, test_size=test_size, random_state=random_state)
    train_test = [
        [list(train_index), list(test_index)]
        for train_index, test_index in splitter.split(ds)
    ]
    # Unwrap the single-split case so callers can unpack it directly:
    # ``train_index, test_index = dataset_split(ds)``.
    if len(train_test) == 1:
        return train_test[0]
    return train_test

In [3]:
# Load the full dataset from CSV; the train/test splits below are drawn from it.
dataset = pd.read_csv("data/piq_imfp.csv")

In [4]:
# Hold out 1000 rows for testing (an integer test_size is an absolute count).
train_index, test_index = dataset_split(dataset, test_size=1000)
print(len(train_index), len(test_index))

# ShuffleSplit yields positional row indices, so select with .iloc rather than
# the label-based .loc; the two only coincide while `dataset` has a default
# RangeIndex. .assign builds a new frame with the original row label stored in
# an 'index' column instead of assigning into a freshly selected subset.
train_dataset = dataset.iloc[train_index].assign(index=lambda frame: frame.index)
test_dataset = dataset.iloc[test_index].assign(index=lambda frame: frame.index)

# Write each split with the original row label as the leading 'index' column.
train_dataset.set_index('index').to_csv('data/train_dataset.csv')
test_dataset.set_index('index').to_csv('data/test_dataset.csv')

112891 1000


In [5]:
# Reload the saved test split from disk to sanity-check the export.
# No index_col is passed, so the 'index' column written to the CSV is read
# back as an ordinary column and the frame gets a fresh default row index.
test_data = pd.read_csv('data/test_dataset.csv')

# Preview the first rows.
test_data.head()

Unnamed: 0,index,path,fp_long
0,14093,images/train/卧室_北欧极简/505286_2047519.jpg,326ca4f60d66da4a9bb79903e8fc750ec9ffb1812c65da...
1,57873,images/train/客厅_北欧极简/1091110_5688185.jpg,2e44e8cf5576fec855a67502497dbf28d8d497fbacb27d...
2,59511,images/train/客厅_北欧极简/1767017_9640918.jpg,e9b0bff86ced6957eb59745cbf9fd03b8da37e1c937ef2...
3,97351,images/train/餐厅_中式现代/631144_2825936.jpg,7d7af8e3e71df1d06bfdb44f9e19dfbeaec05f5dd77be0...
4,68230,images/train/客厅_欧式豪华/516645_2118719.jpg,69743d7d703e7d37794eee08ff3701b995255d2ca5e0f1...


In [6]:
# Reload the saved training split from disk to sanity-check the export.
# No index_col is passed, so the 'index' column written to the CSV is read
# back as an ordinary column and the frame gets a fresh default row index.
train_data = pd.read_csv('data/train_dataset.csv')

# Preview the first rows.
train_data.head()

Unnamed: 0,index,path,fp_long
0,47103,images/train/厨房_北欧极简/936548_4716642.jpg,1b02eff1269ee910ed4957d69d89b268622d3f1d81bc25...
1,54278,images/train/客厅_中式传统/610905_2702270.jpg,70dd7b44924a5527708cae283337047122b44da0680ba1...
2,32653,images/train/卧室_美式田园/814946_3957933.jpg,b7cf879d9437cef857ead2466160bde559cc3eefedf75f...
3,71726,images/train/客厅_现代简约/1330683_7057273.jpg,89707757669f70346de8766cbe1084b5a6344628a1bc30...
4,10786,images/train/卧室_中式现代/1127432_5908911.jpg,39df63dc1e9147ffc54cdce633acb0e46eba9649cb2b26...


In [7]:
# Row counts of the reloaded splits (test, train); these should match the
# sizes printed when the split was created.
len(test_data), len(train_data)

(1000, 112891)