In [2]:
import os
import numpy as np
import pandas as pd
from PIL import Image

def findfiles(path, one_level = 0):
    """List the names of the immediate sub-directories of ``path``.

    Only one directory level is inspected (no recursion) and plain files
    are ignored. Entries named ``'CSV'`` or ``'.DS_Store'`` are left out.

    Args:
        path: Directory whose children are listed.
        one_level: Ordering mode.
            ``0`` - sort by the integer in front of the first space of each
            name (e.g. ``'10 kaf'`` sorts after ``'2 ba'``).
            ``1`` - plain lexicographic sort.
            Any other value leaves the names in ``os.listdir`` order.

    Returns:
        A list of directory names (not full paths).

    Raises:
        ValueError: In mode ``0`` when a directory name does not start
            with an integer.
    """
    excluded = ('CSV', '.DS_Store')

    # Keep only real directories; the full path is needed for the isdir check,
    # but the bare name is what gets returned.
    subdirs = [
        entry
        for entry in os.listdir(path)
        if entry not in excluded and os.path.isdir(os.path.join(path, entry))
    ]

    if one_level == 0:
        # Numeric ordering on the leading index of the folder name.
        subdirs.sort(key = lambda name: int(name.split(' ')[0]))
    elif one_level == 1:
        subdirs.sort()
    return subdirs

def getFileList(dir,Filelist, ext=None):
    """List the entries of a single directory, optionally filtered by extension.

    Only ``dir`` itself is listed (sub-directories are not descended into) and
    bare names are returned, not full paths.

    Args:
        dir: Directory to list.
        Filelist: Ignored. Kept only so existing callers that pass a list in
            this position keep working; it is neither read nor modified.
        ext: Extension to keep, with or without the leading dot
            (``'png'`` or ``'.png'``). Matching is case-insensitive.
            ``None`` or an empty string keeps every entry.

    Returns:
        A new list of names. Names whose part before the first ``'.'`` is an
        integer come first in numeric order (``'2.png'`` before ``'10.png'``);
        all remaining names (e.g. ``'.DS_Store'``) follow in lexicographic
        order instead of aborting the sort.
    """
    # Normalise the requested extension to a lower-case '.xyz' suffix.
    suffix = ('.' + ext.lower().lstrip('.')) if ext else None

    # Apply the extension filter to the file names themselves.
    names = [
        name
        for name in os.listdir(dir)
        if suffix is None or name.lower().endswith(suffix)
    ]

    def _order(name):
        # Numeric stems sort by value; anything else is pushed to the end.
        try:
            return (0, int(name.split('.')[0]), name)
        except ValueError:
            return (1, 0, name)

    return sorted(names, key = _order)

In [30]:
# Walk the dataset tree and flatten every image into one CSV row.
path = './Hijja2-master'  # dataset root
letter_file = findfiles(path, one_level = 0)

save_csv = []
label_record = 0  # advances once per (primary folder, secondary folder) pair

for letter_name in letter_file:
    letter_path = '/'.join((path, letter_name))
    letter_small_file = findfiles(letter_path, one_level = 1)

    for letter_small_name in letter_small_file:
        letter_small_path = '/'.join((letter_path, letter_small_name))
        img_list = getFileList(letter_small_path, [], ext='png')

        for img_name in img_list:
            img_path = '/'.join((letter_small_path, img_name))  # path of one image

            # Grayscale, shrink to 32x32, then flatten to a 1024-vector.
            # After the inversion a black pixel (gray 0) becomes 1.0 and a
            # white pixel (gray 255) becomes 0.0.
            img = Image.open(img_path).convert('L').resize((32, 32))
            img_vector = (1 - np.array(img)/255).reshape(1024,)

            # Metadata prefix for the row. Mixing str and int in one np.array
            # yields a string array, so the concatenated row is stored as text.
            head = np.array([letter_name, letter_small_name, img_name, label_record]).reshape(-1,)
            save_csv.append(np.concatenate([head, img_vector]))

        # Every secondary folder gets its own label id, even across letters.
        label_record += 1


# Column names: four metadata columns followed by pixel0 .. pixel1023.
title = ['Primary folder', 'Secondary folder', 'file name', 'label'] + [
    'pixel' + str(pixel_idx) for pixel_idx in range(1024)
]

save_csv_dataframe = pd.DataFrame(save_csv, columns =title)
# index = True writes the row index as an extra leading (unnamed) column.
save_csv_dataframe.to_csv('./save_csv_dataframe.csv', index = True, header = True)


# Record how many samples each label has, on screen and in a text file.
count = save_csv_dataframe['label'].value_counts(sort=False)
print(count)
count.to_csv('./count.txt')
# f=open("./counts.txt","w")
# f.writelines(str(count))
# f.close()

0      456
1      443
2      460
3      460
4      464
      ... 
103    433
104    425
105    430
106    426
107    427
Name: label, Length: 108, dtype: int64


In [17]:
# Prepare an 80/20 train/test split of the CSV written above.
split_rate = 0.8  # fraction of each group that goes to the training set
data_all = pd.read_csv('./save_csv_dataframe.csv')

np.random.seed(101)  # fix the random seed so the shuffle is repeatable

# Groups used for the split: the sorted distinct 'Secondary folder' values.
# NOTE(review): the groups are keyed by folder name, not by the 'label'
# column; the two coincide only if secondary folder names are unique across
# primary folders - confirm against the dataset layout.
secondary_names = np.unique(data_all[['Secondary folder']])
list_label = list(secondary_names)

In [None]:
# Shuffle each 'Secondary folder' group and cut it at split_rate.
# The random seed is set in the preceding cell, so re-running only this cell
# continues the random stream and produces a different split.
train_temp, test_temp = [], []

for folder_name in list_label:
    data_secondary = data_all[data_all['Secondary folder'] == folder_name]
    group_size = len(data_secondary)

    shuffled_index = np.random.permutation(group_size)  # random order of row positions
    split_index = int(group_size*split_rate)            # first position of the test part

    # Rows before the boundary go to train, the remainder to test.
    train_secondary = np.array(data_secondary.iloc[shuffled_index[:split_index]])
    test_secondary = np.array(data_secondary.iloc[shuffled_index[split_index:]])

    train_temp.extend(train_secondary)
    test_temp.extend(test_secondary)


# Column names for the output files. 'raw_index' names the leading column
# that read_csv picked up from the index saved in save_csv_dataframe.csv.
title = ['raw_index', 'Primary folder', 'Secondary folder', 'file name', 'label'] + [
    'pixel' + str(pixel_idx) for pixel_idx in range(1024)
]

# Write the two subsets to their own CSV files.
train_file = pd.DataFrame(train_temp, columns =title)
train_file.to_csv('./train.csv', index = True, header = True)

test_file = pd.DataFrame(test_temp, columns =title)
test_file.to_csv('./test.csv', index = True, header = True)

In [29]:
# Sanity check: rebuild row n of the saved CSV as a 32x32 image and show it.
data_all = pd.read_csv('./save_csv_dataframe.csv')
n = 1444  # row to inspect

# The last 1024 fields of a row are the pixel columns. Going through a plain
# list lets numpy infer a numeric dtype from the pixel values alone.
pixel_values = list(data_all.iloc[n,:])[-1024:]
img_matrix = np.array(pixel_values).reshape([32,32])

import cv2
cv2.imshow('img', img_matrix)
cv2.waitKey(0)  # wait for a key press before closing the window
cv2.destroyAllWindows()
