### Baseline

Classifier: KNN

In [None]:
import os, sys, glob, argparse
from PIL import Image
import cv2
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report

In [None]:
# Data Loading
# Gather the image paths in lexicographic order.
# NOTE(review): presumably this ordering is meant to line up with the labels
# sorted by 'name' below -- confirm that 'name' holds the image file names.
train_path = sorted(glob.glob('./data/train/*'))
test_path = sorted(glob.glob('./data/test/*'))

# Read the label table, order its rows by the 'name' column, and keep the
# 'label' column as an array.
train_df = pd.read_csv('data/train.csv').sort_values(by='name')
train_label = train_df['label'].values

In [None]:
# Feature Engineering
def image_feat(path):
    """Compute eight global intensity statistics for one image.

    The image is read with ``cv2.imread(path, 0)`` (single-channel
    grayscale) and converted to float32 before the statistics are taken.

    Args:
        path: Path of the image file to read.

    Returns:
        A list of eight values, in this order: number of non-zero pixels,
        number of zero pixels, mean, standard deviation, number of columns
        whose mean is non-zero, number of rows whose mean is non-zero,
        largest column mean, largest row mean.

    Raises:
        OSError: If ``cv2.imread`` returns ``None`` for ``path``.
    """
    img = cv2.imread(path, 0)
    if img is None:
        # cv2.imread reports failure by returning None rather than raising;
        # fail here with the offending path instead of an AttributeError on
        # the .astype call below.
        raise OSError(f"cv2.imread could not read image: {path!r}")
    img = img.astype(np.float32)

    # Each axis mean is used twice below, so compute it once.
    col_mean = img.mean(0)  # averaged over axis 0: one value per column
    row_mean = img.mean(1)  # averaged over axis 1: one value per row

    feat = [
        (img != 0).sum(),            # number of non-zero pixels
        (img == 0).sum(),            # number of zero pixels
        img.mean(),                  # mean
        img.std(),                   # standard deviation
        len(np.where(col_mean)[0]),  # columns whose mean is non-zero
        len(np.where(row_mean)[0]),  # rows whose mean is non-zero
        col_mean.max(),              # largest column mean
        row_mean.max(),              # largest row mean
    ]
    return feat

In [None]:
# Train & Evaluation
# Feature vectors for the training images, then for the test images.
train_feat = [image_feat(p) for p in tqdm(train_path)]
test_feat = [image_feat(p) for p in tqdm(test_path)]

# Cross-validated predictions on the training set (default settings of
# cross_val_predict), summarised as a per-class report.
train_X = np.array(train_feat)
train_pred = cross_val_predict(KNeighborsClassifier(), train_X, train_label)
print(classification_report(train_label, train_pred))

# Fit a fresh classifier on the whole training set and predict the test set.
model = KNeighborsClassifier()
model.fit(train_X, train_label)
test_pred = model.predict(np.array(test_feat))

In [None]:
# Output
# Build the submission table: one row per test image, with the image's file
# name and its predicted label, ordered by file name.
submit = pd.DataFrame(
    {
        # os.path.basename strips the directory with the platform's own
        # separator rules. Splitting on '/' leaves the directory in place on
        # Windows, where glob joins the directory and file name with '\\'.
        'name': [os.path.basename(x) for x in test_path],
        'label': test_pred,
    }
)
submit = submit.sort_values(by='name')
# index=False is the documented way to omit the row index from the CSV.
submit.to_csv('submit.csv', index=False)