### England  Germany Spain  Argentina France  5국가 분류

### 1. PCA + SVC


In [1]:
import cv2
import numpy as np
import sys
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import os
import pandas as pd

In [7]:
def read_data(fin):
    """ 이미지 파일을 읽어 들여 이미지 데이터, 국가를 리턴"""
    data = pd.read_csv(fin)
    target_li=[]
    data_li=[]
    for i in range(len(data)):
        image_id = int(data.iloc[i][0])
        target_nation = data.iloc[i][3]
        working_dir = fin.split('/')[:-1]
        image_path = '/'.join(working_dir)+'/valid_pictures/'+ target_nation +'/{}.png'.format(image_id)
       
        if (os.path.isfile(image_path)):
            image_data = cv2.imread(image_path, 0)
            data_li.append(image_data)
            target_li.append(target_nation)

    return (np.array(data_li), np.array(target_li))

In [8]:
def create_train_test_data(image_data, label_li):
    #데이터 수, 세로 픽셀, 가로 픽셀 
    n_samples, image_h, image_w = image_data.shape

    X = image_data.reshape(n_samples, -1)
 
    n_features = X.shape[1]
    y = label_li
    #클래스 갯수 = 분류할 국가 수
    n_classes = 5
    
    print("total dataset size:")
    print("n_samples: %d" % n_samples)
    print("n_features: %d" % n_features)
    print("n_classes: %d" % n_classes)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
    return(X_train, X_test, y_train, y_test)

In [9]:
def extract_features(X_train, X_test, n_components):
    print("Extracting the top %d eigenfaces from %d faces"
         % (n_components, X_train.shape[0]))
    pca = PCA(n_components = n_components,
             svd_solver='randomized', whiten=True).fit(X_train)
    
    eigenfaces = pca.components_.reshape((n_components, 48, 48))
    #주성분 차원바꾸기
    X_train_pca = pca.transform(X_train)
    X_test_pca = pca.transform(X_test) 
    
    return(X_train_pca, X_test_pca, eigenfaces)

In [29]:
def train_test_classifier(X_train_pca, X_test_pca, y_train, y_test):
    print("Fitting the classifier to the training set")
    param_grid={'C':[1e3,1e4],
               'gamma':[0.001, 0.005, 0.01],}
    clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
    clf = clf.fit(X_train_pca, y_train)
    print("best estimator found by grid search")
    print(clf.best_estimator_)
    print("predicting people's names on the test set")
    y_pred = clf.predict(X_test_pca)
    print(classification_report(y_test, y_pred))

In [11]:
from matplotlib import pyplot as plt

def plot_gallery(images, n_col=5):
    n_row = round(images.shape[0]/n_col)
    plt.figure(figsize=(1.8 * n_col, 2.4 * n_row))
    plt.subplots_adjust(bottom=0.1, left=0.01, right=0.99, top=0.90, hspace=0.35)
    
    for i in range(n_row * n_col):
        plt.subplot(n_row, n_col, i+1)
        #플롯값의 최대치를 흰색으로, 최소치를 검은색으로 변환 ( 특징이 되는 곳이 흰색으로 두드러지게 하는것)
        plt.imshow(images[i], cmap='gray')
        plt.xticks(())
        plt.yticks(())

In [30]:
argv = sys.argv
image_data, label = read_data('D:/DL/WAYF/Data/5country.csv')
#몇차원으로 압축?
n_eigenface = 20
X_train, X_test, y_train, y_test = create_train_test_data(image_data, label)
X_train_pca, X_test_pca, eigenface = extract_features(X_train, X_test, n_eigenface)
train_test_classifier(X_train_pca, X_test_pca, y_train, y_test)
#보여줄 필요는 없음
#plot_gallery(eigenface)
#plt.show()

total dataset size:
n_samples: 4805
n_features: 2304
n_classes: 5
Extracting the top 20 eigenfaces from 3603 faces
Fitting the classifier to the training set




best estimator found by grid search
SVC(C=1000.0, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
predicting people's names on the test set
              precision    recall  f1-score   support

   Argentina       0.31      0.44      0.37       152
     England       0.67      0.48      0.55       385
      France       0.41      0.46      0.44       191
     Germany       0.60      0.63      0.62       276
       Spain       0.49      0.53      0.51       198

    accuracy                           0.51      1202
   macro avg       0.50      0.51      0.50      1202
weighted avg       0.54      0.51      0.52      1202



------------------

### k-means

In [49]:
from sklearn.cluster import KMeans
from sklearn import preprocessing

In [147]:
def train_test_classifier_kmeans(X_train_pca, X_test_pca, y_train, y_test):
    clf = KMeans(n_clusters=5, init="k-means++", n_init=10, max_iter=100, random_state=42)
    clf.fit(X_train_pca) 
    print("predicting people's names on the test set")
  
 #그룹별 나라를 아무거나 붙여주면 정확도 10% 대 
 #    label_encoder = preprocessing.LabelEncoder() 
#    y_test = label_encoder.fit_transform(y_test) 
    
    y_pred = clf.predict(X_test_pca)
  
 #클러스터링으로 니온 그룹별(번호)에 나라 이름을 붙여줌. np.where(if문, 조건 o, 조건 x)
    y_pred = np.where(y_pred == 0, 'England',
         np.where(y_pred == 1, 'Germany',
                  np.where(y_pred == 4, 'France', 
                           np.where(y_pred == 3, 'Spain', 'Argentina'))))
            
    print(classification_report(y_test, y_pred))

In [148]:
train_test_classifier_kmeans(X_train_pca, X_test_pca, y_train, y_test)

predicting people's names on the test set
              precision    recall  f1-score   support

   Argentina       0.15      0.24      0.18       152
     England       0.59      0.22      0.32       385
      France       0.24      0.39      0.30       191
     Germany       0.44      0.26      0.32       276
       Spain       0.24      0.40      0.30       198

    accuracy                           0.29      1202
   macro avg       0.33      0.30      0.28      1202
weighted avg       0.38      0.29      0.30      1202

