In [4]:
import numpy as np, humanfriendly as hf, warnings
import time
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split,GridSearchCV, cross_val_score
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report


In [5]:
def see_time(note, start):
    """Print how much time has passed since ``start``, prefixed with ``note``.

    Parameters
    ----------
    note : label printed in front of the formatted duration.
    start : an earlier ``time.perf_counter()`` reading to measure from.

    Returns nothing; the duration is only printed, formatted by
    ``humanfriendly.format_timespan`` with ``detailed=True``.
    """
    duration = time.perf_counter() - start
    print(note, hf.format_timespan(duration, detailed=True))

In [6]:
def get_cross(model, data, target, groups=10):
    """Cross-validate ``model`` on ``data``/``target`` and return the scores.

    Parameters
    ----------
    model : estimator handed to ``cross_val_score``.
    data : feature matrix.
    target : labels matching ``data``.
    groups : forwarded as ``cv`` (defaults to 10).

    Returns
    -------
    Whatever ``cross_val_score`` returns (the scores of the individual runs).
    """
    fold_scores = cross_val_score(estimator=model, X=data, y=target, cv=groups)
    return fold_scores

In [11]:
# Load the feature matrix and the label vector from their saved NumPy files.
X, y = np.load('data/X_faces.npy'), np.load('data/y_faces.npy')

# Hold out a test set; the fixed seed keeps the split identical between runs.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=0)

# Displayed as the cell result: (number of samples, number of features).
X.shape

(1288, 1850)

In [8]:
# Dimensionality reduction.
# A float n_components asks PCA to keep as many components as needed to
# explain that fraction of the variance (here 95%). The projection is fitted
# on the training split only and then applied to both splits.
pca = PCA(n_components=0.95, whiten=True, random_state=1).fit(X_train)
X_train_pca, X_test_pca = pca.transform(X_train), pca.transform(X_test)

In [10]:
# Report the feature count before and after the PCA projection.
# The class name is passed as its own print() argument so that it is separated
# from the label by a space (string concatenation produced "PCAfeatures ...").
pca_name = pca.__class__.__name__
print(pca_name, 'features (before PCA):', X.shape[1],
      'features (after PCA):', pca.n_components_)

PCAfeatures (before PCA): 1850 features (after PCA): 135


In [12]:
# Baseline: an SGD-trained linear classifier on the PCA features, using the
# estimator's default loss and regularisation settings.
sgd_model = SGDClassifier(random_state=0, tol=1e-3, max_iter=1000)
sgd_model.fit(X_train_pca, y_train)

# Evaluate on the held-out split and show per-class precision/recall/F1.
y_pred = sgd_model.predict(X_test_pca)
cr = classification_report(y_true=y_test, y_pred=y_pred)
print(cr)

              precision    recall  f1-score   support

           0       0.89      0.57      0.70        28
           1       0.80      0.78      0.79        63
           2       0.82      0.58      0.68        24
           3       0.71      0.89      0.79       132
           4       0.52      0.55      0.54        20
           5       0.86      0.27      0.41        22
           6       0.65      0.67      0.66        33

    accuracy                           0.73       322
   macro avg       0.75      0.62      0.65       322
weighted avg       0.75      0.73      0.72       322



In [15]:
# Hyper-parameter search space: regularisation strength crossed with two loss
# functions; iteration cap, penalty and tolerance are held fixed.
# NOTE(review): newer scikit-learn releases renamed the 'log' loss to
# 'log_loss' -- confirm against the installed version before re-running.
param_grid = dict(
    alpha=[0.001, 0.01, 0.1, 1.0],
    max_iter=[1000],
    loss=['log', 'perceptron'],
    penalty=['l1'],
    tol=[1e-3],
)
grid = GridSearchCV(estimator=sgd_model, param_grid=param_grid, cv=5)

# Time the exhaustive search (every candidate is cross-validated 5-fold).
start = time.perf_counter()
grid.fit(X_train_pca, y_train)
see_time('training time:', start)

# Keep the winning parameter combination for the next cell.
bp = grid.best_params_
print('best parameters:', bp)



training time: 8 seconds, 544 milliseconds, 538 microseconds and 600 nanoseconds
best parameters: {'alpha': 0.001, 'loss': 'log', 'max_iter': 1000, 'penalty': 'l1', 'tol': 0.001}


In [18]:
# Refit a fresh classifier with the parameters selected by the grid search.
sgd_bpmodel = SGDClassifier(random_state=1, **bp)
sgd_bpmodel.fit(X_train_pca, y_train)

# Held-out evaluation of the tuned model.
y_pred = sgd_bpmodel.predict(X_test_pca)
cr = classification_report(y_true=y_test, y_pred=y_pred)
print(cr)

# Cross-validated score on the training split (get_cross defaults to cv=10).
scores = get_cross(sgd_bpmodel, X_train_pca, y_train)
print('cross-validation:', scores.mean())

              precision    recall  f1-score   support

           0       0.70      0.57      0.63        28
           1       0.78      0.81      0.80        63
           2       0.68      0.71      0.69        24
           3       0.83      0.84      0.84       132
           4       0.47      0.70      0.56        20
           5       0.90      0.41      0.56        22
           6       0.69      0.76      0.72        33

    accuracy                           0.75       322
   macro avg       0.72      0.69      0.69       322
weighted avg       0.77      0.75      0.75       322

cross-validation: 0.773450773120741
