In [5]:
# USAGE
# python train_model.py --db ../datasets/animals/hdf5/features.hdf5 \
#	--model animals.cpickle
# python train_model.py --db ../datasets/caltech-101/hdf5/features.hdf5 \
#	--model caltech101.cpickle
# python train_model.py --db ../datasets/flowers17/hdf5/features.hdf5 \
#	--model flowers17.cpickle

# import the necessary packages
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import argparse
import pandas as pd
import numpy as np
import pickle
import h5py
from datetime import datetime
start_time = datetime.now()
# do your work here

# construct the argument parse and parse the arguments
# ap = argparse.ArgumentParser()
# ap.add_argument("-d", "--db", required=True,
#     help="path HDF5 database")
# ap.add_argument("-m", "--model", required=True,
#     help="path to output model")
# ap.add_argument("-j", "--jobs", type=int, default=-1,
#     help="# of jobs to run when tuning hyperparameters")
# args = vars(ap.parse_args())

dataset = '../datasets/fruit/images/'
h5db = 'git_output/feat_extract_vgg16_augmentation_fruit_notebook.hdf5'
model_output_path = 'git_output/feat_extract_vgg16_fruit_notebook.model'
jobs = -1


# grab the list of images that we'll be describing, then extract
# the class label names from the image paths
# print("[INFO] loading images...")
# imagePaths = list(paths.list_images(dataset))
# classNames = [pt.split(os.path.sep)[-2] for pt in imagePaths]
# classNames = [str(x) for x in np.unique(classNames)]


# open the HDF5 database for reading then determine the index of
# the training and testing split, provided that this data was
# already shuffled *prior* to writing it to disk
db = h5py.File(h5db, "r")
i = int(db["labels"].shape[0] * 0.75)

# define the set of parameters that we want to tune then start a
# grid search where we evaluate our model for each value of C
print("[INFO] tuning hyperparameters...")
params = {"C": [0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0]}
model = GridSearchCV(LogisticRegression(solver="lbfgs",
    multi_class="auto"), params, cv=3, n_jobs=jobs)
model.fit(db["features"][:i], db["labels"][:i])
print("[INFO] best hyperparameters: {}".format(model.best_params_))

# evaluate the model
print("[INFO] evaluating...")
preds = model.predict(db["features"][i:])
print(classification_report(db["labels"][i:], preds,
    target_names=db["label_names"]))

# output classification report to CSV
report = classification_report(db["labels"][i:], preds,
    output_dict=True, target_names=db["label_names"])
df = pd.DataFrame(report).transpose()
df.to_csv('git_output/feat_extract_vgg16_soda_notebook.csv')

# serialize the model to disk
print("[INFO] saving model...")
f = open(model_output_path, "wb")
f.write(pickle.dumps(model.best_estimator_))
f.close()

end_time = datetime.now()
print('*'* 50, '\n[INFO] Duration: {}'.format(end_time - start_time), '\n', '*'*50)


# close the database
db.close()

[INFO] tuning hyperparameters...
[INFO] best hyperparameters: {'C': 1.0}
[INFO] evaluating...
              precision    recall  f1-score   support

       Apple       0.86      0.93      0.89       123
      Banana       0.95      0.91      0.93        78
       Grape       0.94      0.88      0.91        90
      Orange       0.96      0.96      0.96       119
  Watermelon       0.93      0.93      0.93        67

    accuracy                           0.92       477
   macro avg       0.93      0.92      0.92       477
weighted avg       0.92      0.92      0.92       477

[INFO] saving model...
************************************************** 
[INFO] Duration: 0:01:04.076175 
 **************************************************


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
