### Machine learning program to identify images of power-meters

### caveats:
### The model is trained on jpg images hence only jpg images can be tested 

In [188]:
# data processing libraries
import numpy as np
from pathlib import Path
from PIL import Image
from glob import glob

In [189]:
# importing ML tools
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [190]:
# command-line path to images on local drive
yesmeter_images_path = Path.cwd() / 'sagar' / 'meterdata' / 'yesmeter' / '*.jpg'  # images of meters
notmeter_images_path = Path.cwd() / 'sagar' / 'meterdata' / 'notmeter' / '*.jpg'  # other images

In [191]:
# list of image file-names
yesmeter_filename_list = glob(str(yesmeter_images_path))
notmeter_filename_list = glob(str(notmeter_images_path))

### This function turns images into their corresponding array of pixels

In [192]:
def trim_image_to_square(file_path):    
    """the function crops the image to a square, then resizes it to 96 px X 96 px and returns a 1-d array of its pixels """
    img = Image.open(file_path)
    width = img.width
    height = img.height
    if width < height:
        crop_from_top = (height - width) // 2
        crop_from_bottom = crop_from_top + width
        squared_img = img.crop((0, crop_from_top, width, crop_from_bottom)).resize((96, 96))
        arr_data = np.array(squared_img)
    else:
        crop_from_left = (width - height) // 2
        crop_from_right = crop_from_left + height
        squared_img = img.crop((crop_from_left, 0, crop_from_right, height)).resize((96, 96))
        arr_data = np.array(squared_img) 
    return arr_data.flatten() / 255 # standardizing

### Preparing the data

In [193]:
# numpy matrix of few meter-images
meter_images_array = np.zeros(shape=(1, 27648), dtype=np.int64)  # 27648 is 96 X 96 X 3. The 3 is for RGB

number_of_images_selected = 40
for each in yesmeter_filename_list[:number_of_images_selected]:
    data = trim_image_to_square(each)
    meter_images_array = np.append(meter_images_array, [data], axis=0)
meter_images_array = meter_images_array[1:] # leaving out the top row of zeros

In [194]:
# Labels for meter images, the lables are '1' because the images contain power-meters
meter_labels = np.ones(shape=meter_images_array.shape[0])
meter_labels_matrix = np.c_[meter_labels, meter_images_array]

In [195]:
# numpy matrix of few non-meter-images
not_meter_images_array = np.zeros(shape=(1, 27648), dtype=np.int64)

number_of_images_selected = 30
for each in notmeter_filename_list[:number_of_images_selected]:
    data = trim_image_to_square(each)
    not_meter_images_array = np.append(not_meter_images_array, [data], axis=0)
not_meter_images_array = not_meter_images_array[1:] # leaving out the top row of zeros

In [196]:
# Labels for non-meter images, the lables are '0' because the images contain no power-meters
not_meter_labels = np.zeros(shape=not_meter_images_array.shape[0])
not_meter_labels_matrix = np.c_[not_meter_labels, not_meter_images_array]

In [197]:
# combining matrices of both types of images:
test_data_matrix = np.r_[meter_labels_matrix, not_meter_labels_matrix]
test_data_matrix.shape

(70, 27649)

In [198]:
X = test_data_matrix[:, 1:]  # features matrix
y = test_data_matrix[:, 0] # target vector

### Selecting the best classifier out of four

In [199]:
score_data = list()
for name, model in (('knn_pipe', KNeighborsClassifier()),
('SVC', SVC()),
('LoR', LogisticRegression()), ('Random_forest', RandomForestClassifier())):
    score = cross_val_score(estimator=model,
    X=X,
    y=y,
    scoring='accuracy',
    cv=3)
    score_data.append((name, score.mean()))
    
for each in score_data:
    print(each)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

('knn_pipe', 0.8001207729468599)
('SVC', 0.8580917874396136)
('LoR', 0.8140096618357489)
('Random_forest', 0.8291062801932366)


### The best classifier is SVC (Support Vector CLassifier) with an accuracy of 85%

In [200]:
# tuning its hyperparameters of SVC
params = [{'gamma': ['scale', 'auto']}, {'degree': [3, 4]}, {'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}]
gsc = GridSearchCV(estimator=SVC(),
    param_grid=params,
    scoring='accuracy')

gsc.fit(X, y)

In [201]:
gsc.best_params_

{'gamma': 'scale'}

### Training the model with the best parameters

In [202]:
classifier = SVC() # default parameters are proven best
classifier.fit(X, y)
print('model is trained')

model is trained


### Testing a randomly selected image of a power-meter

In [203]:
image_to_test = yesmeter_filename_list[70]
test_data = trim_image_to_square(image_to_test)
test_data = test_data.reshape(1, -1)
classifier.predict(test_data)

array([1.])

### Testing a randomly selected image that is not a power-meter

In [204]:
notmeter_image_to_test = notmeter_filename_list[99]
notmeter_test_data = trim_image_to_square(notmeter_image_to_test)
notmeter_test_data = notmeter_test_data.reshape(1, -1)
classifier.predict(notmeter_test_data)

array([0.])

### Preserving the trained model for distribution

In [205]:
# preserving the model
import pickle

In [206]:
pickle.dump(classifier, open('svc_meter_model.pkl', 'wb'))
print('model is saved')

model is saved


### Limitations and scope of improvement

In [2]:
# The efficiency of the model is limited by the following factors
# 1. The processing capability of my low-end laptop
# 2. The small size of the training images (750 images of power meters and 250 images of mobile phones)

### ~ End ~