Imports

In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler

#Classifiers
from sklearn.ensemble import RandomForestClassifier

# Common imports
import numpy as np
import pandas as pd
from random import randint
import os
import tarfile
import cv2
import pickle

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


Set numpy's random-state to 42 to make this notebook's output stable across runs.

In [2]:
np.random.seed(42)

**Reload data** decides if a new dataframe has to be created. \
Default is False, but will be set to True if no dataframe exists yet.\
Can be set to True to force the program to overwrite an existing dataframe.


In [3]:
RELOAD_DATA = False

Helper function 'target_value'.

In [4]:
def target_value(val):
    if val == 'aanwezig':
        return 2
    if val == 'buiten':
        return 1
    return 0

Check if directory 'model' exists.

In [5]:
if not os.path.isdir('./model/'):
    os.mkdir('./model/')

Check if a dataframe exists, and create a new one if this is not the case.
Else load the existing dataframe.

In [6]:
if not os.path.isfile('./model/dataframe.sav'):
    RELOAD_DATA = True # When there is not yet a dataframe, create one

if RELOAD_DATA: # Check whether there needs to be created a new dataframe
    #Extract when not already extracted
    if not os.path.isdir('./data/classificatie'):
        if not os.path.isfile('./data/classificatie.tar'):
            raise Exception('Classificatie.tar not fount')

        print('Extracting tar...')
        tar = tarfile.open('./data/classificatie.tar')
        tar.extractall('./data/')
        tar.close()
        print('Extracting tar Done!')

    if not os.path.isdir('./data/classificatie'):
        raise Exception('Extracted files not found')


    # Get grayscale values from pictures
    print('Creating dataframe')
    samples = []
    sample_counter = 0
    musti = pd.DataFrame()

    for folder in os.listdir('./data/classificatie/'):
        for file in os.listdir(f'./data/classificatie/{folder}'):
            img = cv2.imread(f'./data/classificatie/{folder}/{file}', 0)
            img = cv2.normalize(img,np.zeros(img.shape), 0, 1000, cv2.NORM_MINMAX)

            # add them to a dataframe
            imgd = dict()
            imgd['target'] = target_value(folder)
            c = 0
            for i in img.flatten():
                c += 1
                imgd[f'p{c}'] = i
            samples.append(imgd)
            sample_counter+=1
            #print(file)

            if sample_counter % 200==0:
                temp_df = pd.DataFrame.from_dict(samples)
                musti = musti.append(temp_df, ignore_index=True)
                samples = []
    temp_df = pd.DataFrame.from_dict(samples)
    musti = musti.append(temp_df, ignore_index=True)
    samples = []

    print('Saving DataFrame')
    
    pickle.dump(musti, open('./model/dataframe.sav', 'wb'))
else:
    print('Loading DataFrame')
    musti = pickle.load(open('./model/dataframe.sav', 'rb'))

print('DataFrame Loaded')
print(musti)

Creating dataframe
Saving DataFrame
DataFrame Loaded
     target  p1   p2  p3  p4  p5  p6  p7  p8  p9  ...  p225271  p225272  \
0         2  -1   -1  -1  -1  -1  -1  -1  -1  -1  ...     -125     -121   
1         2  -1   -7  -1  -1  -1  -1  -1  -1  -1  ...      102      106   
2         2  -1 -111  -1 -15  -1 -53  -1  -1  -1  ...      -24      -24   
3         2  -1 -113  -1  -1  -1 -19  -1  -1  -1  ...       -1       -1   
4         2  -1  126  -1  -4  -1  -1 -20  -1  -1  ...       -1       -1   
..      ...  ..  ...  ..  ..  ..  ..  ..  ..  ..  ...      ...      ...   
829       0  -1  -40  -1  -1  -1  -1 -26  -1  -1  ...     -106      -96   
830       0  -1  -36  -1  -1  -1  -1 -44  -1  -1  ...      -44      -40   
831       0  -1  110  -1 -23  -1 -71 -58  -1  -1  ...      -80      -62   
832       0  -1  123  -1 -44  -1 -87 -23  -1  -1  ...      -31      -19   
833       0  -1  -26  -1  -1  -1  -1  -1  -1  -1  ...       -1       -1   

     p225273  p225274  p225275  p225276  p2252

In [7]:
smallest_dataset_len = min(len(musti[musti.target == 2]), len(musti[musti.target == 1]), len(musti[musti.target == 0]))
for i in range(3):
    frac = 1 - smallest_dataset_len / len(musti[musti.target == i])
    print(frac, i)
    musti = musti.drop(musti.query(f'target == {i}').sample(frac=frac).index)

print(len(musti[musti.target == 2]), len(musti[musti.target == 1]), len(musti[musti.target == 0]))

0.4107142857142857 0
0.5758354755784062 1
0.0 2
165 165 165


Create training and test data

In [8]:
X, y = musti.drop('target', axis=1), musti['target']
y = y.astype(np.uint8)  # less RAM space

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f'Training set Shape: {X_train.shape}')

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

Training set Shape: (396, 225280)


Create and fit the chosen model

RandomForestClassifier

In [9]:
model = RandomForestClassifier()
print('fitting model')

#optimal params found with GridSearchCV for RandomForestClassifier
param_grid = [
    {'n_estimators': [135], 
     'max_features': [18],
     'max_depth': [36],
     'min_samples_split': [2],
     'min_samples_leaf':[1],
     'random_state': [42]},
    ]

gridsearch = GridSearchCV(model, param_grid, cv=3, verbose=2)
print(gridsearch)
gridsearch.fit(X_train, y_train)

print(gridsearch.best_params_,gridsearch.best_score_)

fitting model
GridSearchCV(cv=3, estimator=RandomForestClassifier(),
             param_grid=[{'max_features': [2, 4, 8],
                          'n_estimators': [10, 100, 200],
                          'random_state': [42]}],
             verbose=2)
Fitting 3 folds for each of 9 candidates, totalling 27 fits


Evaluate the chosen model and print the results

In [None]:
a = cross_val_score(model, X_test, y_test, cv=3)
print(f'\t{a}')
print(f'\tmean: {np.mean(a)}')