Imports

In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

#Classifiers
from sklearn.ensemble import RandomForestClassifier

# Common imports
import numpy as np
import pandas as pd
import os
import tarfile
import cv2
import pickle

Set numpy's random-state to 42 to make this notebook's output stable across runs.

In [2]:
np.random.seed(42)

**Reload data** decides if a new dataframe has to be created. \
Default is False, but will be set to True if no dataframe exists yet.\
Can be set to True to force the program to overwrite an existing dataframe.


In [3]:
RELOAD_DATA = False

Helper function 'target_value'.

In [4]:
def target_value(val):
    if val == 'aanwezig':
        return 2
    if val == 'buiten':
        return 1
    return 0

Check if directory 'model' exists.

In [5]:
if not os.path.isdir('./model/'):
    os.mkdir('./model/')

Check if a dataframe exists, and create a new one if this is not the case.
Else load the existing dataframe.

In [6]:
if not os.path.isfile('./model/dataframe.sav'):
    RELOAD_DATA = True # When there is not yet a dataframe, create one

if RELOAD_DATA: # Check whether there needs to be created a new dataframe
    #Extract when not already extracted
    if not os.path.isdir('./data/classificatie'):
        if not os.path.isfile('./data/classificatie.tar'):
            raise Exception('Classificatie.tar not fount')

        print('Extracting tar...')
        tar = tarfile.open('./data/classificatie.tar')
        tar.extractall('./data/')
        tar.close()
        print('Extracting tar Done!')

    if not os.path.isdir('./data/classificatie'):
        raise Exception('Extracted files not found')

    samples = []

    # Get grayscale values from pictures
    print('Creating dataframe')
    for folder in os.listdir('./data/classificatie/'):
        for file in os.listdir(f'./data/classificatie/{folder}'):

            img = cv2.imread(f'./data/classificatie/{folder}/{file}', 0)
            img = cv2.resize(img, (320, 176))

            # add them to a dataframe
            imgd = dict()
            imgd['target'] = target_value(folder)
            c = 0
            for i in img.flatten():
                c += 1
                imgd[f'p{c}'] = i
            samples.append(imgd)
            print(file)

    print('Saving DataFrame')
    musti = pd.DataFrame.from_records(samples)
    pickle.dump(musti, open('./model/dataframe.sav', 'wb'))
else:
    print('Loading DataFrame')
    musti = pickle.load(open('./model/dataframe.sav', 'rb'))

print('DataFrame Loaded')

Loading DataFrame
DataFrame Loaded


Create training and test data

In [7]:
X, y = musti.drop('target', axis=1), musti['target']
y = y.astype(np.uint8)  # less RAM space

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f'Training set Shape: {X_train.shape}')

Training set Shape: (2112, 56320)


Create and fit the chosen model

In [16]:
model = RandomForestClassifier(n_estimators=400, random_state=42)
model = model.fit(X_train, y_train)

Evaluate the chosen model and print the results

In [17]:
a = cross_val_score(model, X_test, y_test, cv=3)
print(f'\t{a}')
print(f'\tmean: {np.mean(a)}')

	[0.80113636 0.90909091 0.86931818]
	mean: 0.8598484848484849
