Imports

In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

#Classifiers
from sklearn.ensemble import RandomForestClassifier

# Common imports
import numpy as np
import pandas as pd
from random import randint
import os
import tarfile
import cv2
import pickle

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


Set numpy's random-state to 42 to make this notebook's output stable across runs.

In [2]:
# Seed both random number generators this notebook draws from: NumPy's global
# state and Python's `random` module. The image subsampling uses
# `random.randint`, which np.random.seed does not affect.
import random

np.random.seed(42)
random.seed(42)

**Reload data** decides whether a new dataframe has to be created. \
The default is False, but it is set to True automatically if no dataframe exists yet. \
Set it to True to force the program to overwrite an existing dataframe.


In [3]:
# Set to True to force rebuilding (and overwriting) the cached dataframe.
# It is switched to True automatically when ./model/dataframe.sav is missing.
RELOAD_DATA = False

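Helper function `target_value`: maps a class folder name to its integer label (`'aanwezig'` → 2, `'buiten'` → 1, anything else → 0).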

In [4]:
def target_value(val):
    """Translate a class name into its integer target code.

    Args:
        val: Class name; the notebook passes the name of an image folder.

    Returns:
        2 for 'aanwezig', 1 for 'buiten', and 0 for any other value.
    """
    known_codes = (('aanwezig', 2), ('buiten', 1))
    for name, code in known_codes:
        if val == name:
            return code
    # Everything that is not a known class name falls into class 0
    return 0

Create the directory 'model' if it does not exist yet.

In [5]:
# Make sure the output directory for the cached dataframe exists.
# exist_ok=True makes this a no-op when the directory is already there and
# avoids the gap between checking for the directory and creating it.
os.makedirs('./model/', exist_ok=True)

Check whether a saved dataframe exists. If it does not (or a reload is forced), create a new one from the images and save it;
otherwise load the existing dataframe.

In [6]:
# Build the pixel dataframe `musti` from the image folders, or load the cached
# copy from ./model/dataframe.sav.
# Layout of `musti`: one row per sampled image, a 'target' column followed by
# one column per pixel ('p1', 'p2', ...) of the resized grayscale image.
if not os.path.isfile('./model/dataframe.sav'):
    RELOAD_DATA = True  # When there is not yet a dataframe, create one

if RELOAD_DATA:  # Check whether a new dataframe needs to be created
    # Extract the archive only when the image folder is not there yet
    if not os.path.isdir('./data/classificatie'):
        if not os.path.isfile('./data/classificatie.tar'):
            raise FileNotFoundError('Classificatie.tar not found')

        print('Extracting tar...')
        # NOTE(review): extractall trusts the member paths stored in the
        # archive; only run this on an archive from a trusted source.
        with tarfile.open('./data/classificatie.tar') as tar:
            tar.extractall('./data/')
        print('Extracting tar Done!')

    if not os.path.isdir('./data/classificatie'):
        raise FileNotFoundError('Extracted files not found')

    # Get grayscale values from pictures
    print('Creating dataframe')
    IMAGE_SIZE = (320, 176)  # (width, height) handed to cv2.resize
    pixel_count = IMAGE_SIZE[0] * IMAGE_SIZE[1]
    columns = ['target'] + [f'p{i}' for i in range(1, pixel_count + 1)]
    rows = []

    # Sorted listings make the traversal order (and therefore the random
    # subsample) independent of the file system's directory order.
    for folder in sorted(os.listdir('./data/classificatie/')):
        folder_path = f'./data/classificatie/{folder}'
        if not os.path.isdir(folder_path):
            continue  # stray files next to the class folders are not classes
        label = target_value(folder)

        for file in sorted(os.listdir(folder_path)):
            # Keep every image of class 1 and roughly one in five of the others.
            # randint comes from Python's `random` module, so that module must
            # be seeded for the subsample to be reproducible.
            if label == 1 or randint(1, 5) == 1:
                img = cv2.imread(f'{folder_path}/{file}', 0)
                if img is None:
                    # cv2.imread returns None instead of raising on bad files
                    print(f'Skipping unreadable image: {folder_path}/{file}')
                    continue
                img = cv2.resize(img, IMAGE_SIZE)

                # One uint8 row per image: label first, then the pixels
                row = np.empty(pixel_count + 1, dtype=np.uint8)
                row[0] = label
                row[1:] = img.ravel()
                rows.append(row)

    # Build the frame once from a single uint8 matrix. This replaces the
    # chunked DataFrame.append calls (removed in pandas 2.0) and keeps pixel
    # intensities in 0..255.
    # NOTE(review): the previously saved output contained negative pixel
    # values, presumably from dtype inference; rebuild the cache to drop them.
    data = np.array(rows, dtype=np.uint8).reshape(-1, len(columns))
    musti = pd.DataFrame(data, columns=columns)

    print('Saving DataFrame')
    with open('./model/dataframe.sav', 'wb') as cache_file:
        pickle.dump(musti, cache_file)
else:
    print('Loading DataFrame')
    # NOTE(review): pickle.load can execute code stored in the file; only load
    # a cache that was written by this notebook.
    with open('./model/dataframe.sav', 'rb') as cache_file:
        musti = pickle.load(cache_file)

print('DataFrame Loaded')
print(musti)

Loading DataFrame
DataFrame Loaded
     target   p1   p2   p3   p4   p5   p6   p7   p8   p9  ...  p56311  p56312  \
0         2  107  116  116  110  110  106  110  111  109  ...      99      53   
1         2  109  121  122  113  113  112  115  111  109  ...     -47    -125   
2         2  107  119  120  111  113  109  110  112  107  ...     -50     126   
3         2   90   96   97   89   90   90   92   98   89  ...     109      80   
4         2   95   94   96   94   93   92   92   96   90  ...     111      84   
..      ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...     ...     ...   
812       0   46   45   42   49   41   40   44   44   45  ...      63      68   
813       0   99  110  107  102  105  100  102  107  101  ...      59      65   
814       0  111  120  120  114  111  112  115  114  106  ...      44      50   
815       0   95  105  100   96   94   94   95  102   96  ...      76      82   
816       0   70   75   75   76   76   78   80   82   75  ...      81     

Create training and test data

In [7]:
# Separate the class label from the pixel features
y = musti['target'].astype(np.uint8)  # uint8 labels take less RAM
X = musti.drop(columns='target')

# Hold out 20% of the samples as the test set (fixed random_state)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(f'Training set Shape: {X_train.shape}')

Training set Shape: (653, 56320)


Create and fit the chosen model

In [8]:
# Random forest of 200 trees with at least 10 samples per leaf,
# fitted on the training split (fit returns the estimator itself).
model = RandomForestClassifier(
    n_estimators=200,
    min_samples_leaf=10,
    random_state=42,
).fit(X_train, y_train)

Evaluate the chosen model and print the results

In [None]:
# cross_val_score clones the estimator and refits it on every fold, so running
# it on the test split never evaluates the forest fitted above. Cross-validate
# on the training data instead, then score the fitted model on the test set.
a = cross_val_score(model, X_train, y_train, cv=3)
print(f'\t{a}')
print(f'\tmean: {np.mean(a)}')

# Accuracy of the already fitted model on the held-out test split
test_accuracy = model.score(X_test, y_test)
print(f'\ttest accuracy: {test_accuracy}')