In [7]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [8]:
from fastai import *
from fastai.vision import *

In [9]:
path = Path().home()/'data/aircraft/train'

In [10]:
classes = [c.name for c in path.iterdir() if c.is_dir()]

In [12]:
removal_idxs = defaultdict(lambda: [])
removal_list = pd.read_pickle(path.parent/'removals.pkl')
for item in removal_list:
    k,v = item[0],item[1]
    removal_idxs[k] = v

## 1. Remove Classes to be excluded:

List all classes except for those that'll be excluded, and print the number of images per class before and after cleaning:

In [14]:
excludes = ['typhoon','f4','tornado', 'a10', 'j20', 'fighters', 'cargo', 'models']
for i,c in enumerate(classes):
    if c not in excludes:
        print(f"{c:<10}{len((path/c).ls()):<6}{len((path/c).ls())-len(removal_idxs[c]):}")

f16       618   558
su24      476   369
mig29     546   476
su22      503   346
su25      507   402
mig21     623   525
fa18ef    448   165
f15e      564   418
f15c      542   449
fa18c     561   439
f35       532   494
su30      510   422
su27      569   396
mig25     493   325
rafale    329   325
f14       574   436
f22       514   389
su57      430   310
mig31     542   400
su34      508   421
jas39     459   382


In [7]:
excludes = ['typhoon','f4','tornado', 'a10', 'j20', 'fighters', 'cargo']
for i,c in enumerate(classes):
    if c not in excludes:
        print(f"{c:<10}{len((path/c).ls()):<6}{len((path/c).ls())-len(removal_idxs[c]):}")

rafale    331   327
f15c      494   401
mig21     493   395
f16       362   302
su27      492   319
f15e      490   344
jas39     494   417
f35       369   331
su24      477   370
su34      476   389
fa18ef    482   199
su25      490   385
mig29     483   413
fa18c     492   370
mig31     482   340
f14       491   353
f22       489   364
su57      364   244
su30      377   289
mig25     367   199
su22      377   220


Remove all classes in `excludes`:

In [15]:
for c in excludes:
    os.system(f'rm -rf {path/c}') # Warning: this destroys without question

In [16]:
path.ls()

[PosixPath('/home/jupyter/data/aircraft/train/rafale'),
 PosixPath('/home/jupyter/data/aircraft/train/f15c'),
 PosixPath('/home/jupyter/data/aircraft/train/mig21'),
 PosixPath('/home/jupyter/data/aircraft/train/f16'),
 PosixPath('/home/jupyter/data/aircraft/train/su27'),
 PosixPath('/home/jupyter/data/aircraft/train/f15e'),
 PosixPath('/home/jupyter/data/aircraft/train/jas39'),
 PosixPath('/home/jupyter/data/aircraft/train/f35'),
 PosixPath('/home/jupyter/data/aircraft/train/su24'),
 PosixPath('/home/jupyter/data/aircraft/train/su34'),
 PosixPath('/home/jupyter/data/aircraft/train/fa18ef'),
 PosixPath('/home/jupyter/data/aircraft/train/su25'),
 PosixPath('/home/jupyter/data/aircraft/train/mig29'),
 PosixPath('/home/jupyter/data/aircraft/train/fa18c'),
 PosixPath('/home/jupyter/data/aircraft/train/mig31'),
 PosixPath('/home/jupyter/data/aircraft/train/f14'),
 PosixPath('/home/jupyter/data/aircraft/train/f22'),
 PosixPath('/home/jupyter/data/aircraft/train/su57'),
 PosixPath('/home/jupyt

Update the list of classes:

In [17]:
for e in excludes: classes.remove(e)

## 2. Clean Data:

Use the index labels to get the filenames:

### 2.1. Test Run:

In [19]:
c = 'rafale'
removals_list = removal_idxs[c]

In [20]:
removals_list

['185', '195', '332', '350']

In [23]:
data_list = [l.name for l in (path/c).ls()]

In [25]:
data_list[:10]

['48. 243120964-e1539792550961.jpg',
 '234. rafale-jet-fighter-plane-600x338.jpg',
 '252. rafale_india_reuters_28.1.18.gif',
 '241. 1486268359-rafale-aircraft517.jpg',
 '327. dassault-rafale-weapons-load.jpg',
 '280. rafale_m_asmp-a-nuclear_missile_french_navy_marine_nationale.jpg',
 '321. rafale-egypte-20150720.jpg',
 '201. egypt-rafale.jpg',
 '352. rafale.jpg',
 '161. rafale-aug6-1.jpeg']

If I turn the list into a NumPy array can I do things faster by applying a filter to it?

In [44]:
%timeit temp = list(map(lambda x: x.split('.')[0], data_list))
print(temp[:10])

102 µs ± 711 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
['48', '234', '252', '241', '327', '280', '321', '201', '352', '161']


In [45]:
%timeit temp = list(map(lambda x: x.split('.')[0], np.array(data_list)))
print(temp[:10])

334 µs ± 3.75 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
['48', '234', '252', '241', '327', '280', '321', '201', '352', '161']


Looks like you can do it with a normal list; not sure what to think by the times. Too small to tell. Anyway it works.

In [46]:
idx_labels = list(map(lambda x: x.split('.')[0], data_list))
removal = removals_list[0]

In [62]:
np.where(np.array(idx_labels) == removal)

(array([50]),)

In [63]:
np.where(np.array(idx_labels) == removal)[0][0]

50

In [64]:
removal, idx_labels[50], data_list[50]

('185', '185', '185. rafale.jpg')

### 2.2. Automated run:

In [73]:
for e in excludes:
    try: removal_idxs.pop(e)
    except KeyError: pass

#### 2.2.1. Dry Run:

In [83]:
for c in removal_idxs.keys():
    print(f"\n{c:<10}{' -'*40}\n")
    removals_list = removal_idxs[c]
    data_list = [l.name for l in (path/c).ls()]
    
    for i in data_list:
        if i.split('.')[0] in removal_idxs[c]:
            print(i)


rafale     - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

185. rafale.jpg
195. rafale_29.3_3.jpg
350. b-52-escorted-by-rafale.jpg
332. large-150908874526691008.jpg

f15c       - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

300. su-30s.jpg
395. f-15_and_f-22.jpg
202. 1000w_q95.jpg
255. u-s-air-force-f-15c-jets-have-just-started-historic-first-deployment-to-ukraine.jpg
440. d72cc2d90220df045f3772f7e0e85e25147225cd_1_690x355.png
484. f15abms-4-stroke-15hp-compatible-with-f15c-model-boat-engine.jpg
447. 070411-f-1234b-001.jpg
292. 04.jpg
260. rtx36t1k_1.jpg
493. dragon-50071-boeing-f15c-eagle-58th_360_e632bf1736eb0ab53fb09a95cd3c04de.jpg
435. 160824-f-qj658-089.jpg
499. f15abms-4-stroke-15hp-compatible-with-f15c-model-boat-engine.jpg
252. us-military-planes-that-could-be-used-for-geoengineering-a-f-15c-eagle.png
483. 2655.jpg
317. 53d-fs-5.jpg
400. z23.jpg
462. 89422_0.jpg
77. 16131567332_4d6aa1937a_z.jpg
104. f-15s-f-16s

#### 2.2.2. Real Run:

In [84]:
print(f"Removals:") # was supposed to have this outside the loop
for c in removal_idxs.keys():
#     print(f"\n{c:<10}{' -'*40}\n")
    removals_list = removal_idxs[c]
    data_list = [l.name for l in (path/c).ls()]
    
    removal_count = 0
    
    for i in data_list:
        if i.split('.')[0] in removal_idxs[c]:
            os.remove(path/c/i)
            removal_count += 1
            
    print(f"{c:<8}{removal_count}")

Removals:
rafale  4
Removals:
f15c    93
Removals:
mig21   98
Removals:
f16     59
Removals:
su27    172
Removals:
f15e    145
Removals:
jas39   75
Removals:
f35     38
Removals:
su24    107
Removals:
su34    86
Removals:
fa18ef  280
Removals:
su25    104
Removals:
mig29   70
Removals:
fa18c   121
Removals:
mig31   139
Removals:
f14     137
Removals:
f22     124
Removals:
su57    119
Removals:
su30    88
Removals:
mig25   167
Removals:
su22    157


Quick check to make sure the numbers removed matches what was supposed to be removed:

In [88]:
for i in removal_idxs.items():
    print(f"{i[0]:<8} {len(removal_idxs[i[0]])}")

rafale   4
f15c     93
mig21    98
f16      60
su27     173
f15e     146
jas39    77
f35      38
su24     107
su34     87
fa18ef   283
su25     105
mig29    70
fa18c    122
mig31    142
f14      138
f22      125
su57     120
su30     88
mig25    168
su22     157


Odd, a couple are missing here and there. Whatever. Looks like some counting mistake.