## Data cleansing

### References:
- https://huggingface.co/docs/transformers/v4.28.1/en/main_classes/pipelines#transformers.ZeroShotImageClassificationPipeline.example
- https://huggingface.co/docs/transformers/installation

### Obtaining and unpacking data

### Installing and importing stuff

In [None]:
!pip install transformers
import os
from transformers import pipeline
from statistics import mean
from tqdm.notebook import tqdm

### Testing the classifier pipeline

In [None]:
# Build the image classifier from the CLIP checkpoint.  Use the first GPU
# when CUDA is available and fall back to the CPU (device=-1) otherwise, so
# the notebook does not fail on a CPU-only runtime.
import torch

classifier = pipeline(
  model="openai/clip-vit-large-patch14",
  device=0 if torch.cuda.is_available() else -1,
)

# Smoke test: classify a sample image against three candidate labels.
classifier(
    "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png",
    candidate_labels=["animals", "humans", "landscape"],
)

In [None]:
import torch

# Print the name of the first CUDA device; print nothing when CUDA is
# unavailable.
gpu_ready = torch.cuda.is_available()
if gpu_ready:
  first_gpu_name = torch.cuda.get_device_name(0)
  print(first_gpu_name)

In [None]:
# Sanity check on a local image.  The device is chosen when the pipeline is
# built, so it is not passed per call.  The empty string acts as the
# "anything else" alternative to the 'cats' label.
classifier("cat.png", candidate_labels=["cats", ""])

In [None]:
# Show the current working directory.
%pwd

## A cat? or not a cat.

In [None]:
%cd /home/studio-lab-user/sagemaker/data
folders = sorted(os.listdir())

if ".ipynb_checkpoints" in folders: folders.remove(".ipynb_checkpoints")
#exclude automatically generated folder from the cat scan

cantFindCat = []
print("Starting cat check")
for folder in tqdm(folders):
  tqdm.write(f"Starting cat check for folder {folder}\n")
  os.chdir(folder)

  files = sorted(os.listdir())
  if ".ipynb_checkpoints" in files: files.remove(".ipynb_checkpoints")
  catStats=[]
  for file in tqdm(files):
    location = f"{folder}/{file}"
    classifierOutput = classifier(file, candidate_labels=['cats',''])
    #Putting only one item in the candidate_labels list throw an error

    #The output of candidate_labels sorts by confidence, so if the label 'cats' appear on the first one, 
    #it will have to have more than 50% confidence.

    if classifierOutput[0]['label'] == 'cats':
      catScore = classifierOutput[0]['score']
      #tqdm.write(f"Cat check passed for {location} with - {catPercentage}%")
    else:
      catScore = classifierOutput[1]['score']
      cantFindCat.append(location)
      tqdm.write(f"404 - CAT NOT FOUND @ {location} - {round(catScore * 100, 2)}% cat")

    catStats.append(catScore)  

  tqdm.write(f"\nCat check complete for folder {folder}, with an average cat score of {mean(catStats)}")
  tqdm.write("----------------------------------------\n")
  os.chdir('..')

In [None]:
%cd /home/studio-lab-user/sagemaker/data
print("RESULTS: ====================")
print("\n".join(cantFindCat))

from PIL import Image as im

for image in cantFindCat:
  #im.open(image).show()
  pass

In [None]:
%cd /content
response = input("Do you want to remove the images where we can't find the cat? (y/n)")

cantFindCat = sorted(list(set(cantFindCat)))
#removing duplicates (if any)

if response.lower=="y":
  for item in cantFindCat:
    try:
      os.remove(item)
      print(f"Removed {item}")
    except:
      print(f"Can't remove {item}")
  
  #TODO: zip + send to drive
  
else:
  print("Ok, bye!")

In [None]:
# Show the current working directory and list its contents.
%pwd
%ls

In [None]:
# Install the zip command-line tool from the conda-forge channel
# (-y skips the confirmation prompt).
!conda install -y -c conda-forge zip

In [None]:
# Archive the current directory recursively (-r) into the parent directory;
# -0 stores the files without compression.
!zip ../cat-data-cleaned.zip -r . -0