In [1]:
!pip install transformers
!pip install datasets


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 7.0 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 13.9 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 61.6 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 75.7 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstal

In [2]:
from zipfile import ZipFile
from io import BytesIO
import requests
r = requests.get("http://cimalab.unal.edu.co/applications/thyroid/thyroid.zip")
print("Downloading...")
z = ZipFile(BytesIO(r.content))    
z.extractall("/content/thyroid")



Downloading...


In [3]:
import os
import xml.etree.ElementTree as ET
from natsort import natsorted
import pandas as pd
import numpy as np
import cv2
import matplotlib.pyplot as plt


# XML and Jpeg     
def to_dataframe(path):
    dirs=natsorted(os.listdir(path))
    xml_list=[]
    img_list=[]
    for i in range(len(dirs)):
        if '.xml' in dirs[i]:
            xml_list.append(dirs[i])
        if not '.xml'  in dirs[i]:
            img_list.append(dirs[i])
    xml_list=natsorted(xml_list)
    img_list=natsorted(img_list)
    tirads=[]
    for j in range(len(xml_list)):
        tree = ET.parse(path+'/'+xml_list[j])
        a=tree.findall("./tirads")
        if a[-1].text!=None:
            case=[xml_list[j],a[-1].text]
            tirads.append(case)
    data=[]
    for k in range(len(tirads)):
        xml=tirads[k][0][:-4]
        for z in range(len(img_list)):
            if xml+'_1.jpg'==img_list[z] or xml+'_2.jpg'==img_list[z] or xml+'_3.jpg'==img_list[z]:
                m=[img_list[z],tirads[k][1]]
                data.append(m)

    df = pd.DataFrame(data,columns =['Jpeg_Name', 'Tirads'])
    return df

#Cropp Function
def croping(img,x, y, w, h):
    if abs(w)<abs(h):
        img2=np.zeros([h,h])
        img2[:,h-w:h]=img[y:y+h, x:x+w]
    if abs(h)<abs(w):  
        img2=np.zeros([w,w])
        img2[w-h:w,:]=img[y:y+h, x:x+w]
    else:
        return img
    return img2

def convert_one_channel(img):
    #if some images have 3 channels , although they are grayscale image
    if len(img.shape)>2:
        img=img[:,:,0]
        return img
    else:
        return img

#Remove Fill area from Image and Resizeing
def crop_resize(path,resize_shape):
    img=plt.imread(path)
    img=convert_one_channel(np.asarray(img))    
    kernel =( np.ones((5,5), dtype=np.float32))
    ret,thresh = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY)
    thresh = thresh.astype(np.uint8)
    a1,b1=thresh.shape
    thresh=cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel,iterations=3 )
    thresh=cv2.erode(thresh,kernel,iterations =5)
    contours, hierarchy = cv2.findContours(thresh.copy(), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    c_area=np.zeros([len(contours)])
    for i in range(len(contours)):
        c_area[i]= cv2.contourArea(contours[i]) 
    cnts=contours[np.argmax(c_area)]    
    x, y, w, h = cv2.boundingRect(cnts)
    roi = croping(img, x, y, w, h)
    roi=cv2.resize(roi,(resize_shape),interpolation=cv2.INTER_LANCZOS4)


    return roi


# TO Data Matrix
def to_imgmatrix(resize_shape,path,df):
    path=path+'/'  
    images=[]
    for i in range (0,len(df["Jpeg_Name"])):
        img=crop_resize(path+df["Jpeg_Name"][i],resize_shape)
            #Convert RGB to use pretrained model 
        img=np.uint8(255*(img/(np.max(img))))
        img = cv2.cvtColor(img,cv2.COLOR_GRAY2RGB)
        images.append(img)
    return images

def prepare_data(path,resize_shape):
    df=to_dataframe(path)
    data=to_imgmatrix(resize_shape,path,df) 
    return df,data


def to_categoricalmatrix(df):
    #There are little categories, so i handled manually
    Y=np.zeros([len(df["Tirads"])])
    for i in range(len(df["Tirads"])):
        if df["Tirads"][i]=="2":
          Y[i]=0
        if df["Tirads"][i]=="3":
          Y[i]=0
        if df["Tirads"][i]=="4a":
          Y[i]=1
        if df["Tirads"][i]=="4b":
          Y[i]=1
        if df["Tirads"][i]=="4c":
          Y[i]=1
        if df["Tirads"][i]=="5":
          Y[i]=1
    return Y.tolist()



df,data=prepare_data("/content/thyroid",(256,256))


In [4]:
df.head()

Unnamed: 0,Jpeg_Name,Tirads
0,2_1.jpg,2
1,3_1.jpg,4a
2,4_1.jpg,4a
3,5_1.jpg,5
4,6_1.jpg,4b


In [5]:
y=to_categoricalmatrix(df)

In [6]:
import os
directory = "data"
parent_dir = "/content"
path = os.path.join(parent_dir, directory)
os.makedirs(path)

for i in range(2):
  directory=str(i)
  sub_path=os.path.join(path, directory)
  os.makedirs(sub_path)

In [7]:
for j in range(len(data)):
  label_path=str(np.int8(y[j]))
  data_path=os.path.join(path, label_path)
  cv2.imwrite(data_path+"/"+df["Jpeg_Name"][j],data[j])

In [8]:
from datasets import load_dataset 

# load a custom dataset from local/remote files or folders using the ImageFolder feature

# option 1: local/remote files (supporting the following formats: tar, gzip, zip, xz, rar, zstd)
dataset = load_dataset("imagefolder", data_dir="/content/data")

Resolving data files:   0%|          | 0/347 [00:00<?, ?it/s]

Using custom data configuration default-9e9fbc6b4a4eb2c3


Downloading and preparing dataset imagefolder/default to /root/.cache/huggingface/datasets/imagefolder/default-9e9fbc6b4a4eb2c3/0.0.0/48efdc62d40223daee675ca093d163bcb6cb0b7d7f93eb25aebf5edca72dc597...
                

Downloading data files #4:   0%|          | 0/22 [00:00<?, ?obj/s]

Downloading data files #5:   0%|          | 0/22 [00:00<?, ?obj/s]

Downloading data files #7:   0%|          | 0/22 [00:00<?, ?obj/s]

Downloading data files #1:   0%|          | 0/22 [00:00<?, ?obj/s]

Downloading data files #8:   0%|          | 0/22 [00:00<?, ?obj/s]

Downloading data files #2:   0%|          | 0/22 [00:00<?, ?obj/s]

Downloading data files #10:   0%|          | 0/22 [00:00<?, ?obj/s]

Downloading data files #6:   0%|          | 0/22 [00:00<?, ?obj/s]

Downloading data files #12:   0%|          | 0/21 [00:00<?, ?obj/s]

Downloading data files #9:   0%|          | 0/22 [00:00<?, ?obj/s]

Downloading data files #0:   0%|          | 0/22 [00:00<?, ?obj/s]

Downloading data files #13:   0%|          | 0/21 [00:00<?, ?obj/s]

Downloading data files #3:   0%|          | 0/22 [00:00<?, ?obj/s]

Downloading data files #15:   0%|          | 0/21 [00:00<?, ?obj/s]

Downloading data files #11:   0%|          | 0/21 [00:00<?, ?obj/s]

Downloading data files #14:   0%|          | 0/21 [00:00<?, ?obj/s]

Downloading data files: 0it [00:00, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset imagefolder downloaded and prepared to /root/.cache/huggingface/datasets/imagefolder/default-9e9fbc6b4a4eb2c3/0.0.0/48efdc62d40223daee675ca093d163bcb6cb0b7d7f93eb25aebf5edca72dc597. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [9]:
from PIL import Image
import torch
from torchvision import datasets, models, transforms
img_size=224
batch_size=32

data_transforms = {
    'train': transforms.Compose([
        transforms.RandomHorizontalFlip(),
        transforms.RandomAutocontrast(),
        transforms.RandomRotation(10),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        transforms.Resize(size=(img_size,img_size))    
    ]),

    'val': transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        transforms.Resize(size=(img_size,img_size))

    ]),
}



def preprocess_train(example_batch):
    """Apply train_transforms across a batch."""
    example_batch["pixel_values"] = [
        data_transforms["train"](image.convert("RGB")) for image in example_batch["image"]
    ]
    return example_batch

def preprocess_val(example_batch):
    """Apply val_transforms across a batch."""
    example_batch["pixel_values"] = [data_transforms["val"](image.convert("RGB")) for image in example_batch["image"]]
    return example_batch

splits = dataset["train"].train_test_split(test_size=0.2)
train_ds = splits['train']
val_ds = splits['test']

train_ds.set_transform(preprocess_train)
val_ds.set_transform(preprocess_val)



In [10]:
from transformers import ConvNextFeatureExtractor, ConvNextForImageClassification, TrainingArguments, Trainer
import torch
from datasets import load_metric


model_name_or_path = 'facebook/convnext-tiny-224'
metric = load_metric("accuracy")
labels = train_ds.features['label'].names




feature_extractor = ConvNextFeatureExtractor.from_pretrained(model_name_or_path)
model = ConvNextForImageClassification.from_pretrained(model_name_or_path,    num_labels=len(labels),ignore_mismatched_sizes=True,
    id2label={str(i): c for i, c in enumerate(labels)},
    label2id={c: str(i) for i, c in enumerate(labels)})

args = TrainingArguments(
    "Cvt-finetuned-thyroid",
    remove_unused_columns=False,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=150,
    warmup_ratio=0.1,
    logging_steps=25,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    save_total_limit=50
)

def compute_metrics(eval_pred):
    """Computes accuracy on a batch of predictions"""
    predictions = np.argmax(eval_pred.predictions, axis=1)
    return metric.compute(predictions=predictions, references=eval_pred.label_ids)

def collate_fn(examples):
    pixel_values = torch.stack([example["pixel_values"] for example in examples])
    labels = torch.tensor([example["label"] for example in examples])
    return {"pixel_values": pixel_values, "labels": labels}



trainer = Trainer(
    model,
    args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
    data_collator=collate_fn
)

Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/266 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/68.0k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/109M [00:00<?, ?B/s]

Some weights of ConvNextForImageClassification were not initialized from the model checkpoint at facebook/convnext-tiny-224 and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([2]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
train_results = trainer.train()
# rest is optional but nice to have
trainer.save_model()
trainer.log_metrics("train", train_results.metrics)
trainer.save_metrics("train", train_results.metrics)
trainer.save_state()

***** Running training *****
  Num examples = 277
  Num Epochs = 150
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 4
  Total optimization steps = 300


Epoch,Training Loss,Validation Loss,Accuracy
0,No log,0.660053,0.685714
1,No log,0.656371,0.7
2,No log,0.649863,0.742857
3,No log,0.640414,0.771429
4,No log,0.628597,0.8
5,No log,0.614682,0.814286
6,No log,0.598687,0.857143
7,No log,0.580857,0.9
8,No log,0.561867,0.914286
9,No log,0.542549,0.914286


***** Running Evaluation *****
  Num examples = 70
  Batch size = 32
Saving model checkpoint to Cvt-finetuned-thyroid/checkpoint-2
Configuration saved in Cvt-finetuned-thyroid/checkpoint-2/config.json
Model weights saved in Cvt-finetuned-thyroid/checkpoint-2/pytorch_model.bin
Feature extractor saved in Cvt-finetuned-thyroid/checkpoint-2/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 70
  Batch size = 32
Saving model checkpoint to Cvt-finetuned-thyroid/checkpoint-4
Configuration saved in Cvt-finetuned-thyroid/checkpoint-4/config.json
Model weights saved in Cvt-finetuned-thyroid/checkpoint-4/pytorch_model.bin
Feature extractor saved in Cvt-finetuned-thyroid/checkpoint-4/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 70
  Batch size = 32
Saving model checkpoint to Cvt-finetuned-thyroid/checkpoint-6
Configuration saved in Cvt-finetuned-thyroid/checkpoint-6/config.json
Model weights saved in Cvt-finetuned-thyroid/checkpoint-6/pytorch_model

***** train metrics *****
  epoch                    =      149.89
  total_flos               = 971859038GF
  train_loss               =      0.3924
  train_runtime            =  0:11:17.24
  train_samples_per_second =      61.352
  train_steps_per_second   =       0.443


In [12]:
metrics = trainer.evaluate()
# some nice to haves:
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

***** Running Evaluation *****
  Num examples = 70
  Batch size = 32


***** eval metrics *****
  epoch                   =     149.89
  eval_accuracy           =     0.9143
  eval_loss               =     0.5619
  eval_runtime            = 0:00:00.40
  eval_samples_per_second =    173.666
  eval_steps_per_second   =      7.443


In [17]:
!pip install huggingface_hub



Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [18]:
from huggingface_hub import notebook_login 
notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


In [26]:
from huggingface_hub import Repository
repo = Repository( local_dir = 'SerdarHelli', clone_from='SerdarHelli/ThyroidTumorClassification' )


/content/SerdarHelli is already a clone of https://huggingface.co/SerdarHelli/ThyroidTumorClassification. Make sure you pull the latest changes with `repo.git_pull()`.


In [24]:
repo.git_pull("/content/")

In [27]:
%cp   /content/Cvt-finetuned-thyroid/all_results.json /content/SerdarHelli/all_results.json
%cp   /content/Cvt-finetuned-thyroid/eval_results.json /content/SerdarHelli/eval_results.json
%cp   /content/Cvt-finetuned-thyroid/train_results.json /content/SerdarHelli/train_results.json 
%cp   /content/Cvt-finetuned-thyroid/trainer_state.json /content/SerdarHelli/trainer_state.json
%cp   /content/Cvt-finetuned-thyroid/training_args.bin /content/SerdarHelli/training_args.bin


In [None]:
trainer.save_model("/content/SerdarHelli")

repo.push_to_hub("ThyroidTumorClassification")