# PyTorch CNN for image classification

### The PyTorch nightly (preview) build was used in this project because the regular release lacked Apple Silicon support.
download link: https://pytorch.org/get-started/locally/
- MPS acceleration is available on macOS 12.3+
- pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu

The project needs training data in order to run. Place it in the `data/train_set/*` folder; the data can be downloaded from [ING Challenge Rocket](https://challengerocket.com/hacking/resources#go-pagecontent)

In [None]:
import os

import pandas as pd
from PIL import Image
import glob
import pytesseract
import pickle

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

Check that the MPS (Apple Silicon) backend is available and built into this PyTorch install

In [None]:
# Print two flags: whether MPS can be used right now, and whether this
# PyTorch build was compiled with MPS support.
mps_backend = torch.backends.mps
print(mps_backend.is_available())
print(mps_backend.is_built())

## Load training files

In [None]:
# Every sub-directory of the training folder holds the images of one class;
# the directory name is used as the (string) label of its '*.tiff' files.
main_folder = os.path.join('..', 'data', 'train_set')

image_paths, labels = [], []

for folder_name in os.listdir(main_folder):
    folder_path = os.path.join(main_folder, folder_name)

    # Skip stray files that sit next to the class directories.
    if not os.path.isdir(folder_path):
        continue

    image_files = glob.glob(os.path.join(folder_path, '*.tiff'))
    image_paths += image_files
    labels += [folder_name] * len(image_files)

# One row per image: where it lives and which class folder it came from.
train_source_df = pd.DataFrame({'image_path': image_paths, 'label': labels})

# Alias (not a copy) used by the following cells.
source_df = train_source_df

## Setting labels

In [None]:
# Folder names grouped by class: the position of a group is its class id, and
# names listed in the same group share that id.
_CLASS_FOLDER_NAMES = (
    ("advertisement",),
    ("budget",),
    ("email",),
    ("file_folder", "form_folder"),
    ("form",),
    ("handwritten",),
    ("invoice",),
    ("letter",),
    ("memo",),
    ("news_article", "news_report"),
    ("pit37_v1",),
    ("pozwolenie_uzytkowanie_obiektu_budowlanego",),
    ("presentation",),
    ("questionnaire",),
    ("resume",),
    ("scientific_publication",),
    ("scientific_report", "scientific_raport"),
    ("specification",),
    ("umowa_na_odleglosc_odstapienie",),
    ("umowa_o_dzielo",),
    ("umowa_sprzedazy_samochodu",),
)

# Maps a folder name to its integer class id (0..20). Insertion order follows
# the table above, so the first name of each group comes first for its id.
labels_dict = {
    folder_name: class_id
    for class_id, folder_names in enumerate(_CLASS_FOLDER_NAMES)
    for folder_name in folder_names
}

In [None]:
# Replace each folder-name label with its integer class id; a name missing
# from labels_dict raises KeyError.
source_df['label'] = source_df['label'].apply(labels_dict.__getitem__)
source_df.head()

## Define custom dataset

In [None]:
class CustomDataset(Dataset):
    """Dataset that reads grayscale images listed in a DataFrame.

    The frame must provide an ``image_path`` column (file to open) and a
    ``label`` column (value returned unchanged alongside the image).
    """

    def __init__(self, dataframe, transform_=None):
        """Store the frame and an optional callable applied to each image."""
        self.data = dataframe
        self.transform = transform_

    def __len__(self):
        """Return the number of rows (samples) in the frame."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(image, label)`` for the row at position ``index``.

        The image is converted to single-channel mode 'L'; ``transform`` is
        applied to it when one was given.
        """
        # Fetch the row once instead of once per column.
        row = self.data.iloc[index]

        # Close the source file as soon as the converted copy exists, so open
        # handles do not pile up while iterating over the dataset.
        with Image.open(row['image_path']) as source_image:
            image = source_image.convert('L')

        if self.transform is not None:
            image = self.transform(image)

        return image, row['label']

## Define two-layered CNN

In [None]:
class MyCNN(nn.Module):
    """Two convolutional blocks followed by a single linear classifier.

    The linear layer expects 119072 (= 32 * 61 * 61) features, which is what
    the two blocks produce for a one-channel 256x256 input.
    """

    num_classes = 21

    def __init__(self):
        super().__init__()
        self.layer1 = self._conv_block(1, 16)
        self.layer2 = self._conv_block(16, 32)
        self.fc = nn.Linear(119072, self.num_classes)

    @staticmethod
    def _conv_block(in_channels, out_channels):
        """Build Conv(5x5) -> BatchNorm -> ReLU -> MaxPool(2) as one module."""
        return nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=5),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
        )

    def forward(self, x):
        """Return one row of ``num_classes`` raw scores per input sample."""
        features = self.layer2(self.layer1(x))
        # Collapse everything except the batch dimension for the linear layer.
        flat_features = features.reshape(features.shape[0], -1)
        return self.fc(flat_features)

In [None]:
# Training-time preprocessing; the steps run in the order listed.
# First the steps that work on the PIL image: fixed resize, then random
# brightness/contrast changes, horizontal flip, blur, shift and rotation.
pil_steps = [
    transforms.Resize(size=(256, 256)),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.GaussianBlur(kernel_size=3),
    transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
    transforms.RandomRotation(degrees=30),
]

# Then convert to a tensor and normalise the single channel with
# mean 0.5 and std 0.5.
tensor_steps = [
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5,), std=(0.5,)),
]

transform = transforms.Compose(pil_steps + tensor_steps)

In [None]:
# Training hyperparameters used by the data loader, optimizer and epoch loop.
learning_rate = 1e-3  # step size handed to the optimizer
num_epochs = 10       # full passes over the training data
batch_size = 32       # samples per optimisation step

In [None]:
# Wrap the labelled image table so every sample goes through `transform`.
dataset = CustomDataset(dataframe=source_df, transform_=transform)

In [None]:
# Serve the dataset in shuffled mini-batches of `batch_size` samples.
dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)

In [None]:
# Network to train, the loss it is trained with, and the optimizer that
# updates all of its parameters.
model = MyCNN()

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

## Training the model

In [None]:
# Train for `num_epochs` passes over `dataloader`, printing the mean loss of
# every 10 consecutive batches.
for epoch in range(num_epochs):
    # Switch to training mode explicitly: if this cell is re-run after the
    # evaluation cell called model.eval(), BatchNorm would otherwise keep
    # using (and stop updating) its running statistics.
    model.train()

    running_loss = 0.0
    for i, (images, labels) in enumerate(dataloader):
        # Gradients accumulate by default, so clear them before each step.
        optimizer.zero_grad()

        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        if (i + 1) % 10 == 0:
            print(f'Epoch [{epoch + 1}/{num_epochs}], Step [{i + 1}/{len(dataloader)}], Loss: {running_loss / 10:.4f}')
            # Start a fresh 10-batch window.
            running_loss = 0.0

## Evaluating the model

In [None]:
# Measure classification accuracy with the model in inference mode.
# NOTE: `dataloader` iterates the training images, so this is the accuracy on
# the training data, not on a held-out test set.
model.eval()

correct = 0
total = 0

# No gradients are needed for evaluation.
with torch.no_grad():
    for images, labels in dataloader:
        outputs = model(images)
        # Index of the highest score in each row is the predicted class id.
        predicted = outputs.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

# Guard against an empty loader instead of dividing by zero.
accuracy = correct / total if total else 0.0
print(f'Accuracy on the training set: {accuracy * 100:.2f}%')

## Saving trained model

In [None]:
# Create the target directory first so the save cannot fail (and lose the
# trained weights) just because '../models' does not exist yet.
models_dir = os.path.join('..', 'models')
os.makedirs(models_dir, exist_ok=True)
torch.save(model.state_dict(), os.path.join(models_dir, 'model.pth'))

## Custom model

In [None]:
# Build a second network and give it the weights of the trained `model`.
model_cnn = MyCNN()
model_state_dict = model.state_dict()
model_cnn.load_state_dict(model_state_dict)

# Inference-time preprocessing: resize, convert to a tensor and normalise the
# single channel with mean 0.5 and std 0.5 (no random augmentation).
# This rebinds the name `transform`.
transform = transforms.Compose([
    transforms.Resize(size=(256, 256)),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5,), std=(0.5,)),
])

## Load vectorizer

In [None]:
# Load the fitted vectorizer used to turn OCR text into features.
# NOTE(security): unpickling can execute arbitrary code; only load files
# from a trusted source.
path_model = os.path.join("..", "models", "vectorizer.pkl")
# Open for reading ('rb'): write mode would truncate the file and make
# pickle.load fail.
with open(path_model, 'rb') as file:
    tfidf_fitted = pickle.load(file)

## Prediction pipeline
If the highest class probability predicted by the Logistic Regression model is over 0.9, that result is returned. Otherwise, the image is passed to the CNN and the CNN's prediction is returned.

In [None]:
def run_pipeline(text_model=None):
    """Ask for an image path and return the predicted class id.

    The image text is extracted with OCR, vectorised with ``tfidf_fitted``
    and scored by a text classifier. If the highest probability exceeds 0.9,
    the index of that probability is returned. Otherwise the image is
    classified by ``model_cnn`` and its prediction is returned.

    Args:
        text_model: Optional classifier exposing ``predict_proba``. When
            omitted, the global ``model`` is used. If the chosen object has
            no ``predict_proba`` the text stage is skipped and only the CNN
            is used.

    Returns:
        int: Predicted class id.

    Raises:
        Exception: Whatever ``Image.open`` raised when the file could not be
            read; the error is printed first and then re-raised.
    """
    file_path = input("Path to file to class predict: ")

    try:
        image_ = Image.open(file_path)
    except Exception as error:
        print("Error while reading the file:", str(error))
        # Nothing can be predicted without an image; stop here instead of
        # failing later on an unbound variable.
        raise

    # Read everything needed from the file once, then close it.
    with image_:
        ocr_result = pytesseract.image_to_string(image_, lang='eng+pol')
        img = image_.convert('L')

    classifier = model if text_model is None else text_model
    predict_proba = getattr(classifier, 'predict_proba', None)

    if predict_proba is None:
        print("No text classifier with predict_proba available; using the CNN only.")
    else:
        vect_text = tfidf_fitted.transform([ocr_result])
        probabilites = predict_proba(vect_text)

        # NOTE(review): this is the position of the highest probability; it
        # equals the class id only if the classifier's classes are 0..n-1 in
        # order - confirm against the fitted model.
        probabilites_index = probabilites.argmax()
        if probabilites[0][probabilites_index] > 0.9:
            return int(probabilites_index)

    # Text stage was not confident enough (or unavailable): use the CNN.
    img = transform(img).unsqueeze(0)  # add the batch dimension

    model_cnn.eval()
    with torch.no_grad():
        outputs = model_cnn(img)

    _, predicted = torch.max(outputs, 1)
    return predicted.item()

In [None]:
# Run the pipeline and print the first folder name mapped to the predicted id.
predicted_id = run_pipeline()
idx = list(labels_dict.values()).index(predicted_id)
print(list(labels_dict.keys())[idx])