In [0]:
import torch
from torchvision import transforms
import torch.nn.functional as F
import torch.nn as nn

from PIL import Image
import imageio
import os

from google.colab import drive

In [2]:
# Mount Google Drive so the notebook can read the line images and the model
# weights. `drive` is already imported in the imports cell at the top of the
# notebook, so it is not imported a second time here.
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
class YOLO(nn.Module):
    """Neural network architecture inspired by YOLO V1.

    Three conv + ReLU + max-pool stages feed two fully connected layers.
    The network takes single-channel images and returns a tensor of shape
    (batch, col_size, 3) whose values are squashed into the range 0..1 by a
    sigmoid.
    """

    # Flattened size of the last pooling output: 64 channels x 5 x 104, which
    # is what a 1 x 120 x 1700 input is reduced to by the three stages below.
    # Kept on the class (not the instance) so that previously pickled models
    # can still run this forward pass.
    _FLAT_FEATURES = 64 * 5 * 104

    def __init__(self, col_size):
        """Build the layers; `col_size` is the number of output columns."""
        super().__init__()
        self.col_size = col_size

        # feature extractor
        self.conv1 = nn.Conv2d(1, 16, 7, stride=2)
        self.mp1 = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(16, 32, (3, 3), stride=1)
        self.mp2 = nn.MaxPool2d(2, 2)
        self.conv3 = nn.Conv2d(32, 64, (3, 3), stride=1)
        self.mp3 = nn.MaxPool2d(2, 2)

        # regression head: three values per column
        self.fc1 = nn.Linear(self._FLAT_FEATURES, 2048)
        self.fc2 = nn.Linear(2048, col_size * 3)
        self.dropout = nn.Dropout()

    def forward(self, x):
        """Run the network on a batch `x` and return a (batch, col_size, 3) tensor."""
        stages = (
            (self.conv1, self.mp1),
            (self.conv2, self.mp2),
            (self.conv3, self.mp3),
        )
        # Conv + ReLU + max pooling, once per stage
        for conv, pool in stages:
            x = pool(F.relu(conv(x)))

        flat = x.view(-1, self._FLAT_FEATURES)
        hidden = F.relu(self.dropout(self.fc1(flat)))
        cells = self.fc2(hidden).view(-1, self.col_size, 3)
        return torch.sigmoid(cells)

In [0]:
def calc_iou(box1, box2):
    """Return the intersection over union of two boxes.

    Each box is indexable as [x1, y1, x2, y2]. Extents are measured with a
    "+ 1" on every side length, i.e. a box from x1 to x2 counts as
    (x2 - x1 + 1) wide.
    """

    def _area(box):
        # area of a single box, same "+ 1" convention as the intersection
        return (box[2] - box[0] + 1) * (box[3] - box[1] + 1)

    # corners of the intersection rectangle
    left = max(box1[0], box2[0])
    top = max(box1[1], box2[1])
    right = min(box1[2], box2[2])
    bottom = min(box1[3], box2[3])

    # clamp at 0 so that disjoint boxes give an empty intersection
    overlap_w = max(0, right - left + 1)
    overlap_h = max(0, bottom - top + 1)
    intersection = overlap_w * overlap_h

    union = _area(box1) + _area(box2) - intersection
    return intersection / union

In [0]:
def calc_x_y(col, tensor, img_width=1700, num_cols=30, box_height=120):
    """Convert one column's network output into [1, x, y, width, height].

    Args:
        col: index of the column the prediction belongs to.
        tensor: indexable with at least three entries; `tensor[1]` is the
            x offset inside the column and `tensor[2]` the width, both as
            fractions (of the column width and of the image width).
            `tensor[0]` is not read.
        img_width: width of the image the coordinates refer to.
        num_cols: number of columns the image width is divided into.
        box_height: height written into the result.

    Returns:
        A 1-D float tensor `[1, x, 0, width, box_height]` in pixel units; the
        leading 1 and y = 0 are constants.
    """
    cell_width = img_width / num_cols
    # offset inside the cell plus the left edge of the cell
    x = tensor[1] * cell_width + col * cell_width
    width = tensor[2] * img_width
    return torch.FloatTensor([1, x, 0, width, box_height])

In [0]:
def calc_mean_iou(output, target):
    """Calculate the mean IoU between predictions and targets for a batch.

    For every sample the IoU is computed for each column whose target
    confidence (`target[i][j][0]`) equals 1, averaged per sample, and the
    per-sample means are then averaged over the batch.

    Args:
        output: predictions, indexed as [sample, column, value].
        target: ground truth with the same layout as `output`.

    Returns:
        A 0-d tensor with the mean IoU. Samples without any target box are
        skipped (their mean is undefined); if no sample in the batch has a
        target box, 0.0 is returned instead of raising.
    """
    per_sample = []
    for i in range(output.size(0)):
        ious = []
        for j in range(output.size(1)):
            if target[i][j][0] == 1:
                box_target = calc_box(calc_x_y(j, target[i, j]))
                box_output = calc_box(calc_x_y(j, output[i, j]))
                # as_tensor keeps torch.stack working even if calc_iou hands
                # back a plain Python number
                ious.append(torch.as_tensor(calc_iou(box_target, box_output),
                                            dtype=torch.float32))
        # torch.stack raises on an empty list, so only average real values
        if ious:
            per_sample.append(torch.mean(torch.stack(ious)))

    if not per_sample:
        return torch.tensor(0.0)
    return torch.mean(torch.stack(per_sample))

In [0]:
def calc_box(tensor):
    """Turn [_, x, y, width, height] into corner coordinates [x1, y1, x2, y2].

    The first entry of `tensor` is not read. The right edge is capped at 1700
    and the bottom edge at 200.
    """
    left, top = tensor[1], tensor[2]
    right = min(1700, left + tensor[3])
    bottom = min(200, top + tensor[4])
    return [left, top, right, bottom]

In [0]:
def non_maximum_suppression(tensor, percent, min_confidence=0.2):
    """Keep the most probable boxes and drop the ones that overlap them.

    Works in place on `tensor`, which holds one row per column with the
    confidence in `tensor[j, 0]`:

    1. Confidences below `min_confidence` are set to 0.
    2. Repeatedly, the not-yet-chosen row with the highest confidence is
       chosen and its confidence set to 1.
    3. Every other row that is still active and whose box overlaps the chosen
       box with an IoU above `percent` gets its confidence set to 0.

    Args:
        tensor: 2-D tensor indexed as [column, value]; modified in place.
        percent: IoU threshold above which an overlapping box is removed.
        min_confidence: confidences below this value are discarded up front.

    Returns:
        None; the result is written into `tensor`.
    """
    num_cols = tensor.size(0)

    for j in range(num_cols):
        if tensor[j, 0].item() < min_confidence:
            tensor[j, 0] = 0

    found = []
    while True:
        # pick the strongest remaining candidate
        maximum = 0
        index = 0
        for j in range(num_cols):
            score = tensor[j, 0].item()
            if score > maximum and j not in found:
                maximum = score
                index = j

        if maximum == 0:
            break

        found.append(index)
        tensor[index, 0] = 1

        # the chosen box does not change while the others are compared
        box_max = calc_box(calc_x_y(index, tensor[index]))

        # compare against every column (dimension 0), not against the three
        # values stored per column (dimension 1)
        for j in range(num_cols):
            if j != index and tensor[j, 0] >= min_confidence:
                box_other = calc_box(calc_x_y(j, tensor[j]))
                if calc_iou(box_max, box_other) > percent:
                    tensor[j, 0] = 0

In [108]:
# input line images, trained weights and output directory on Google Drive
imgs_path = "drive/My Drive/pred_words/lines/"
imgs_paths = os.listdir(imgs_path)
weight_path = "drive/My Drive/weights_lines_to_words.pt"
predict_path = "drive/My Drive/pred_words/preds"

# every image is resized to 120 x 1700 (height x width) before it is fed to
# the network
transform = transforms.Compose([
    transforms.Resize((120, 1700)),
    transforms.ToTensor(),
])

# set a boolean flag that indicates whether a cuda capable GPU is available
is_gpu = torch.cuda.is_available()
print("GPU is available:", is_gpu)
print("If you are receiving False, try setting your runtime to GPU")

# set the device to cuda if a GPU is available
device = torch.device("cuda" if is_gpu else "cpu")

# map_location makes the CPU fallback above actually work for weights that
# were saved from a GPU.
# NOTE(review): the file holds a fully pickled model, so torch.load executes
# pickle code -- only load weight files from a trusted source. Recent PyTorch
# releases may additionally require `weights_only=False` for such files.
model = torch.load(weight_path, map_location=device)

# inference only: switch Dropout to evaluation behaviour
model.eval()

print(model)

GPU is available: True
If you are receiving False, try setting your runtime to GPU
YOLO(
  (conv1): Conv2d(1, 16, kernel_size=(7, 7), stride=(2, 2))
  (mp1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1))
  (mp2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (mp3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=33280, out_features=2048, bias=True)
  (fc2): Linear(in_features=2048, out_features=90, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)


In [0]:
def predict_words(model, imgs_path, predict_path, verbose=True):
    """Cut every line image in `imgs_path` into word crops saved in `predict_path`.

    For each file in `imgs_path` the image is passed through `model`, the
    predictions are filtered with `non_maximum_suppression`, and every
    remaining box with a confidence above 0.3 is cropped (full image height)
    from the original image and written as
    `pic<image index>line<crop index>.jpg`.

    Uses the notebook-level `transform` to prepare the network input.
    NOTE(review): the model's first layer takes one channel, so the images
    are presumably grayscale -- confirm for new data.

    Args:
        model: the trained network; it is run in evaluation mode and its
            previous training/eval mode is restored afterwards.
        imgs_path: directory containing the line images.
        predict_path: directory the crops are written to (created if missing).
        verbose: print intermediate values for every image when True.
    """
    os.makedirs(predict_path, exist_ok=True)

    # run on whatever device the model lives on
    model_device = next(model.parameters()).device

    was_training = model.training
    model.eval()  # Dropout must be inactive while predicting
    try:
        img_count = 0
        # list the directory that was passed in; sorted for stable numbering
        for name in sorted(os.listdir(imgs_path)):
            src = os.path.join(imgs_path, name)
            if not os.path.isfile(src):
                continue

            with Image.open(src) as pil_img:
                img_tensor = transform(pil_img)
            with torch.no_grad():
                output = model(img_tensor.unsqueeze(0).to(model_device))[0]
            if verbose:
                print(output)

            # find right boxes
            non_maximum_suppression(output, 0.35)

            img = imageio.imread(src)
            y2 = img.shape[0]
            # boxes are expressed on the 1700 px wide network input; scale
            # them back to the width of the original image
            xscale = img.shape[1] / 1700
            if verbose:
                print(img.shape)
                print(xscale)

            count = 0
            for i in range(output.size(0)):
                if output[i][0] > 0.3:
                    box = calc_box(calc_x_y(i, output[i]))
                    x1 = round(int(box[0]) * xscale)
                    x2 = round(int(box[2]) * xscale)
                    if verbose:
                        print(box)
                        print(x1, x2)
                    if x2 <= x1:
                        # an empty crop cannot be written as an image
                        continue
                    out_name = "pic" + str(img_count) + "line" + str(count) + '.jpg'
                    imageio.imwrite(os.path.join(predict_path, out_name),
                                    img[0:y2, x1:x2])
                    count += 1
            img_count += 1
    finally:
        model.train(was_training)

In [114]:
# Run the prediction over the line images and write the crops to predict_path.
predict_words(model, imgs_path, predict_path)

tensor([[9.9639e-01, 3.9334e-03, 3.2225e-02],
        [9.9555e-01, 9.0195e-01, 1.6002e-01],
        [1.8491e-03, 4.1281e-01, 1.2407e-01],
        [5.6295e-04, 3.9433e-01, 7.4753e-02],
        [4.5746e-03, 7.5465e-01, 4.3097e-02],
        [1.1855e-02, 5.8009e-01, 1.8902e-02],
        [2.5550e-02, 8.9468e-01, 3.1569e-02],
        [9.9144e-01, 6.1615e-02, 4.3593e-02],
        [1.3663e-03, 7.0785e-01, 2.4480e-02],
        [9.9822e-01, 2.9038e-01, 1.1361e-01],
        [3.2958e-04, 4.3468e-01, 7.0716e-02],
        [9.4243e-03, 6.1222e-01, 3.5910e-02],
        [8.5489e-03, 6.0740e-01, 4.3146e-02],
        [1.1715e-03, 8.2580e-01, 3.7833e-02],
        [9.6492e-01, 8.2186e-02, 5.5563e-02],
        [6.2499e-03, 6.5470e-01, 4.2356e-02],
        [3.4404e-02, 2.5040e-01, 4.7891e-02],
        [1.2384e-02, 5.0401e-01, 4.6076e-02],
        [9.9792e-01, 5.4326e-01, 2.2369e-01],
        [9.1372e-03, 4.8923e-01, 2.0709e-01],
        [2.0750e-05, 4.8442e-01, 9.7710e-02],
        [1.2070e-03, 5.8893e-01, 9