In [1]:
import torch
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.transforms import functional as F
from PIL import Image
from torchvision import transforms
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import os
from torchvision.io import read_image




In [2]:

# CSVファイルの読み込み
annotations = pd.read_csv("BananaDetectNew.csv", converters={"region_shape_attributes": eval})

annotations

Unnamed: 0,filename,file_size,file_attributes,region_count,region_id,region_shape_attributes,region_attributes
0,b1.JPG,1865803,{},1,0,"{'name': 'polygon', 'all_points_x': [1294, 114...","{""banana"":""banana""}"
1,b2.JPG,1443106,{},1,0,"{'name': 'polygon', 'all_points_x': [2607, 229...","{""banana"":""banana""}"
2,b3.JPG,1391833,{},0,0,{},{}
3,b5.JPG,1483898,{},0,0,{},{}
4,b6.JPG,1415255,{},1,0,"{'name': 'polygon', 'all_points_x': [1200, 101...","{""banana"":""banana""}"
...,...,...,...,...,...,...,...
85,n27.JPG,1519499,{},1,0,"{'name': 'polygon', 'all_points_x': [3543, 252...",{}
86,n28.JPG,1520143,{},1,0,"{'name': 'polygon', 'all_points_x': [1931, 163...",{}
87,n29.JPG,1605041,{},1,0,"{'name': 'polygon', 'all_points_x': [1950, 169...",{}
88,n30.JPG,1511000,{},1,0,"{'name': 'polygon', 'all_points_x': [2245, 162...",{}


In [3]:

# 画像ファイル名の列を取得
image_files = annotations["filename"]
region_raw_data=annotations["region_shape_attributes"]
print(region_raw_data)


0     {'name': 'polygon', 'all_points_x': [1294, 114...
1     {'name': 'polygon', 'all_points_x': [2607, 229...
2                                                    {}
3                                                    {}
4     {'name': 'polygon', 'all_points_x': [1200, 101...
                            ...                        
85    {'name': 'polygon', 'all_points_x': [3543, 252...
86    {'name': 'polygon', 'all_points_x': [1931, 163...
87    {'name': 'polygon', 'all_points_x': [1950, 169...
88    {'name': 'polygon', 'all_points_x': [2245, 162...
89    {'name': 'polygon', 'all_points_x': [2861, 252...
Name: region_shape_attributes, Length: 90, dtype: object


In [4]:
print(region_raw_data[0]['all_points_x'])
print(region_raw_data[0]['all_points_y'])

[1294, 1149, 1433, 1500, 1089, 1258, 1784, 2135, 1954, 1585]


In [5]:
# バウンディングボックス情報の取得
bounding_boxes = []
for shape_attr in region_raw_data:
    if shape_attr:  # {}空でない場合のみ処理を行う
        all_points_x = shape_attr["all_points_x"]  # キーが存在しない場合は空のリストを返す
        all_points_y = shape_attr["all_points_y"]  # キーが存在しない場合は空のリストを返す
        bounding_boxes.append((all_points_x, all_points_y))
    else:
        bounding_boxes.append(([0,0,0,0,0,0,0,0,0,0], [0,0,0,0,0,0,0,0,0,0,]))  # 欠損値の場合は空のリストを追加

print(image_files) #これを使う
print(bounding_boxes)


0      b1.JPG
1      b2.JPG
2      b3.JPG
3      b5.JPG
4      b6.JPG
       ...   
85    n27.JPG
86    n28.JPG
87    n29.JPG
88    n30.JPG
89    n31.JPG
Name: filename, Length: 90, dtype: object
[([1294, 1149, 1433, 1500, 1089, 1258, 1784, 2135, 1954, 1585], [623, 762, 1131, 2014, 2915, 3048, 2764, 1996, 1185, 907]), ([2607, 2292, 2359, 1627, 284, 550, 1597, 2516, 2933, 2619], [569, 623, 1125, 1863, 2171, 2637, 2667, 2195, 1373, 1077]), ([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), ([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), ([1200, 1017, 1052, 1283, 1780, 2448, 3080, 3086, 2193, 1431], [1413, 1537, 1833, 2223, 2554, 2637, 2507, 2051, 2057, 1638]), ([1561, 1519, 1821, 2229, 2796, 3695, 3654, 2962, 2229, 1655], [2022, 2152, 2235, 1910, 1620, 1454, 1147, 946, 1129, 1667]), ([1117, 1212, 1620, 1981, 2406, 2607, 2572, 2211, 1561, 1188], [1555, 1744, 1579, 1525, 1638, 1608, 1413, 1129, 1171, 1354]), ([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 

In [9]:
# x,yの最小値、最大値を格納したバウンディングボックスをつくる
bounding_boxes_minmax=[]
for bounding_box in bounding_boxes:
    box_x=bounding_box[0]
    box_y=bounding_box[1]

    # バウンディングボックスの形式に変換 [xmin, ymin, xmax, ymax]
    xmin = min(box_x)
    ymin = min(box_y)
    xmax = max(box_x)
    ymax = max(box_y)
    bounding_boxes_minmax.append([xmin, ymin, xmax, ymax])
print(bounding_boxes_minmax) #これを使う？

[[1089, 623, 2135, 3048], [284, 569, 2933, 2667], [0, 0, 0, 0], [0, 0, 0, 0], [1017, 1413, 3086, 2637], [1519, 946, 3695, 2235], [1117, 1129, 2607, 1744], [0, 0, 0, 0], [1649, 384, 3086, 1898], [881, 863, 2702, 2111], [0, 0, 0, 0], [792, 1657, 2589, 2431], [1070, 1188, 2921, 2401], [1082, 928, 3051, 2057], [0, 0, 0, 0], [479, 514, 1762, 1519], [1862, 1188, 2601, 1892], [1041, 715, 3725, 2554], [0, 0, 0, 0], [656, 1106, 2944, 2578], [1276, 1077, 2050, 2643], [429, 1385, 2353, 3260], [1143, 738, 2262, 3405], [0, 0, 0, 0], [907, 1022, 1851, 3260], [1023, 1117, 3624, 2247], [0, 0, 0, 0], [2140, 225, 3512, 2448], [0, 0, 0, 0], [1077, 599, 1978, 2431], [733, 1017, 3287, 1992], [1685, 1419, 2607, 1821], [1709, 1431, 2489, 2241], [1413, 733, 2383, 1833], [1519, 603, 3482, 2530], [0, 0, 0, 0], [1638, 745, 3169, 2341], [1348, 1117, 2968, 2252], [1295, 1469, 2332, 2057], [625, 1863, 2339, 2754], [956, 1518, 1724, 3574], [865, 1137, 2038, 4016], [0, 0, 0, 0], [0, 0, 0, 0], [934, 378, 3920, 1561], 

In [10]:
class BananaDataset(Dataset):
    def __init__(self, image_files, bounding_boxes_minmax, transform=None):
        self.image_files = image_files
        self.bounding_boxes_minmax = bounding_boxes_minmax
        self.transform = transform
    
    def __len__(self):
        return len(self.image_files)
    
    def __getitem__(self, idx):
        image = Image.open(self.image_files[idx]).convert("RGB")
        boxes = self.bounding_boxes_minmax[idx]  # bounding_boxesはすでに各画像に対するバウンディングボックスのリスト
       
        
        if self.transform:
            image = self.transform(image)
        
        return image, boxes
    
    


# バナナ画像が格納されているディレクトリ
data_dir = "bananas_images_train"
# 画像ファイル名をフルパスに変換するヘルパー関数
def get_image_path(filename):
    return data_dir+'/'+filename

# データセットの作成
transform = transforms.Compose([transforms.ToTensor()])
dataset = BananaDataset([get_image_path(filename) for filename in image_files], bounding_boxes_minmax, transform)


# データローダーの作成
data_loader = DataLoader(dataset, batch_size=1, shuffle=True)

# モデルのロード
model = fasterrcnn_resnet50_fpn(pretrained=True)
model.train()

FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(

In [11]:

for images, targets in dataset:
    print(targets)


[1089, 623, 2135, 3048]
[284, 569, 2933, 2667]
[0, 0, 0, 0]
[0, 0, 0, 0]
[1017, 1413, 3086, 2637]
[1519, 946, 3695, 2235]
[1117, 1129, 2607, 1744]
[0, 0, 0, 0]
[1649, 384, 3086, 1898]
[881, 863, 2702, 2111]
[0, 0, 0, 0]
[792, 1657, 2589, 2431]
[1070, 1188, 2921, 2401]
[1082, 928, 3051, 2057]
[0, 0, 0, 0]
[479, 514, 1762, 1519]
[1862, 1188, 2601, 1892]
[1041, 715, 3725, 2554]
[0, 0, 0, 0]
[656, 1106, 2944, 2578]
[1276, 1077, 2050, 2643]
[429, 1385, 2353, 3260]
[1143, 738, 2262, 3405]
[0, 0, 0, 0]
[907, 1022, 1851, 3260]
[1023, 1117, 3624, 2247]
[0, 0, 0, 0]
[2140, 225, 3512, 2448]
[0, 0, 0, 0]
[1077, 599, 1978, 2431]
[733, 1017, 3287, 1992]
[1685, 1419, 2607, 1821]
[1709, 1431, 2489, 2241]
[1413, 733, 2383, 1833]
[1519, 603, 3482, 2530]
[0, 0, 0, 0]
[1638, 745, 3169, 2341]
[1348, 1117, 2968, 2252]
[1295, 1469, 2332, 2057]
[625, 1863, 2339, 2754]
[956, 1518, 1724, 3574]
[865, 1137, 2038, 4016]
[0, 0, 0, 0]
[0, 0, 0, 0]
[934, 378, 3920, 1561]
[822, 1194, 3139, 2270]
[0, 0, 0, 0]
[865, 104

In [12]:
for images, boxes in data_loader:
    print(boxes)

[tensor([0]), tensor([0]), tensor([0]), tensor([0])]
[tensor([674]), tensor([1077]), tensor([3335]), tensor([1991])]
[tensor([907]), tensor([1022]), tensor([1851]), tensor([3260])]
[tensor([1519]), tensor([946]), tensor([3695]), tensor([2235])]
[tensor([1143]), tensor([738]), tensor([2262]), tensor([3405])]
[tensor([66]), tensor([1334]), tensor([2662]), tensor([2300])]
[tensor([0]), tensor([0]), tensor([0]), tensor([0])]
[tensor([1295]), tensor([1469]), tensor([2332]), tensor([2057])]
[tensor([797]), tensor([302]), tensor([2354]), tensor([3265])]
[tensor([1111]), tensor([863]), tensor([3754]), tensor([1945])]
[tensor([471]), tensor([332]), tensor([2662]), tensor([2034])]
[tensor([604]), tensor([851]), tensor([2668]), tensor([2306])]
[tensor([792]), tensor([1657]), tensor([2589]), tensor([2431])]
[tensor([479]), tensor([514]), tensor([1762]), tensor([1519])]
[tensor([362]), tensor([855]), tensor([3887]), tensor([2317])]
[tensor([934]), tensor([378]), tensor([3920]), tensor([1561])]
[ten

In [13]:
# デバイスの指定
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# オプティマイザの設定
optimizer = torch.optim.SGD(model.parameters(), lr=0.005)

# 学習のループ
for image_tensors, targets in data_loader:
    image_tensors = list(image.to(device) for image in image_tensors)

    # ターゲットの形式を正しく組み立てる
    target_list = []

    # ターゲットにバウンディングボックスが存在しない場合
    if any(x == 0 for x in targets):
        # デフォルトのバウンディングボックスをバインディングボックスとして渡す
        # ここでは、適当な非ゼロ座標を設定します
       continue;
    else:
        # バウンディングボックスをテンソルに変換
        target_tensor = {
            "boxes": torch.tensor([targets], dtype=torch.float32, device=device),
            "labels": torch.tensor([1], dtype=torch.int64),
            "image_id": torch.tensor([[]], dtype=torch.int64),
            "area": torch.tensor([[]], dtype=torch.float32),
            "iscrowd": torch.tensor([[]], dtype=torch.int64),
        }
        target_list.append(target_tensor)

#     print(targets)
#     print("------")
#     このモデルは物体が存在しない画像はのぞいて学習させた
    optimizer.zero_grad()
    loss_dict = model(image_tensors, target_list)
    losses = sum(loss for loss in loss_dict.values())
    losses.backward()
    optimizer.step()

    print(target_tensor)

{'boxes': tensor([[1295., 1469., 2332., 2057.]]), 'labels': tensor([1]), 'image_id': tensor([], size=(1, 0), dtype=torch.int64), 'area': tensor([], size=(1, 0)), 'iscrowd': tensor([], size=(1, 0), dtype=torch.int64)}
{'boxes': tensor([[ 638., 1131., 3493., 2095.]]), 'labels': tensor([1]), 'image_id': tensor([], size=(1, 0), dtype=torch.int64), 'area': tensor([], size=(1, 0)), 'iscrowd': tensor([], size=(1, 0), dtype=torch.int64)}
{'boxes': tensor([[1348., 1117., 2968., 2252.]]), 'labels': tensor([1]), 'image_id': tensor([], size=(1, 0), dtype=torch.int64), 'area': tensor([], size=(1, 0)), 'iscrowd': tensor([], size=(1, 0), dtype=torch.int64)}
{'boxes': tensor([[ 683.,  710., 3919., 2005.]]), 'labels': tensor([1]), 'image_id': tensor([], size=(1, 0), dtype=torch.int64), 'area': tensor([], size=(1, 0)), 'iscrowd': tensor([], size=(1, 0), dtype=torch.int64)}
{'boxes': tensor([[ 797.,  302., 2354., 3265.]]), 'labels': tensor([1]), 'image_id': tensor([], size=(1, 0), dtype=torch.int64), 'ar

{'boxes': tensor([[ 733., 1017., 3287., 1992.]]), 'labels': tensor([1]), 'image_id': tensor([], size=(1, 0), dtype=torch.int64), 'area': tensor([], size=(1, 0)), 'iscrowd': tensor([], size=(1, 0), dtype=torch.int64)}
{'boxes': tensor([[ 822., 1194., 3139., 2270.]]), 'labels': tensor([1]), 'image_id': tensor([], size=(1, 0), dtype=torch.int64), 'area': tensor([], size=(1, 0)), 'iscrowd': tensor([], size=(1, 0), dtype=torch.int64)}
{'boxes': tensor([[ 747., 1072., 3303., 1914.]]), 'labels': tensor([1]), 'image_id': tensor([], size=(1, 0), dtype=torch.int64), 'area': tensor([], size=(1, 0)), 'iscrowd': tensor([], size=(1, 0), dtype=torch.int64)}
{'boxes': tensor([[ 796.,  330., 3263., 1457.]]), 'labels': tensor([1]), 'image_id': tensor([], size=(1, 0), dtype=torch.int64), 'area': tensor([], size=(1, 0)), 'iscrowd': tensor([], size=(1, 0), dtype=torch.int64)}
{'boxes': tensor([[ 960.,  247., 2306., 3833.]]), 'labels': tensor([1]), 'image_id': tensor([], size=(1, 0), dtype=torch.int64), 'ar

In [14]:
import torchvision.transforms as T
from PIL import Image, ImageDraw

# 1. 画像の前処理
def preprocess_image(image):
    transform = T.Compose([
        T.ToTensor(),  # 画像をテンソルに変換
    ])
    return transform(image).unsqueeze(0)  # バッチ次元を追加

# 2. モデルに画像を渡して推論
def predict(model, image_tensor):
    model.eval()
    with torch.no_grad():
        image_tensor = image_tensor.to(device)
        predictions = model(image_tensor)

    return predictions

# 3. バウンディングボックスを描画
def draw_boxes(image, predictions, threshold=0.5):
    draw = ImageDraw.Draw(image)
    for score, label, box in zip(predictions['scores'], predictions['labels'], predictions['boxes']):
        if score > threshold:
            box = [round(i, 2) for i in box.tolist()]  # バウンディングボックスの座標を整数に変換
            draw.rectangle(box, outline="red", width=3)
    
    return image

# 画像ファイルの読み込み
image_path = 'bananas_images_test/b32.JPG'
image = Image.open(image_path)

# 画像の前処理
image_tensor = preprocess_image(image)

# モデルに画像を渡して推論
predictions = predict(model, image_tensor)

# バウンディングボックスを描画
result_image = draw_boxes(image.copy(), predictions[0], threshold=0.5)

# 結果の画像を表示
result_image.show()





In [15]:
import pickle

# with open('bananaDetect1.pkl','wb') as f:
#     pickle.dump(model,f)

with open('bananaDetect3.pkl','wb') as f:
    pickle.dump(model,f)

#モデル性能向上に必要なこと 
1. batch_sizeを大きくする 
2. バナナが存在しない画像も学習できるようにする
3. バナナ画像のバリデーションを増やす 
4. 学習させる画像を増やす