In [2]:
import pandas as pd
import requests
from PIL import Image
from io import BytesIO
import torch
import torchvision.transforms as transforms
from torchvision import models
import torch.nn as nn
import io

In [3]:
dataset_path = 'df_train_for_cv.csv'  # Замените на ваш путь к CSV-файлу
data = pd.read_csv(dataset_path)


print(data.head())

                                                 url  label
0  //avatars.mds.yandex.net/get-realty-offers/136...      5
1  //avatars.mds.yandex.net/get-realty-offers/988...      5
2  //avatars.mds.yandex.net/get-realty-offers/127...      5
3  //avatars.mds.yandex.net/get-realty-offers/138...      5
4  //avatars.mds.yandex.net/get-realty-offers/101...      5


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3489 entries, 0 to 3488
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   url     3489 non-null   object
 1   label   3489 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 54.6+ KB


In [5]:
import pandas as pd

# Предположим, у вас есть DataFrame df с колонкой 'label', где указаны классы
# df = pd.DataFrame({'label': image_labels})

# Определяем классы, которые нужно уменьшить
classes_to_reduce = [4, 5]

# Устанавливаем коэффициент уменьшения
reduction_factor1 = 0.5  # Уменьшаем на 50%
reduction_factor2 = 0.7

# Создаем новый DataFrame для хранения сбалансированных данных
balanced_df = pd.DataFrame()

for class_label in classes_to_reduce:
    # Выбираем экземпляры текущего класса
    class_data = data[data['label'] == class_label]

    if class_label == 4:
        num_to_keep = int(len(class_data) * (1 - reduction_factor2))
    else:
        num_to_keep = int(len(class_data) * (1 - reduction_factor1))

    # Случайным образом выбираем экземпляры для сохранения
    reduced_class_data = class_data.sample(num_to_keep, random_state=42)  # random_state для воспроизводимости

    # Добавляем уменьшенные данные в новый DataFrame
    balanced_df = pd.concat([balanced_df, reduced_class_data])

# Добавляем экземпляры из других классов без изменений
other_classes = data[~data['label'].isin(classes_to_reduce)]
balanced_df = pd.concat([balanced_df, other_classes])

# Перемешиваем итоговый DataFrame
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Проверяем количество экземпляров в каждом классе
print(balanced_df['label'].value_counts())

label
4    488
3    463
5    445
1    336
2    172
Name: count, dtype: int64


In [6]:
import pandas as pd
import concurrent.futures
import requests

def get_image(url):

    if ("https:" not in url):
        url = "https:" + url
    try:
        response = requests.get(url)
        response.raise_for_status()  # Проверка на ошибки
        return response.content  # Возвращаем содержимое изображения
    except Exception as e:
        print(f"Ошибка при загрузке {url}: {e}")
        return None

def load_images_with_labels(data, url_column='url', label_column='label', num_thread=5, batch_size=100):
    images = []
    image_labels = []

    urls = data[url_column].tolist()  # Получаем список URL из DataFrame
    labels = data[label_column].tolist() if label_column in data.columns else [None] * len(urls)  # Получаем список меток из DataFrame, если есть

    # Разбиваем на пакеты
    for i in range(0, len(urls), batch_size):
        batch_urls = urls[i:i + batch_size]
        batch_labels = labels[i:i + batch_size]

        with concurrent.futures.ThreadPoolExecutor(max_workers=num_thread) as executor:
            future_to_url = {executor.submit(get_image, url): (url, label) for url, label in zip(batch_urls, batch_labels)}

            for future in concurrent.futures.as_completed(future_to_url):
                url, label = future_to_url[future]
                try:
                    image = future.result()
                    if image:
                        images.append(image)
                        image_labels.append(label)
                except Exception as e:
                    print(f"Ошибка при обработке {url}: {e}")

    return images, image_labels


images, image_labels = load_images_with_labels(balanced_df)



In [7]:
data = balanced_df


In [16]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms, models
from PIL import Image
import io
import numpy as np
from sklearn.metrics import classification_report
from torch.utils.data import Dataset, DataLoader
import time
import gc
torch.cuda.empty_cache()
gc.collect()

25

In [17]:


transform = transforms.Compose([
    transforms.Resize((380, 380)),  # Оптимальный размер для EfficientNet-B4
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(20),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
    transforms.RandomPerspective(distortion_scale=0.2, p=0.5),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Класс для обработки данных
class ImageDataset(Dataset):
    def __init__(self, images, labels, transform=None):
        self.images = images
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        img_bytes = self.images[idx]
        img = Image.open(io.BytesIO(img_bytes)).convert('RGB')
        label = self.labels[idx]

        if self.transform:
            img = self.transform(img)

        return img, label

# Применяем преобразования к изображениям
images_tensor = []
for img_bytes in images:
    img = Image.open(io.BytesIO(img_bytes)).convert('RGB')
    img_tensor = transform(img)
    images_tensor.append(img_tensor)

images_tensor = torch.stack(images_tensor)

In [23]:
num_classes = len(data['label'].unique())
image_labels = [label - 1 for label in image_labels]  # Уменьшение каждой метки на 1
image_labels_tensor = torch.tensor(image_labels, dtype=torch.long).unsqueeze(0).to(device)

# Создаем датасет и лоадер
dataset = ImageDataset(images, image_labels_tensor, transform=transform)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [19]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Используемое устройство: {device}")

# Загрузка более мощной модели - EfficientNet-B4
model = models.efficientnet_b4(pretrained=True)

# Замена классификатора
in_features = model.classifier[1].in_features
model.classifier = nn.Sequential(
    nn.Dropout(p=0.5, inplace=True),
    nn.Linear(in_features, num_classes)
)

# Перенос модели на устройство
model = model.to(device)

Используемое устройство: cuda


In [20]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001, weight_decay=1e-5)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=2, verbose=True)

In [21]:
num_epochs = 10
best_loss = float('inf')

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    start_time = time.time()

    for inputs, targets in dataloader:
        inputs = inputs.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * inputs.size(0)

    epoch_loss = running_loss / len(dataset)
    scheduler.step(epoch_loss)

    # Сохраняем лучшую модель
    if epoch_loss < best_loss:
        best_loss = epoch_loss
        torch.save(model.state_dict(), 'best_efficientnet_model.pth')

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}, Time: {time.time()-start_time:.1f}s')

# Загрузка лучшей модели для оценки
model.load_state_dict(torch.load('best_efficientnet_model.pth'))

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
