In [13]:
from transformers import CLIPProcessor, CLIPModel
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from PIL import Image

class CustomTextDataset(Dataset):
    def __init__(self, texts, labels, processor):
        self.texts = texts
        self.labels = labels
        self.processor = processor

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        return text, label


In [14]:
# 加载 CLIP 模型和处理器
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")

In [27]:
from sklearn.model_selection import train_test_split
import os
def build_dataset_from_folder(folder_path, processor):
    texts = []
    labels = []

    # 遍历文件夹中的子文件夹（假设每个子文件夹代表一个标签）
    for label, folder in enumerate(os.listdir(folder_path)):
        folder_path_full = os.path.join(folder_path, folder)

        # 确保是文件夹
        if os.path.isdir(folder_path_full):
            print(f"Processing folder: {folder}")

            # 遍历该文件夹中的所有文本文件
            for text_file in os.listdir(folder_path_full):
                text_file_path = os.path.join(folder_path_full, text_file)

                # 只处理文本文件（假设文件扩展名为 .txt）
                if text_file.endswith(".txt"):
                    print(f"Processing text file: {text_file_path}")

                    # 读取文本内容
                    with open(text_file_path, 'r') as f:
                        text = f.read()

                    # 将文本和标签添加到数据集中
                    texts.append(text)
                    labels.append(label)

    train_texts, test_texts, train_labels, test_labels = train_test_split(
        texts, labels, test_size=0.2, random_state=42,shuffle=True
    )

    train_dataset = CustomTextDataset(train_texts, train_labels, processor)
    test_dataset = CustomTextDataset(test_texts, test_labels, processor)

    return train_dataset, test_dataset

    # 使用 CustomTextDataset 创建数据集
    # dataset = CustomTextDataset(texts, labels, processor)
    # return dataset

# 文件夹路径
folder_path = r"E:\DL\data\MP3\pre"  # 请替换为实际的文件夹路径

# 创建数据集
train_dataset , test_dataset= build_dataset_from_folder(folder_path, processor)

print(len(train_dataset))
print(len(test_dataset))
# 打印数据集中的第一个样本
# print(dataset[0])  # 输出处理后的文本和标签

Processing folder: 1_new_text
Processing text file: E:\DL\data\MP3\pre\1_new_text\1.txt
Processing text file: E:\DL\data\MP3\pre\1_new_text\10.txt
Processing text file: E:\DL\data\MP3\pre\1_new_text\100.txt
Processing text file: E:\DL\data\MP3\pre\1_new_text\101.txt
Processing text file: E:\DL\data\MP3\pre\1_new_text\102.txt
Processing text file: E:\DL\data\MP3\pre\1_new_text\103.txt
Processing text file: E:\DL\data\MP3\pre\1_new_text\104.txt
Processing text file: E:\DL\data\MP3\pre\1_new_text\105.txt
Processing text file: E:\DL\data\MP3\pre\1_new_text\106.txt
Processing text file: E:\DL\data\MP3\pre\1_new_text\107.txt
Processing text file: E:\DL\data\MP3\pre\1_new_text\108.txt
Processing text file: E:\DL\data\MP3\pre\1_new_text\109.txt
Processing text file: E:\DL\data\MP3\pre\1_new_text\11.txt
Processing text file: E:\DL\data\MP3\pre\1_new_text\110.txt
Processing text file: E:\DL\data\MP3\pre\1_new_text\111.txt
Processing text file: E:\DL\data\MP3\pre\1_new_text\112.txt
Processing tex

In [28]:
# # 假设有不定长文本和标签
# texts = ["This is a cat", "A dog is playing", "This is a car"]
# labels = [0, 1, 2]  # 假设 0, 1, 2 分别是不同的类别


# 创建自定义数据集
# dataset = CustomTextDataset(texts, labels, processor)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)



# 创建分类头部（线性层），假设有 N 个类别
num_classes = 4  # 计算类别数量
classification_head = nn.Linear(model.config.projection_dim, num_classes)

# 将模型和分类头部放到设备（GPU 或 CPU）
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
classification_head.to(device)

# 设置优化器
optimizer = torch.optim.Adam(list(model.parameters()) + list(classification_head.parameters()), lr=1e-5)


In [29]:
# 训练过程
epochs = 10
for epoch in range(epochs):
    model.train()
    classification_head.train()
    total_loss = 0
    for texts_batch, labels_batch in tqdm(train_dataloader):
        texts_batch = [text for text in texts_batch]  # 需要调整为列表格式
        labels_batch = torch.tensor(labels_batch).to(device)
        # print(labels_batch.shape)
        # 处理输入文本
        # inputs = processor(text=texts_batch, return_tensors="pt",max_length=77, padding=True).to(device)
        inputs = processor(text=texts_batch, padding=True, truncation=True, max_length=77, return_tensors="pt").to(device)

        # 获取 CLIP 输出（只使用文本编码器）
        outputs = model.get_text_features(**inputs)
        text_features = outputs  # 获取文本特征向量

        # 使用分类头部进行分类
        logits = classification_head(text_features)

        # 计算损失
        loss = nn.CrossEntropyLoss()(logits, labels_batch)
        total_loss += loss.item()

        # 反向传播并更新权重
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_dataloader)}")


  labels_batch = torch.tensor(labels_batch).to(device)
100%|██████████| 87/87 [00:07<00:00, 10.88it/s]


Epoch 1/10, Loss: 0.3576029969871729


100%|██████████| 87/87 [00:07<00:00, 11.24it/s]


Epoch 2/10, Loss: 0.11465038596813021


100%|██████████| 87/87 [00:07<00:00, 11.23it/s]


Epoch 3/10, Loss: 0.07183174829630332


100%|██████████| 87/87 [00:07<00:00, 11.17it/s]


Epoch 4/10, Loss: 0.03461321513971378


100%|██████████| 87/87 [00:07<00:00, 11.17it/s]


Epoch 5/10, Loss: 0.06906674386954856


100%|██████████| 87/87 [00:07<00:00, 11.14it/s]


Epoch 6/10, Loss: 0.07290290915888958


100%|██████████| 87/87 [00:07<00:00, 11.13it/s]


Epoch 7/10, Loss: 0.03578350944551586


100%|██████████| 87/87 [00:07<00:00, 11.16it/s]


Epoch 8/10, Loss: 0.05210285521936656


100%|██████████| 87/87 [00:07<00:00, 11.13it/s]


Epoch 9/10, Loss: 0.05676059003105794


100%|██████████| 87/87 [00:07<00:00, 11.13it/s]

Epoch 10/10, Loss: 0.04599158184859773





In [44]:
from sklearn.metrics import accuracy_score

model.eval()  # 切换到评估模式

# 创建 DataLoader 用于测试集
from torch.utils.data import DataLoader

# 定义 batch size
batch_size = 16
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# 评估模型
predictions = []
true_labels = []

## 不需要计算梯度
with torch.no_grad():
     for texts_batch, labels_batch in tqdm(test_dataloader):

        texts_batch = [text for text in texts_batch]  # 需要调整为列表格式
        labels_batch = torch.tensor(labels_batch).to(device)

        for label in labels_batch:
            true_labels.append(label.item())

        # print(labels_batch)

        inputs = processor(text=texts_batch, padding=True, truncation=True, max_length=77, return_tensors="pt").to(device)
        # 使用模型进行预测
        outputs = model.get_text_features(**inputs)

        # print(outputs)

        text_features = outputs

        # 使用分类头部进行推理
        logits = classification_head(text_features)


        # 获取每个样本的预测标签
        predicted_labels = torch.argmax(logits, dim=1)

        # 输出每个样本的预测标签
        for i, label in enumerate(predicted_labels):
            # print(f"样本 {i+1} 的预测标签: {label.item()}")
            predictions.append(label.item())
            # true_labels

        # print(predicted_labels)
        # print(true_labels)

        # print(logits.shape)
        # 获取最匹配的类别
        # predicted_label = torch.argmax(logits)

        # print(predicted_label)

        # break
        # # 假设模型输出的是特征，我们可以使用最大值索引来进行分类
        # predictions.extend(outputs.argmax(dim=1).cpu().numpy())  # 预测标签
        # true_labels.extend(labels_batch.cpu().numpy())  # 真实标签
        #
        # print(predictions)
        # print(true_labels)

# 计算准确度
accuracy = accuracy_score(true_labels, predictions)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

  labels_batch = torch.tensor(labels_batch).to(device)
100%|██████████| 22/22 [00:00<00:00, 26.36it/s]

Test Accuracy: 92.82%





In [32]:
# 推理过程
new_text = "This is a new image of a cat"
inputs = processor(text=[new_text], return_tensors="pt", padding=True).to(device)

# 获取 CLIP 模型的文本特征
outputs = model.get_text_features(**inputs)
text_features = outputs

# 使用分类头部进行推理
logits = classification_head(text_features)

# 获取最匹配的类别
predicted_label = torch.argmax(logits)
print(f"预测标签: {predicted_label.item()}")


预测标签: 3


In [19]:
# 微调整个模型
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
