In [13]:
import dashscope
from http import HTTPStatus
import os
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from tqdm import tqdm
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [14]:
# 设置 Dashscope API 密钥
dashscope.api_key = 'sk-aa60aeabc333475fa3679f899c509704'

# 嵌入函数
def embed_with_str(text):
    resp = dashscope.TextEmbedding.call(
        model=dashscope.TextEmbedding.Models.text_embedding_v3,
        input=text)
    if resp.status_code == HTTPStatus.OK:
        return resp.output['embeddings'][0]['embedding']
    else:
        raise Exception(f"Error in embedding: {resp.output}")

# 读取数据
# df_all = pd.read_excel('new_data_m_clean.xlsx')

# 定义文件路径
file_path = os.path.join('..', 'data', 'processed', 'use', 'df_boundary.csv')
df_all = pd.read_csv(file_path)

In [6]:
# 拆分数据集
train_df, temp_df = train_test_split(df_all, test_size=0.3, random_state=42)
test_df, val_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# 输出数据集大小
print(f"训练集大小: {train_df.shape}")
print(f"测试集大小: {test_df.shape}")
print(f"验证集大小: {val_df.shape}")

# 标签映射
labels = {1: 1, 0: 0}

训练集大小: (118, 2)
测试集大小: (25, 2)
验证集大小: (26, 2)


In [7]:
# 自定义数据集类
class CustomDataset(Dataset):
    def __init__(self, df):
        self.labels = [labels[label] for label in df['boundary'].tolist()]
        self.texts = df['query'].tolist()

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        embedding = embed_with_str(text)
        embedding_tensor = torch.tensor(embedding)
        batch_y = self.labels[idx]
        return embedding_tensor, batch_y

In [8]:
# 简化的分类器模型
class Classifier(nn.Module):
    def __init__(self, num_classes=1):
        super(Classifier, self).__init__()
        self.fc1 = nn.Linear(1024, 512)
        self.fc2 = nn.Linear(512, 128)
        self.fc3 = nn.Linear(128, num_classes)
        self.dropout = nn.Dropout(0.5)
        self.sigmoid = nn.Sigmoid()

    def forward(self, inputs):
        X = F.relu(self.fc1(inputs))
        X = self.dropout(X)
        X = F.relu(self.fc2(X))
        X = self.dropout(X)
        X = self.fc3(X)
        X = self.sigmoid(X)
        return X

In [10]:
# 训练模型函数
def train_model(model, train_dataloader, optimizer, criterion, device):
    model.to(device)
    model.train()
    total_loss = 0
    total_correct = 0
    total_samples = 0
    tk0 = tqdm(train_dataloader, smoothing=0, mininterval=1.0)
    for batch in tk0:
        optimizer.zero_grad()
        inputs, labels = batch
        inputs, labels = inputs.to(device), labels.to(device).float().view(-1, 1)
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        predicted = outputs.round()
        total_correct += (predicted == labels).sum().item()
        total_samples += labels.size(0)

    avg_loss = total_loss / len(train_dataloader)
    avg_acc = total_correct / total_samples
    print(f'Average Loss: {avg_loss}, Average Accuracy: {avg_acc}')


def validate_model(model, dataloader, device):
    model.eval()  # 将模型设置为评估模式
    predictions = []
    true_labels = []
    with torch.no_grad():  # 在评估阶段不计算梯度
        for inputs, labels in dataloader:
            inputs = inputs.to(device)
            labels = labels.to(device).float().view(-1, 1)
            outputs = model(inputs)
            predictions.extend(outputs.squeeze().cpu().numpy())  # 将预测结果保存到列表中
            true_labels.extend(labels.squeeze().cpu().numpy())  # 将真实标签保存到列表中

    # 计算AUC
    auc_score = roc_auc_score(true_labels, predictions)
    print(f'Validation AUC: {auc_score}')
    return auc_score



# 设置设备为 CPU
device = torch.device('cpu')


In [11]:
def main():
    train_dataset = CustomDataset(train_df)
    val_dataset = CustomDataset(val_df)
    test_dataset = CustomDataset(test_df)
    train_dataloader = DataLoader(train_dataset, batch_size=20, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=20)
    test_dataloader = DataLoader(test_dataset, batch_size=20)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = Classifier().to(device)  # 将模型移动到正确的设备上
    criterion = nn.BCELoss()
    optimizer = Adam(model.parameters(), lr=0.001)

    epochs = 5
    for epoch in range(epochs):
        train_model(model, train_dataloader, optimizer, criterion, device)
        print(f'Epoch {epoch+1}')
        validate_model(model, val_dataloader, device)

    # 保存模型
    torch.save(model.state_dict(), 'classifier_bge1.5_20epoch.pth')
    print("模型已保存。")

In [12]:
if __name__ == "__main__":
    main()

100%|██████████| 6/6 [00:28<00:00,  4.83s/it]


Average Loss: 0.675589124361674, Average Accuracy: 0.6694915254237288
Epoch 1
Validation AUC: 0.9375


100%|██████████| 6/6 [00:25<00:00,  4.28s/it]


Average Loss: 0.5827447970708212, Average Accuracy: 0.9491525423728814
Epoch 2
Validation AUC: 0.9444444444444444


100%|██████████| 6/6 [00:25<00:00,  4.33s/it]


Average Loss: 0.40326161682605743, Average Accuracy: 0.9576271186440678
Epoch 3
Validation AUC: 0.9375


100%|██████████| 6/6 [00:25<00:00,  4.33s/it]


Average Loss: 0.23483262459437051, Average Accuracy: 0.9576271186440678
Epoch 4
Validation AUC: 0.9305555555555555


100%|██████████| 6/6 [00:25<00:00,  4.32s/it]


Average Loss: 0.1737640934685866, Average Accuracy: 0.9576271186440678
Epoch 5
Validation AUC: 0.9305555555555555


100%|██████████| 6/6 [00:27<00:00,  4.59s/it]


Average Loss: 0.17964830497900644, Average Accuracy: 0.9576271186440678
Epoch 6


KeyboardInterrupt: 

In [36]:
import dashscope
from http import HTTPStatus
import os
import json
dashscope.api_key = 'sk-aa60aeabc333475fa3679f899c509704'


def embed_with_str():
    resp = dashscope.TextEmbedding.call(
        model=dashscope.TextEmbedding.Models.text_embedding_v3,
        input='衣服的质量杠杠的，很漂亮，不枉我等了这么久啊，喜欢，以后还来这里买')
    if resp.status_code == HTTPStatus.OK:
        return(resp.output['embeddings'][0]['embedding'])
    else:
        return(resp.output)

In [37]:
embed_with_str()

[-0.06612585484981537,
 0.010662018321454525,
 -0.0845976322889328,
 0.004647048655897379,
 -0.015454590320587158,
 -0.010778436437249184,
 -0.031471870839595795,
 0.06787213683128357,
 -0.03859281912446022,
 0.05374665930867195,
 0.033373378217220306,
 0.06461241096258163,
 -0.024758389219641685,
 0.005069066770374775,
 0.013058303855359554,
 0.01985909976065159,
 0.02906588278710842,
 -0.041328661143779755,
 -0.01866580732166767,
 -0.032849494367837906,
 -0.05149589851498604,
 0.023749426007270813,
 -0.011845609173178673,
 -0.017608337104320526,
 0.0002340502105653286,
 0.03597339615225792,
 -0.01214635744690895,
 -0.006543704308569431,
 0.02237180434167385,
 -0.03477040305733681,
 -0.03521667420864105,
 0.09569621831178665,
 0.08894392848014832,
 -0.04877946153283119,
 -0.010254552587866783,
 -0.04276449233293533,
 -0.0062914639711380005,
 0.044665995985269547,
 -0.06845422834157944,
 0.025515111163258553,
 0.0183553583920002,
 0.01587175764143467,
 0.010293358936905861,
 0.00178023

In [33]:
kk['embeddings'][0]['embedding']

[-0.06612585484981537,
 0.010662018321454525,
 -0.0845976322889328,
 0.004647048655897379,
 -0.015454590320587158,
 -0.010778436437249184,
 -0.031471870839595795,
 0.06787213683128357,
 -0.03859281912446022,
 0.05374665930867195,
 0.033373378217220306,
 0.06461241096258163,
 -0.024758389219641685,
 0.005069066770374775,
 0.013058303855359554,
 0.01985909976065159,
 0.02906588278710842,
 -0.041328661143779755,
 -0.01866580732166767,
 -0.032849494367837906,
 -0.05149589851498604,
 0.023749426007270813,
 -0.011845609173178673,
 -0.017608337104320526,
 0.0002340502105653286,
 0.03597339615225792,
 -0.01214635744690895,
 -0.006543704308569431,
 0.02237180434167385,
 -0.03477040305733681,
 -0.03521667420864105,
 0.09569621831178665,
 0.08894392848014832,
 -0.04877946153283119,
 -0.010254552587866783,
 -0.04276449233293533,
 -0.0062914639711380005,
 0.044665995985269547,
 -0.06845422834157944,
 0.025515111163258553,
 0.0183553583920002,
 0.01587175764143467,
 0.010293358936905861,
 0.00178023

In [28]:
kkk2 = kk.output


In [29]:
kkk2['embeddings'][0]['embedding']

[-0.06612585484981537,
 0.010662018321454525,
 -0.0845976322889328,
 0.004647048655897379,
 -0.015454590320587158,
 -0.010778436437249184,
 -0.031471870839595795,
 0.06787213683128357,
 -0.03859281912446022,
 0.05374665930867195,
 0.033373378217220306,
 0.06461241096258163,
 -0.024758389219641685,
 0.005069066770374775,
 0.013058303855359554,
 0.01985909976065159,
 0.02906588278710842,
 -0.041328661143779755,
 -0.01866580732166767,
 -0.032849494367837906,
 -0.05149589851498604,
 0.023749426007270813,
 -0.011845609173178673,
 -0.017608337104320526,
 0.0002340502105653286,
 0.03597339615225792,
 -0.01214635744690895,
 -0.006543704308569431,
 0.02237180434167385,
 -0.03477040305733681,
 -0.03521667420864105,
 0.09569621831178665,
 0.08894392848014832,
 -0.04877946153283119,
 -0.010254552587866783,
 -0.04276449233293533,
 -0.0062914639711380005,
 0.044665995985269547,
 -0.06845422834157944,
 0.025515111163258553,
 0.0183553583920002,
 0.01587175764143467,
 0.010293358936905861,
 0.00178023

In [27]:
kk.output.embedding

AttributeError: 'dict' object has no attribute 'embedding'

In [20]:
# 解析 JSON 字符串
data = json.loads(embedding)

# 提取 embeddings
embeddings = data['output']['embeddings'][0]['embedding']

# 打印 embeddings
print(embeddings)

TypeError: the JSON object must be str, bytes or bytearray, not NoneType

In [None]:
embedding.

In [9]:
import pandas as pd
import numpy as np
import os

In [10]:
# 定义文件路径
file_path = os.path.join('..', 'data', 'raw', 'sample_data.xlsx')
df = pd.read_excel(file_path)

# 首先将日期列转换为日期格式
df['every_date'] = pd.to_datetime(df['every_date'])

# 按日期降序排序
df_sorted = df.sort_values(by='every_date', ascending=False)

# 去除重复值，保留日期最近的一行
df_unique = df_sorted.drop_duplicates(subset=['query', 'answer', 'boundary', 'correct'], keep='first').copy()

duplicates_removed = len(df) - len(df_unique)
# 打印去重后的数据框和去除的重复值数量
print(f"去除了 {duplicates_removed} 个重复值")

去除了 66 个重复值


In [11]:

# 分类----------------------------------------
# 计算每个 query 中 boundary 的众数
boundary_mode = df_unique.groupby('query')['boundary'].transform(lambda x: x.mode()[0])

# 标记 boundary 值等于众数的行为 1，其他为 0
df_unique.loc[:, 'boundary_mark'] = (df_unique['boundary'] == boundary_mode).astype(int)

# 提取 boundary_mark 为 1 的行
df_cat = df_unique[df_unique['boundary_mark'] == 1][['query', 'boundary']]

# 去重
df_cat = df_cat.drop_duplicates()

In [12]:

# 日志入库----------------------------------------
df_unique_right = df_unique.drop_duplicates(subset=['query', 'answer', 'correct'], keep='first')


def filter_by_correct(df, n):
    # 定义一个空列表来存储结果
    result = []

    # 按 query 分组
    grouped = df.groupby('query')

    for name, group in grouped:
        # 对于 correct 为 0 的行，保留最近的 n 行
        correct_0 = group[group['correct'] == 0].head(n)
        # 对于 correct 为 1 的行，保留最近的 n 行
        correct_1 = group[group['correct'] == 1].head(n)

        # 将结果添加到列表中
        result.append(correct_0)
        result.append(correct_1)

    # 将结果合并为一个数据框
    return pd.concat(result)


# 使用函数进行筛选
n = 3  # 例如，保留最近的 2 行
filtered_df = filter_by_correct(df_unique, n)

In [13]:

# 保存 df_cat 到 data/processed 文件夹
df_cat.to_csv(os.path.join('..', 'data', 'processed', 'df_boundary.csv'), index=False)

# 保存 filtered_df 到 data/processed 文件夹
filtered_df.to_csv(os.path.join('..', 'data', 'processed', 'df_right.csv'), index=False)



In [11]:
import pandas as pd
import numpy as np
import os

# 定义文件路径
file_path = os.path.join('..', 'data', 'raw', 'my_log_data.xlsx')
df = pd.read_excel(file_path)

# 首先将日期列转换为日期格式
df['every_date'] = pd.to_datetime(df['every_date'])

# 按日期降序排序
df_sorted = df.sort_values(by='every_date', ascending=False)

# 去除重复值，保留日期最近的一行
df_unique = df_sorted.drop_duplicates(subset=['query', 'answer', 'boundary', 'correct'], keep='first')

duplicates_removed = len(df) - len(df_unique)
# 打印去重后的数据框和去除的重复值数量
print(f"去除了 {duplicates_removed} 个重复值")

#分类----------------------------------------
# 计算每个 query 中 boundary 的众数
boundary_mode = df_unique.groupby('query')['boundary'].transform(lambda x: x.mode()[0])

# 标记 boundary 值等于众数的行为 1，其他为 0
df_unique['boundary_mark'] = (df_unique['boundary'] == boundary_mode).astype(int)

# 提取 boundary_mark 为 1 的行
df_cat = df_unique[df_unique['boundary_mark'] == 1][['query', 'boundary']]

# 去重
df_cat = df_cat.drop_duplicates()
# # 再去除冲突值(先标记再导出)
# df_unique['B_unique_count_per_A'] = df_unique.groupby('query')['category'].transform('nunique')
#
# # 标记违反规则的行：如果B_unique_count_per_A大于1，则标记为True
# df_unique['Violation'] = df_unique['B_unique_count_per_A'] > 1
#
# #导出文件，手工处理后再导入
# df_unique.to_excel("new_data_m_clean.xlsx",index = False)



去除了 1230 个重复值


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_unique['boundary_mark'] = (df_unique['boundary'] == boundary_mode).astype(int)


In [8]:
# 假设 df 是你的数据框
# 计算每个 query 中 boundary 的众数
boundary_mode = df.groupby('query')['boundary'].transform(lambda x: x.mode()[0])

# 标记 boundary 值等于众数的行为 1，其他为 0
df['boundary_mark'] = (df['boundary'] == boundary_mode).astype(int)


In [9]:

# 将 df_remove 保存到本地
output_path = os.path.join('..', 'data', 'processed', 'removed_data.xlsx')
df.to_excel(output_path, index=False)

print(f"去除的数据已保存到 {output_path}")

去除的数据已保存到 ..\data\processed\removed_data.xlsx
