In [1]:
!pip install python-docx pandas transformers



Подключение LLM

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Загружаем токенизатор и модель
# model_name = "smortlly/Mistral_Instr_first"
model_name = "mistralai/Mistral-7B-Instruct-v0.3"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Перемещаем модель на GPU, если доступно
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

Получение и обработка датасета

In [None]:
import os
import pandas as pd
from docx import Document

# Укажите пути к вашим папкам
# hmi_folder = 'путь_к_папке_HMI'
# ssts_folder = 'путь_к_папке_SSTS'

# TEST
hmi_folder = 'test/test data/HMI'
ssts_folder = 'test/test data/SSTS'

# Получаем списки файлов в каждой папке
hmi_files = os.listdir(hmi_folder)
ssts_files = os.listdir(ssts_folder)

# Функция для извлечения id из имени файла
def extract_id(filename, prefix):
    if filename.startswith(prefix):
        return filename[len(prefix):-5]  # Убираем префикс и '.docx'
    return None

# Функция для извлечения текста из файла .docx
def extract_text_from_docx(filepath):
    try:
        doc = Document(filepath)
        return '\n'.join([paragraph.text for paragraph in doc.paragraphs])
    except Exception as e:
        return f"Ошибка при чтении файла: {e}"

# Собираем все уникальные идентификаторы из обоих списков
hmi_ids = {extract_id(file, 'UC-') for file in hmi_files if extract_id(file, 'UC-')}
ssts_ids = {extract_id(file, 'SSTS-') for file in ssts_files if extract_id(file, 'SSTS-')}

all_ids = hmi_ids.union(ssts_ids)  # Все уникальные идентификаторы

# Составляем список с информацией о наличии файлов и их текстах
dataset = []
for file_id in all_ids:
    hmi_path = os.path.join(hmi_folder, f'UC-{file_id}.docx') if file_id in hmi_ids else None
    ssts_path = os.path.join(ssts_folder, f'SSTS-{file_id}.docx') if file_id in ssts_ids else None

    hmi_text = extract_text_from_docx(hmi_path) if hmi_path and os.path.exists(hmi_path) else 'Отсутствует'
    ssts_text = extract_text_from_docx(ssts_path) if ssts_path and os.path.exists(ssts_path) else 'Отсутствует'

    dataset.append((file_id, hmi_text, ssts_text))

# Создаем DataFrame для сохранения информации
df = pd.DataFrame(dataset, columns=['ID', 'HMI Text', 'SSTS Text'])

# Сохраняем DataFrame в CSV файл
df.to_csv('my_dataset.csv', index=False)

print("Датасет успешно создан и сохранен в my_dataset.csv")

Обработка

In [14]:
import pandas as pd

# Чтение CSV-файла
file_path = 'my_dataset.csv'  # Замените на путь к вашему файлу

# Вывод содержимого таблицы
import pandas as pd

# Загрузка данных из CSV в DataFrame
data = pd.read_csv(file_path)

# Парсинг ID, HMI и SSTS в список
parsed_list = []
for index, row in data.iterrows():
    parsed_entry = {
        "ID": row["ID"],
        "HMI": row["HMI Text"],
        "SSTS": row["SSTS Text"]
    }
    parsed_list.append(parsed_entry)

parsed_list[0]

{'ID': 31523,
 'HMI': '[I-31523]\xa0 Adding Internet Radio to Favorites List\nDescription: \nUse Case: Adding Internet Radio to Favorites List\n\nGoal: User wants to add favorites internet radio to enlighten the access to his favorite music/info\n\nActors:\nCar User \nIVI \nPreconditions:\nUser in in the Internet Radio list \nTriggers:\nUser selects to add the Radio via in_2/in_5 \nMain Scenario:\nThe user press on the UI interface "Add to Favorites" icon \nPostconditions:\nThe selected internet radio station is added to the user\'s personalized favorites list. \nRequirenments:\nUser must be able to navigate to the Favourite list from Internet Radio interface \nUser must be able to \'Remove from favorites\' the Internet Radio station by pressing the same interface icon ',
 'SSTS': 'Favorite Song operation\nFunctional Description\nUsers can set local music as favorites and view corresponding song information. This function is applicable to USB music, and online music. It can only be use

In [18]:
messages = []

prompt = """id: (It may not be)
id: (ID may be absent)
Name: name
Text: 
SSTS:
Text1
Text:
HMI or UC:
Text2

Based on these texts, it is necessary to find the differences between them and assess their similarity on the following scale:
- FC: Perfect! Nothing can be improved.
- LC: Generally correct. Some improvement may be needed (described in comments). No need for review.
- PC: Major deviations. Improvements needed (described in comments). After the improvement, review is required.
- NC: Not compliant. Needs to be redone and re-reviewed. Directions for update shown in comments.
- NA: Not applicable. Reason for non-applicability is described in comments (corresponds to numbers from 5 to 1).

Output differences for all statuses except FC, and descriptions for all statuses except NA. Present the result in the format:
id, Name, Differences, Description, status (only 2 letters)"""

for i in parsed_list:
    ID = i['ID']
    HMI = i['HMI']
    SSTS = i['SSTS']
    # Создайте сообщение в формате для модели
    message = [
    {"role": "system", "content": prompt},
    {"role": "user", "content": f"""id: {ID} SSTS: {SSTS} UC: {HMI}"""}
    ]
    messages.append(message)


In [None]:
# Запустите обработку всех сообщений
results = []
for message in messages:
    result = chatbot(message, max_new_tokens=500)
    results.append(result)
# Сохраните результаты или выведите их
for result in all_results:
    print(result)


Сохранение результатов

In [4]:
import pandas as pd

# Разделяем строку на части, используя запятую как разделитель
data_parts = data_string.split(',')

# Создаем DataFrame с распарсенными данными
df = pd.DataFrame([data_parts], columns=['Number', 'Name', 'Description', 'Differences', 'Compliance level'])

# Сохраняем DataFrame в CSV файл
df.to_csv('submission.csv', index=False)

print("Данные успешно распарсены и сохранены в submission.csv")

Данные успешно распарсены и сохранены в parsed_data.csv


In [1]:

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_name = "mistralai/Mistral-7B-Instruct-v0.3"
# model_name = "smortlly/Mistral_Instr_first"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Перемещаем модель на GPU, если доступно
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [2]:
import torch.nn as nn

class ClassificationHead(nn.Module):
    def __init__(self, hidden_size, num_classes):
        super(ClassificationHead, self).__init__()
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, hidden_size), # Линейный слой (можно добавить больше)
            nn.ReLU(),                          # Функция активации
            nn.Linear(hidden_size, hidden_size), # Линейный слой (можно добавить больше)
            nn.ReLU(), 
            nn.Dropout(0.1),                   # Dropout для регуляризации (опционально)
            nn.Linear(hidden_size, num_classes) # Выходной линейный слой
        )

    def forward(self, penultimate_hidden_state):
        # hidden_states: (batch_size, sequence_length, hidden_size)

        # Мы можем использовать различные стратегии для агрегации hidden_states по sequence_length:


        # 2. Использование только последнего hidden state в последовательности (CLS токен):
        # pooled_hidden = penultimate_hidden_state.mean(dim=1)
        # pooled_hidden = pooled_hidden.view(-1, 4096)


        # 3.  Более сложные стратегии, например, attention механизм.

        logits = self.classifier(penultimate_hidden_state) # (batch_size, num_classes)
        return logits

In [3]:
tokenizer.pad_token = tokenizer.eos_token

In [4]:
import pandas as pd

# Чтение CSV-файла
file_path = 'my_dataset.csv'  # Замените на путь к вашему файлу

# Вывод содержимого таблицы
import pandas as pd

# Загрузка данных из CSV в DataFrame
data = pd.read_csv(file_path)

# Парсинг ID, HMI и SSTS в список
parsed_list = []
for index, row in data.iterrows():
    parsed_entry = {
        "ID": row["ID"],
        "HMI": row["HMI Text"],
        "SSTS": row["SSTS Text"]
    }
    parsed_list.append(parsed_entry)

In [5]:
#             # Сообщение в формате для модели
messages = []

prompt = """id: (It may not be)
id: (ID may be absent)
Name: name
Text: 
SSTS:
Text1
Text:
HMI or UC:
Text2

Based on these texts, it is necessary to find the differences between them and assess their similarity on the following scale:
- FC: Perfect! Nothing can be improved.
- LC: Generally correct. Some improvement may be needed (described in comments). No need for review.
- PC: Major deviations. Improvements needed (described in comments). After the improvement, review is required.
- NC: Not compliant. Needs to be redone and re-reviewed. Directions for update shown in comments.
- NA: Not applicable. Reason for non-applicability is described in comments (corresponds to numbers from 5 to 1).

Output differences for all statuses except FC, and descriptions for all statuses except NA. Present the result in the format:
id, Name, Differences, Description, status (only 2 letters)"""

for i in parsed_list:
    ID = i['ID']
    HMI = i['HMI']
    SSTS = i['SSTS']
    # Создайте сообщение в формате для модели
    #message = f'\{"role": "system", "content": {prompt}\}, \{"role": "user", "content": f"""id: \{ID\} SSTS: {SSTS} UC: {HMI}"""\}'
    message = f'[{{"role": "system", "content": "{prompt}"}}, {{"role": "user", "content": f"id: {ID} SSTS: {SSTS} UC: {HMI}"}}]'
    messages.append(message)


In [6]:
messages[11]

'[{"role": "system", "content": "id: (It may not be)\nid: (ID may be absent)\nName: name\nText: \nSSTS:\nText1\nText:\nHMI or UC:\nText2\n\nBased on these texts, it is necessary to find the differences between them and assess their similarity on the following scale:\n- FC: Perfect! Nothing can be improved.\n- LC: Generally correct. Some improvement may be needed (described in comments). No need for review.\n- PC: Major deviations. Improvements needed (described in comments). After the improvement, review is required.\n- NC: Not compliant. Needs to be redone and re-reviewed. Directions for update shown in comments.\n- NA: Not applicable. Reason for non-applicability is described in comments (corresponds to numbers from 5 to 1).\n\nOutput differences for all statuses except FC, and descriptions for all statuses except NA. Present the result in the format:\nid, Name, Differences, Description, status (only 2 letters)"}, {"role": "user", "content": f"id: 25957 SSTS: Mute or pause function\n

In [15]:
hidden_size = 4096 # Замените на размер hidden_state вашей модели
num_classes = 5
classification_head = ClassificationHead(hidden_size, num_classes).to(device)
classification_head.load_state_dict(torch.load('classifier.pth'))

  classification_head.load_state_dict(torch.load('classifier.pth'))


<All keys matched successfully>

In [18]:
import pandas as pd

df = pd.DataFrame([], columns=['Number', 'Name', 'Description', 'Differences', 'Compliance level'])

# Сохраняем DataFrame в CSV файл
df.to_csv('submission.csv', index=False)

print("Данные успешно распарсены и сохранены в submission.csv")

Данные успешно распарсены и сохранены в submission.csv


In [16]:
data_text = []
data_lvl = []

In [17]:
import torch.nn.functional as F
import numpy as np

for i in range(0, len(messages)):
    inputs = tokenizer(messages[i], return_tensors="pt").to(device)
    
    # Генерируем текст с выводом hidden_states
    with torch.no_grad(): # отключаем вычисление градиентов, если не нужно для обучения
        outputs = model.generate(**inputs, 
                               max_new_tokens=500, 
                               output_hidden_states=True, 
                               return_dict_in_generate=True,
                               )
    
    # Доступ к сгенерированному тексту
    generated_ids = outputs.sequences
    generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    print("Сгенерированный текст:", generated_text)
    data_text.append(generated_text)
    
    
    # Доступ к hidden_states
    all_hidden_states = outputs.hidden_states
    penultimate_hidden_state = all_hidden_states[-1]
    print(penultimate_hidden_state[0].shape)
    
    tensor_penultimate_hidden_state = torch.cat(penultimate_hidden_state, dim=1)
    print(tensor_penultimate_hidden_state.shape)
    
    
    mean_penultimate_hidden_state = tensor_penultimate_hidden_state.mean(dim=1)
    
    mean_penultimate_hidden_state.shape
    
    logits = classification_head(mean_penultimate_hidden_state)
    print(logits.shape) # Output: torch.Size([32, 5])
    print(logits)
    
    probabilities = F.softmax(logits, dim=-1).to("cpu").detach().numpy()
    
    probabilities
    
    predicted_class = np.argmax(probabilities)
    predicted_class
    
    label2id = {"NA": 0, "NC": 1, "PC": 2, "LC":3, "FC":4} # замените на свои метки
    id2label = {0: "NA", 1: "NC", 2: "PC", 3:"LC", 4:"FC"}
    
    print(id2label[predicted_class])
    data_lvl.append(id2label[predicted_class])

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Сгенерированный текст: [{"role": "system", "content": "id: (It may not be)
id: (ID may be absent)
Name: name
Text: 
SSTS:
Text1
Text:
HMI or UC:
Text2

Based on these texts, it is necessary to find the differences between them and assess their similarity on the following scale:
- FC: Perfect! Nothing can be improved.
- LC: Generally correct. Some improvement may be needed (described in comments). No need for review.
- PC: Major deviations. Improvements needed (described in comments). After the improvement, review is required.
- NC: Not compliant. Needs to be redone and re-reviewed. Directions for update shown in comments.
- NA: Not applicable. Reason for non-applicability is described in comments (corresponds to numbers from 5 to 1).

Output differences for all statuses except FC, and descriptions for all statuses except NA. Present the result in the format:
id, Name, Differences, Description, status (only 2 letters)"}, {"role": "user", "content": f"id: 31523 SSTS: Favorite Song operat

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Сгенерированный текст: [{"role": "system", "content": "id: (It may not be)
id: (ID may be absent)
Name: name
Text: 
SSTS:
Text1
Text:
HMI or UC:
Text2

Based on these texts, it is necessary to find the differences between them and assess their similarity on the following scale:
- FC: Perfect! Nothing can be improved.
- LC: Generally correct. Some improvement may be needed (described in comments). No need for review.
- PC: Major deviations. Improvements needed (described in comments). After the improvement, review is required.
- NC: Not compliant. Needs to be redone and re-reviewed. Directions for update shown in comments.
- NA: Not applicable. Reason for non-applicability is described in comments (corresponds to numbers from 5 to 1).

Output differences for all statuses except FC, and descriptions for all statuses except NA. Present the result in the format:
id, Name, Differences, Description, status (only 2 letters)"}, {"role": "user", "content": f"id: 8604 SSTS: Switch music sources


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Сгенерированный текст: [{"role": "system", "content": "id: (It may not be)
id: (ID may be absent)
Name: name
Text: 
SSTS:
Text1
Text:
HMI or UC:
Text2

Based on these texts, it is necessary to find the differences between them and assess their similarity on the following scale:
- FC: Perfect! Nothing can be improved.
- LC: Generally correct. Some improvement may be needed (described in comments). No need for review.
- PC: Major deviations. Improvements needed (described in comments). After the improvement, review is required.
- NC: Not compliant. Needs to be redone and re-reviewed. Directions for update shown in comments.
- NA: Not applicable. Reason for non-applicability is described in comments (corresponds to numbers from 5 to 1).

Output differences for all statuses except FC, and descriptions for all statuses except NA. Present the result in the format:
id, Name, Differences, Description, status (only 2 letters)"}, {"role": "user", "content": f"id: 6583 SSTS: Make a call (B sample

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Сгенерированный текст: [{"role": "system", "content": "id: (It may not be)
id: (ID may be absent)
Name: name
Text: 
SSTS:
Text1
Text:
HMI or UC:
Text2

Based on these texts, it is necessary to find the differences between them and assess their similarity on the following scale:
- FC: Perfect! Nothing can be improved.
- LC: Generally correct. Some improvement may be needed (described in comments). No need for review.
- PC: Major deviations. Improvements needed (described in comments). After the improvement, review is required.
- NC: Not compliant. Needs to be redone and re-reviewed. Directions for update shown in comments.
- NA: Not applicable. Reason for non-applicability is described in comments (corresponds to numbers from 5 to 1).

Output differences for all statuses except FC, and descriptions for all statuses except NA. Present the result in the format:
id, Name, Differences, Description, status (only 2 letters)"}, {"role": "user", "content": f"id: 28561 SSTS: hotspot settings
Fun

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Сгенерированный текст: [{"role": "system", "content": "id: (It may not be)
id: (ID may be absent)
Name: name
Text: 
SSTS:
Text1
Text:
HMI or UC:
Text2

Based on these texts, it is necessary to find the differences between them and assess their similarity on the following scale:
- FC: Perfect! Nothing can be improved.
- LC: Generally correct. Some improvement may be needed (described in comments). No need for review.
- PC: Major deviations. Improvements needed (described in comments). After the improvement, review is required.
- NC: Not compliant. Needs to be redone and re-reviewed. Directions for update shown in comments.
- NA: Not applicable. Reason for non-applicability is described in comments (corresponds to numbers from 5 to 1).

Output differences for all statuses except FC, and descriptions for all statuses except NA. Present the result in the format:
id, Name, Differences, Description, status (only 2 letters)"}, {"role": "user", "content": f"id: 26161 SSTS: Automatic search
Fun

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Сгенерированный текст: [{"role": "system", "content": "id: (It may not be)
id: (ID may be absent)
Name: name
Text: 
SSTS:
Text1
Text:
HMI or UC:
Text2

Based on these texts, it is necessary to find the differences between them and assess their similarity on the following scale:
- FC: Perfect! Nothing can be improved.
- LC: Generally correct. Some improvement may be needed (described in comments). No need for review.
- PC: Major deviations. Improvements needed (described in comments). After the improvement, review is required.
- NC: Not compliant. Needs to be redone and re-reviewed. Directions for update shown in comments.
- NA: Not applicable. Reason for non-applicability is described in comments (corresponds to numbers from 5 to 1).

Output differences for all statuses except FC, and descriptions for all statuses except NA. Present the result in the format:
id, Name, Differences, Description, status (only 2 letters)"}, {"role": "user", "content": f"id: 30371 SSTS: ERA Self-diagnosis
F

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Сгенерированный текст: [{"role": "system", "content": "id: (It may not be)
id: (ID may be absent)
Name: name
Text: 
SSTS:
Text1
Text:
HMI or UC:
Text2

Based on these texts, it is necessary to find the differences between them and assess their similarity on the following scale:
- FC: Perfect! Nothing can be improved.
- LC: Generally correct. Some improvement may be needed (described in comments). No need for review.
- PC: Major deviations. Improvements needed (described in comments). After the improvement, review is required.
- NC: Not compliant. Needs to be redone and re-reviewed. Directions for update shown in comments.
- NA: Not applicable. Reason for non-applicability is described in comments (corresponds to numbers from 5 to 1).

Output differences for all statuses except FC, and descriptions for all statuses except NA. Present the result in the format:
id, Name, Differences, Description, status (only 2 letters)"}, {"role": "user", "content": f"id: 8800 SSTS: Receiving Call Notifi

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Сгенерированный текст: [{"role": "system", "content": "id: (It may not be)
id: (ID may be absent)
Name: name
Text: 
SSTS:
Text1
Text:
HMI or UC:
Text2

Based on these texts, it is necessary to find the differences between them and assess their similarity on the following scale:
- FC: Perfect! Nothing can be improved.
- LC: Generally correct. Some improvement may be needed (described in comments). No need for review.
- PC: Major deviations. Improvements needed (described in comments). After the improvement, review is required.
- NC: Not compliant. Needs to be redone and re-reviewed. Directions for update shown in comments.
- NA: Not applicable. Reason for non-applicability is described in comments (corresponds to numbers from 5 to 1).

Output differences for all statuses except FC, and descriptions for all statuses except NA. Present the result in the format:
id, Name, Differences, Description, status (only 2 letters)"}, {"role": "user", "content": f"id: 26771 SSTS: 
Turn on and off hot

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Сгенерированный текст: [{"role": "system", "content": "id: (It may not be)
id: (ID may be absent)
Name: name
Text: 
SSTS:
Text1
Text:
HMI or UC:
Text2

Based on these texts, it is necessary to find the differences between them and assess their similarity on the following scale:
- FC: Perfect! Nothing can be improved.
- LC: Generally correct. Some improvement may be needed (described in comments). No need for review.
- PC: Major deviations. Improvements needed (described in comments). After the improvement, review is required.
- NC: Not compliant. Needs to be redone and re-reviewed. Directions for update shown in comments.
- NA: Not applicable. Reason for non-applicability is described in comments (corresponds to numbers from 5 to 1).

Output differences for all statuses except FC, and descriptions for all statuses except NA. Present the result in the format:
id, Name, Differences, Description, status (only 2 letters)"}, {"role": "user", "content": f"id: 8692 SSTS: Manual dialing E-CALL

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Сгенерированный текст: [{"role": "system", "content": "id: (It may not be)
id: (ID may be absent)
Name: name
Text: 
SSTS:
Text1
Text:
HMI or UC:
Text2

Based on these texts, it is necessary to find the differences between them and assess their similarity on the following scale:
- FC: Perfect! Nothing can be improved.
- LC: Generally correct. Some improvement may be needed (described in comments). No need for review.
- PC: Major deviations. Improvements needed (described in comments). After the improvement, review is required.
- NC: Not compliant. Needs to be redone and re-reviewed. Directions for update shown in comments.
- NA: Not applicable. Reason for non-applicability is described in comments (corresponds to numbers from 5 to 1).

Output differences for all statuses except FC, and descriptions for all statuses except NA. Present the result in the format:
id, Name, Differences, Description, status (only 2 letters)"}, {"role": "user", "content": f"id: 26160 SSTS: Отсутствует UC: [I-2

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Сгенерированный текст: [{"role": "system", "content": "id: (It may not be)
id: (ID may be absent)
Name: name
Text: 
SSTS:
Text1
Text:
HMI or UC:
Text2

Based on these texts, it is necessary to find the differences between them and assess their similarity on the following scale:
- FC: Perfect! Nothing can be improved.
- LC: Generally correct. Some improvement may be needed (described in comments). No need for review.
- PC: Major deviations. Improvements needed (described in comments). After the improvement, review is required.
- NC: Not compliant. Needs to be redone and re-reviewed. Directions for update shown in comments.
- NA: Not applicable. Reason for non-applicability is described in comments (corresponds to numbers from 5 to 1).

Output differences for all statuses except FC, and descriptions for all statuses except NA. Present the result in the format:
id, Name, Differences, Description, status (only 2 letters)"}, {"role": "user", "content": f"id: 11467 SSTS: Users can remotely c

In [21]:
import re

def delete_trash(generated_text):
    return re.sub(r'\[\{"(.*?)"\}\]', '', generated_text)

# Функция для извлечения текста между метками
def extract_between_labels(text, start_label, end_label):
    pattern = rf'{re.escape(start_label)}(.*?){re.escape(end_label)}'
    match = re.search(pattern, text, re.DOTALL)
    if match:
        return match.group(1).strip()
    return ""

# Функция для извлечения текста после метки
def extract_after_label(text, label):
    pattern = rf'{re.escape(label)}(.*)'
    match = re.search(pattern, text, re.DOTALL)
    if match:
        return match.group(1).strip()
    return ""

# Пример извлечениея текста между метками и после метки
# id_content = extract_between_labels(text, "id:", "Name:")
# name_content = extract_between_labels(text, "Name:", "Differences:")
# differences_content = extract_between_labels(text, "Differences:", "Description:")
# description_content = extract_after_label(text, "Description:")


In [28]:
def add_to_csv(id_content, name_content, differences_content, description_content, compliance_level, file_path):
    # Создание DataFrame с новыми данными
    data = {
        "Number": [id_content],
        "Name": [name_content],
        "Difference": [differences_content],
        "Description": [description_content],
        "Compliance Level": [compliance_level]
    }
    df = pd.DataFrame(data)

    # Проверка существования файла и добавление данных
    try:
        # Если файл существует, загружаем его и добавляем новые данные
        existing_df = pd.read_csv(file_path)
        updated_df = pd.concat([existing_df, df], ignore_index=True)
    except FileNotFoundError:
        # Если файл не найден, создаем новый DataFrame
        updated_df = df

    # Сохранение обновленного DataFrame в CSV
    updated_df.to_csv(file_path, index=False)

In [32]:
for i in range(0, len(messages)):
    text = delete_trash(data_text[i])
    id_content = extract_between_labels(text, "id:", "Name:")
    name_content = extract_between_labels(text, "Name:", "Differences:")
    differences_content = extract_between_labels(text, "Differences:", "Description:")
    description_content = extract_between_labels(text, "Description:", "status:")
    description_content = extract_after_label(text, "status:")

    add_to_csv(id_content, name_content, differences_content, description_content, status, 'submission.csv')
    