Notebook for inference model

В данном окружении необходим датасет и веса модели

In [1]:
!pip install -r requirements.txt



Read dataset
---

---

In [1]:
import pandas as pd
from docx import Document
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
from transformers import LongformerTokenizer, LongformerForSequenceClassification
from sklearn.preprocessing import LabelEncoder

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
PATH_TO_EXCEL = "test data"
FOLDER_DOCX = "test data"

In [107]:
df = pd.DataFrame({"Number": [ 29448, 30364, 30365, 30370, 65831, 65832, 65833,  86921,
        88001, 88002,  114671, 259571, 259572, 261611, 315231]})

def read_docx(filename):
    try:
        doc = Document(filename)
        text = "\n".join([para.text for para in doc.paragraphs])
        text = text.split("\n")
        text = text[1:]
        return "\n".join(text)
    except:
        return "NAN_CODE"
    
def get_name(filename):
    try:
        doc = Document(filename)
        text = "\n".join([para.text for para in doc.paragraphs])
        text = text.split("\n")
        words = text[0].split()
        words = words[1:]
        return " ".join(words)
    except:
        return "NAN_CODE"

df['SSTS_Text'] = df['Number'].apply(lambda x: read_docx(f"./{FOLDER_DOCX}/HMI/UC-{x}.docx"))
df['HMI_Text'] = df['Number'].apply(lambda x: read_docx(f"./{FOLDER_DOCX}/SSTS/SSTS-{x}.docx"))
df['Name'] = df['Number'].apply(lambda x: get_name(f"./{FOLDER_DOCX}/HMI/UC-{x}.docx"))
df

Unnamed: 0,Number,SSTS_Text,HMI_Text,Name
0,29448,"Description: \nUse-Case Title: ""Configure heat...",users can set the battery keep warm mode on IV...,Configure heat preservation
1,30364,"Description: \nUse-Case Title: ""Stop charging ...",Functional Description\nUsers can set the maxi...,Stop charging when the cut-off SOC is reached
2,30365,"Description: \n\nUse-Case Title: ""Start the Ch...",Users can choose the starting charging or stop...,Start the Charging Process via Soft Switch
3,30370,"Description: \nUse Case: ""Stop the discharge p...",The user can select the V2L function on and of...,Stop the discharging process (Updated)
4,65831,Preconditions:\nThe IVI is on in_2 \nThe vehic...,Functional Description:\nUsers can dial throug...,Driver initiate a call through SWP
5,65832,Preconditions:\nThe IVI is on in_2 \nThe vehic...,Functional Description:\nUsers can dial throug...,Driver initiate a call through SWP
6,65833,Preconditions:\nThe IVI is on in_2 \nThe drive...,Functional Description:\nUsers can dial throug...,Driver initiate a call through SWP
7,86921,Preconditions:\nThe vehicle is in the Drive St...,Functional Description\n\n,Emergency Service Communication (ERA-Glonass)
8,88001,Preconditions\nEntertainment system is ready. ...,Functional Description:\nWhen a mobile phone c...,Receiving Call Notifications
9,88002,Preconditions\nEntertainment system is ready. ...,Functional Description:\nWhen a mobile phone c...,Receiving Call Notifications


In [108]:
df_nans = df[(df["SSTS_Text"] == "NAN_CODE") | (df["HMI_Text"] == "NAN_CODE")]
df = df[(df["SSTS_Text"] != "NAN_CODE") & (df["HMI_Text"] != "NAN_CODE")]

In [109]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

Predict Difference
---

---

In [110]:
model_path = './trained_model_diff2'

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(device)

In [111]:
def generate_difference_description(ssts_text, hmi_text):
    input_text = f"SSTS: {ssts_text} </s> HMI: {hmi_text}"
    inputs = tokenizer.encode(input_text, return_tensors='pt', truncation=True, max_length=1024).to(device)
    
    outputs = model.generate(
        inputs,
        max_length=150,
        num_beams=5,
        early_stopping=True,
        no_repeat_ngram_size=2
    )
    
    description = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    return description

In [112]:
differences = []

for _, row in df.iterrows():
    predicted_difference = generate_difference_description(row["SSTS_Text"], row["HMI_Text"])
    differences.append(predicted_difference)

In [113]:
df['Differences'] = differences

Predict Description
---

---

In [114]:
model_path = './trained_model_desc2'

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(device)

In [115]:
descriptions = []

for _, row in df.iterrows():
    predicted_difference = generate_difference_description(row["SSTS_Text"], row["HMI_Text"])
    descriptions.append(predicted_difference)

In [116]:
df['Description'] = descriptions

Predict Level
---

---

In [117]:
labels = ['FC', 'LC', 'NC', 'PC']

le = LabelEncoder()
le.fit(labels)

label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
label_mapping

{np.str_('FC'): np.int64(0),
 np.str_('LC'): np.int64(1),
 np.str_('NC'): np.int64(2),
 np.str_('PC'): np.int64(3)}

In [118]:
model_path = './trained_classifier_compl2'

tokenizer = LongformerTokenizer.from_pretrained(model_path)
model = LongformerForSequenceClassification.from_pretrained(model_path).to(device)

In [119]:
def predict_compliance_level(ssts_text, hmi_text):
    input_text = f"SSTS: {ssts_text} [SEP] HMI: {hmi_text}"
    inputs = tokenizer(input_text, return_tensors='pt', truncation=True, padding='max_length', max_length=1024).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_class_id = logits.argmax().item()
        
        predicted_label = le.inverse_transform([predicted_class_id])[0]
        return predicted_label

In [120]:
compl = []

for _, row in df.iterrows():
    compl.append(predict_compliance_level(row["SSTS_Text"], row["HMI_Text"]))

In [121]:
df["Complience Level"] = compl

Apply nans and union
---

---

In [90]:
df_nans["Differences"] = "ssts hasn't info about this"
df_nans["Description"] = "-"
df_nans["Complience Level"] = "NA"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_nans["Differences"] = "ssts hasn't info about this"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_nans["Description"] = "-"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_nans["Complience Level"] = "NA"


In [103]:
df = pd.concat([df, df_nans])

In [122]:
df = df.sort_index()

In [123]:
df = df[["Number", "Name", "Differences", "Description", "Complience Level"]]

In [124]:
df

Unnamed: 0,Number,Name,Differences,Description,Complience Level
0,29448,Configure heat preservation,SSTS misses the following content: The battery...,"HMX describes: ""The power mode of the vehicle ...",LC
1,30364,Stop charging when the cut-off SOC is reached,UC misses the following content: The system no...,The user can operate the display; he/she selec...,LC
2,30365,Start the Charging Process via Soft Switch,SSTS misses the following content: The vehicle...,"The user presses the ""Start Charging"" soft swi...",LC
3,30370,Stop the discharging process (Updated),"SSTS describes that ""the vehicle discharge sys...",The user can select the V2L function on and of...,NC
4,65831,Driver initiate a call through SWP,SSTS describes that only when the vehicle is s...,Users can dial through IVI and make phone call...,NC
5,65832,Driver initiate a call through SWP,SSTS describes that users can dial through SWP...,Users can dial through IVI and make phone call...,NC
6,65833,Driver initiate a call through SWP,SSTS describes that users can dial through SWP...,The driver navigates to the 'Calls' option on ...,LC
7,86921,Emergency Service Communication (ERA-Glonass),SSTS misses the folowing content: The vehicle ...,The safety audio system automatically mutes or...,NC
8,88001,Receiving Call Notifications,SSTS misses the following content: The output ...,HMX description: The driver can accept or reje...,LC
9,88002,Receiving Call Notifications,SSTS misses the following content: The output ...,HMX description: The driver can accept or reje...,LC


Save to (.csv)
---

---

In [None]:
df.to_csv("submission.csv", index=False)