In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from modelscope import snapshot_download
# Obtain a local path for the Yi-1.5-9B-Chat checkpoint through ModelScope.
model_path = snapshot_download('01ai/Yi-1.5-9B-Chat')

# Load the tokenizer from that path, explicitly requesting the non-fast implementation.
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)

# Load the causal LM, letting the library choose device placement and dtype.
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", torch_dtype='auto')
# Switch to evaluation mode: the notebook only runs inference.
model = model.eval()

In [None]:
# Load the spreadsheet of owner comments into a DataFrame
import pandas as pd

df = pd.read_excel(io="comments_test.xlsx")
# Show the first five rows as a quick sanity check of the loaded sheet.
df.head(n=5)

In [None]:
# Tidy Data
def clean_comments(raw_comments):
    """Return the usable comment strings from the raw 'comments' column values.

    Each entry is processed in a single pass:
      * non-string entries are skipped (empty Excel cells arrive as float NaN,
        and pandas parses a literal "NA" cell as NaN by default, so the
        substring test below would otherwise raise TypeError);
      * entries containing the substring "NA" are dropped;
      * newlines are removed and surrounding whitespace is stripped;
      * entries that end up empty, or that consist of a lone double quote,
        are dropped.

    Parameters
    ----------
    raw_comments : iterable
        Raw cell values, e.g. ``df['comments'].tolist()``.

    Returns
    -------
    list of str
        Cleaned comments, in their original order.
    """
    cleaned = []
    for raw in raw_comments:
        if not isinstance(raw, str):
            # NaN / numeric cells carry no comment text.
            continue
        # NOTE(review): this is a case-sensitive *substring* test, so a real
        # comment that merely contains "NA" is dropped too - confirm that is
        # intended rather than an exact-match placeholder filter.
        if "NA" in raw:
            continue
        text = raw.replace('\n', '').strip()
        if not text or text == '"':
            continue
        cleaned.append(text)
    return cleaned


comments = clean_comments(df['comments'].tolist())
print(comments)

In [4]:

# Few-shot example block that is interpolated verbatim into every prompt sent
# to the model. Each example is a quoted comment followed by a
# "Vehicle components:" line; the inference loop searches the model's reply
# for that same "Vehicle components" key word, so the two must stay in sync.
# NOTE(review): the second example's comment has no closing double quote and
# its component line has no trailing period, unlike the first - confirm
# whether that is intentional before editing, since any change to this text
# changes the prompt.
example = """
Comment: "I have purchased 2021 ID6 Crozz Long Range Pro model, and it has been more than 2 years since I bought the car. The car instrument and on-board Internet system have repeatedly failed."
Vehicle components: instrument cluster, Internet.
Comment: "However, in less than half a year, the driving assistance system malfunctioned again in November 2022. After inspection by the 4S dealership, it was still believed to be a steering wheel malfunction, and the steering wheel was replaced for the second time
Vehicle components: Assisted driving system
"""

In [None]:
# Main part for Yi
def extract_components(response, key_word="Vehicle components"):
    """Extract the component list from a raw model reply.

    Looks for the first occurrence of ``key_word`` and returns the text that
    follows it (skipping the one separator character after the key word),
    cut at the first newline when that newline is not the very first
    character. If the key word is absent, the reply is returned unchanged.

    Parameters
    ----------
    response : str
        Decoded text generated by the model.
    key_word : str
        Marker that precedes the component list in the reply.

    Returns
    -------
    str
        The extracted component text, or the whole reply as a fallback.
    """
    start_index = response.find(key_word)
    if start_index == -1:
        return response
    # +1 skips the separator (":") that directly follows the key word.
    tail = response[start_index + len(key_word) + 1:]
    end_index = tail.find('\n')
    # The original condition `end_index != -1 & end_index != 0` parsed as
    # `end_index != (-1 & end_index) != 0`, which is always False, so replies
    # were never truncated at the first newline. A plain `> 0` test expresses
    # the intended "newline found and not at position 0".
    if end_index > 0:
        return tail[:end_index]
    return tail


# Start from an empty list on every run: the name was previously never
# initialised (NameError on a fresh kernel), and re-running the cell must not
# append duplicates on top of earlier results.
responses = []

for comment in comments:
    # NOTE(review): the prompt asks for 'vehicle component:' (lower-case,
    # singular) while the few-shot example and the parser use
    # "Vehicle components"; replies that follow the instruction literally
    # fall through to the whole-reply fallback.
    prompt = f"""
    The following paragraph is the evaluation of new energy vehicle owners on their vehicles. 
    Your task is to extract intelligent components belonging to the three major fields of automobiles mentioned in {comment}, without the need to extract additional automotive components. 
    Here is an example{example}. 
    Please write in the form of 'comment: {comment}, vehicle component:'. If no vehicle component is recognized, output 'no component' directly. No explanation is needed.
    """

    messages = [
            {"role": "user", "content": prompt}
        ]

    input_ids = tokenizer.apply_chat_template(conversation=messages, tokenize=True, return_tensors='pt')
    # Send the inputs to wherever the model was placed (it is loaded with
    # device_map="auto") instead of assuming a CUDA device.
    output_ids = model.generate(input_ids.to(model.device), eos_token_id=tokenizer.eos_token_id, max_new_tokens=150)
    # Decode only the newly generated tokens, i.e. everything after the prompt.
    response = tokenizer.decode(output_ids[0][input_ids.shape[1]:], skip_special_tokens=True)

    components = extract_components(response)
    responses.append(components)
    print("Comment: "+comment)
    print("\n")
    print("Vehicle components："+components)
    print("="*80)


In [None]:
# save output
import pandas as pd

# 1-based running index, one entry per collected response.
idx = range(1, len(responses) + 1)

# Column layout of the results table: index, source comment, extracted components.
output_columns = {
    'idx': list(idx),
    'comment': comments,
    'Component': responses,
}
output = pd.DataFrame(output_columns)
output


In [13]:
# Write the results table to output.xlsx, leaving out the DataFrame index column.
output.to_excel(excel_writer='output.xlsx', index=False)

In [None]:
import pandas as pd

# Workbook to load, and the columns of it that are kept below.
file_path = 'Summary and subdivision of components in three major fields.xlsx'
selected_columns = ['Unified Name', 'Vehicle Components', 'Three Major Fields']

# Read the whole sheet first (no usecols); the column subset is taken afterwards.
# Note that this rebinds `df`, replacing the comments frame loaded earlier.
df = pd.read_excel(file_path)

# Restrict to the chosen columns and discard rows that are NaN in all of them.
df_selected = df.loc[:, selected_columns].dropna(how='all')

df_selected
