In [1]:
import pandas as pd

# Input locations, relative to the notebook's working directory.
SENTENCES_PATH = "../data/processed/100测试语音/data.csv"
KEYWORDS_PATH = "../data/raw/Keyword list of NLP.xlsx"

# Read both sources; only the "sentence" column of the CSV is kept (as a one-column frame).
sentences = pd.read_csv(SENTENCES_PATH).loc[:, ["sentence"]]
keywords = pd.read_excel(KEYWORDS_PATH)

# Sanity check: number of keywords available in each category.
keywords.loc[:, "Category"].value_counts()

Category
SERIES      408
TYPE        273
BRAND        92
MATERIAL     42
NICKNAME     42
LINES        19
SOCIAL        3
Name: count, dtype: int64

In [2]:
# Show every keyword whose category is SOCIAL.
keywords[keywords["Category"].eq("SOCIAL")]

Unnamed: 0,Category,stdkw
603,SOCIAL,保值
604,SOCIAL,爆款
605,SOCIAL,流行款


In [3]:
def sample_sentence(data: pd.DataFrame, n: int, random_state=None) -> list[dict]:
    """
    Sample n records from sentence data and return them as a list of dictionaries.

    :param data: a dataframe; each sampled row becomes one dictionary keyed by column name
    :param n: the number of returned samples (pandas raises ValueError if n exceeds
        the number of rows, because sampling is done without replacement)
    :param random_state: optional seed (or any value accepted by ``DataFrame.sample``)
        that makes the draw reproducible; the default ``None`` keeps the original
        unseeded behaviour
    :return: a list of dictionaries, one per sampled row
    """
    return data.sample(n, random_state=random_state).to_dict(orient="records")


def sample_keyword(data: pd.DataFrame, n: int, stratify: bool, random_state=None) -> list[dict]:
    """
    Sample records from keywords data and return them as a list of dictionaries.

    :param data: keywords dataframe; must contain a "Category" column
    :param n: number of samples if stratify is False; number of samples per category
        if stratify is True (except the SOCIAL category, which is returned as a whole).
        In the stratified case a category holding fewer than n rows contributes all of
        its rows instead of raising an error.
    :param stratify: indicate whether to stratify the samples by category
    :param random_state: optional seed (or any value accepted by ``DataFrame.sample``)
        that makes the draw reproducible; the default ``None`` keeps the draw unseeded
    :return: a list of dictionaries; when stratified, all SOCIAL rows come first,
        followed by the sampled rows grouped by category in sorted category order
    :raises ValueError: if n is negative, or (non-stratified only) if n exceeds the
        number of rows
    """
    if not stratify:
        return data.sample(n, random_state=random_state).to_dict(orient="records")

    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}.")

    # SOCIAL is always returned in full, in its original row order.
    is_social = data["Category"] == "SOCIAL"
    records = data.loc[is_social].to_dict(orient="records")

    # Shuffle the remaining rows once, then keep the first n rows of every category.
    # The first n rows of a uniformly shuffled group are a uniform sample without
    # replacement, and head(n) naturally stops at the group size, so small
    # categories no longer make the whole call fail.
    shuffled = data.loc[~is_social].sample(frac=1, random_state=random_state)
    per_category = shuffled.groupby("Category", sort=False).head(n)

    # Stable sort keeps the shuffled order inside each category while grouping
    # the output by category.
    per_category = per_category.sort_values("Category", kind="stable")
    records.extend(per_category.to_dict(orient="records"))

    return records

In [4]:
# Example: draw three random sentence records.
sample_sentence(data=sentences, n=3)

[{'sentence': '不过买过nano speedy有整理需求，喜欢老花，可推荐皮质包包。'},
 {'sentence': '新疆人喜欢皮质款，喜欢黑色包，喜欢大包。'},
 {'sentence': '喜欢黑色小包，喜欢逸时这瓶香水。'}]

In [5]:
# Example: one keyword per category, plus the whole SOCIAL category.
sample_keyword(data=keywords, n=1, stratify=True)

[{'Category': 'SOCIAL', 'stdkw': '保值'},
 {'Category': 'SOCIAL', 'stdkw': '爆款'},
 {'Category': 'SOCIAL', 'stdkw': '流行款'},
 {'Category': 'BRAND', 'stdkw': 'off white'},
 {'Category': 'LINES', 'stdkw': '荔枝纹'},
 {'Category': 'MATERIAL', 'stdkw': '布艺'},
 {'Category': 'NICKNAME', 'stdkw': '月光女神'},
 {'Category': 'SERIES', 'stdkw': 'sunny'},
 {'Category': 'TYPE', 'stdkw': '裙装'}]

In [6]:
# connect to DeepSeek model
import os

from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# The API key is read from the DEEPSEEK_API_KEY environment variable so that no
# credential is stored in the notebook. A missing variable fails fast with a
# KeyError instead of sending an invalid request.
# NOTE(review): a key was previously hard-coded in this cell; treat it as leaked
# and revoke it.
model = ChatOpenAI(
    model='deepseek-chat',
    openai_api_key=os.environ["DEEPSEEK_API_KEY"],
    openai_api_base='https://api.deepseek.com',
    max_tokens=1024
)

# Converts the chat model's message output into a plain string.
parser = StrOutputParser()

In [7]:
def list2prompt(l: list[dict], data_type: str, category: str = "") -> str:
    """
    Convert a list of dictionaries to a string prompt.

    Values are converted with ``str`` before joining, so non-string cell values
    (for example a numeric keyword read from a spreadsheet) do not raise TypeError.

    :param l: a list of dictionaries; "sentence" records need a "sentence" key,
        "keyword" records need "stdkw" and "Category" keys
    :param data_type: can be either "sentence" or "keyword"
    :param category: used when data_type is "keyword"; compared case-insensitively
        (it is upper-cased) against each record's "Category"
    :return: sentences joined by newlines, or the matching keywords joined by commas
    :raises NotImplementedError: if data_type is neither "sentence" nor "keyword"
    """
    if data_type == "sentence":
        return "\n".join(str(d["sentence"]) for d in l)

    if data_type == "keyword":
        # Upper-case once instead of once per record.
        wanted = category.upper()
        return ",".join(str(d["stdkw"]) for d in l if d["Category"] == wanted)

    raise NotImplementedError(f"Data type {data_type} is not supported.")

In [54]:
# define prompt template
system_prompt = """现在我有一些用于培训的示范用语，这些语言简单地对客户进行描述。请参考以下这些语言示范，并考虑现有的产品信息，深呼吸，思考一下，你能举出其他的例子吗？这些例子应该尽可能贴近原有示范，产品相关信息请严格按照我们提供的产品列表。"""

user_prompt = """以下是供你参考的产品信息（属性）列表：

社交属性：{SOCIAL}
品牌：{BRAND}
纹路：{LINES}
材料：{MATERIAL}
别名：{NICKNAME}
系列：{SERIES}
类型：{TYPE}

以下是一些示例：
{SENTENCE}

请基于例子中的模板，生成{n_sentences}个新的句子。内容和风格都应该基于示例。
"""

# user_prompt = """以下是供你参考的产品信息（属性）列表：
# 
# 社交属性：{SOCIAL}
# 品牌：{BRAND}
# 纹路：{LINES}
# 材料：{MATERIAL}
# 昵称：{NICKNAME}
# 系列：{SERIES}
# 类型：{TYPE}
# 
# 请帮助我模仿下面的句子的语言风格，替换掉相关的产品属性。句子长度没有固定要求，但要保持自然流畅，可以仅使用产品的部分属性。以下是一些示例：
# {SENTENCE}
# 请继续根据这些示例，生成{n_sentences}个新的句子。
# """

# Pair each chat role with its template text; placeholders are resolved at invoke time.
prompt_template = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("user", user_prompt),
])

# a = sample_sentence(sentences, 4)
# b = sample_keyword(keywords, 2, stratify=True)

# input_prompt = prompt_template.invoke({
#     "SOCIAL": list2prompt(b, "keyword", "SOCIAL"),
#     "BRAND": list2prompt(b, "keyword", "BRAND"),
#     "LINES": list2prompt(b, "keyword", "LINES"),
#     "MATERIAL": list2prompt(b, "keyword", "MATERIAL"),
#     "NICKNAME": list2prompt(b, "keyword", "NICKNAME"),
#     "SERIES": list2prompt(b, "keyword", "SERIES"),
#     "TYPE": list2prompt(b, "keyword", "TYPE"),
#     "SENTENCE": list2prompt(sample_sentence(sentences, 3), "sentence"),
#     "n_sentences": "1"
# })

In [55]:
def chain_pipeline(llm_prompt_template, llm_model, llm_parser, debug: bool = False,
                   n_example_sentences: int = 8, n_keywords_per_category: int = 8,
                   n_sentences: int = 1) -> str:
    """
    Build a prompt | model | parser chain, feed it freshly sampled examples, and
    return the generated text.

    Example sentences and keywords are drawn from the notebook-level ``sentences``
    and ``keywords`` dataframes on every call, so each call uses a different prompt.

    :param llm_prompt_template: prompt template with one placeholder per keyword
        category plus ``SENTENCE`` and ``n_sentences``
    :param llm_model: model for generating text
    :param llm_parser: parser for parsing the output
    :param debug: print the sampled sentences and keywords before calling the model
    :param n_example_sentences: how many example sentences to put in the prompt
    :param n_keywords_per_category: how many keywords to sample per category
        (the SOCIAL category is always included in full by ``sample_keyword``)
    :param n_sentences: how many new sentences the prompt asks the model to generate
    :return: model output in string format
    """
    # generate samples as model prompt
    example_sentences = sample_sentence(sentences, n_example_sentences)
    example_keywords = sample_keyword(keywords, n_keywords_per_category, stratify=True)

    if debug:
        print(f"example sentences: {example_sentences}")
        print(f"example keywords: {example_keywords}")

    # One comma-separated keyword string per category placeholder in the template.
    categories = ("SOCIAL", "BRAND", "LINES", "MATERIAL", "NICKNAME", "SERIES", "TYPE")
    prompt_inputs = {
        category: list2prompt(example_keywords, "keyword", category)
        for category in categories
    }
    prompt_inputs["SENTENCE"] = list2prompt(example_sentences, "sentence")
    # Passed as a string, matching how the value was originally supplied to the template.
    prompt_inputs["n_sentences"] = str(n_sentences)

    chain = llm_prompt_template | llm_model | llm_parser
    return chain.invoke(prompt_inputs)

In [64]:
# Single generation with the sampled examples printed for inspection.
chain_pipeline(
    llm_prompt_template=prompt_template,
    llm_model=model,
    llm_parser=parser,
    debug=True,
)

example sentences: [{'sentence': '买过capucines喜欢，喜欢color blossom项链，买过acc，喜欢路易威登，买过gucci买过coach，nano speedy到货提醒，自用赠礼的需求。'}, {'sentence': '常州人孕妈，喜欢新款，喜欢爆款，喜欢大象灰，需要发票，在意价格。'}, {'sentence': '无锡人喜欢逛小红书，喜欢经典，老花喜欢实用，喜欢托特喜欢speedy20。'}, {'sentence': '客人45岁左右，年轻时尚，买了speedy20喜欢灰色，喜欢牛皮，给女儿买了情人节限定手链喜欢acc，喜欢耳钉。'}, {'sentence': '客人长住江苏自用买过爱马仕，喜欢嗯monogram想看男鞋腰带。'}, {'sentence': '喜欢经典款，喜欢休闲，喜欢法棍，喜欢老花，喜欢经典款，喜欢channel。'}, {'sentence': '昭通人喜欢老花产品，买过side trunk, 有赠礼需求，喜欢男鞋喜欢香水。'}, {'sentence': '05后身高1米65，苗条签售喜欢on the go小号喜欢老花喜欢cannes'}]
example keywords: [{'Category': 'SOCIAL', 'stdkw': '保值'}, {'Category': 'SOCIAL', 'stdkw': '爆款'}, {'Category': 'SOCIAL', 'stdkw': '流行款'}, {'Category': 'BRAND', 'stdkw': 'dj'}, {'Category': 'BRAND', 'stdkw': 'roger vivier'}, {'Category': 'BRAND', 'stdkw': '兰蔻'}, {'Category': 'BRAND', 'stdkw': 'max mara'}, {'Category': 'BRAND', 'stdkw': '万国'}, {'Category': 'BRAND', 'stdkw': 'rolex'}, {'Category': 'BRAND', 'stdkw': '杜嘉班纳'}, {'Category': 'BRAND', 'stdkw': '积家'}, {'Category': 'LINE

'南京客人，30岁职场女性，钟爱Roger Vivier经典方扣鞋，追求时尚与舒适并重，近期关注Max Mara的羊毛大衣，偏爱千禧年缎面纹路，有自用和送礼双重需求，特别注重产品的保值性。'

In [37]:
# generate 100 test sample
import joblib

N_GENERATED_SAMPLES = 100

# Append one result at a time instead of building the list in a single
# comprehension: if the run is interrupted or a request fails part-way, `r`
# still holds every generation completed so far rather than being left undefined.
r = []
for _ in range(N_GENERATED_SAMPLES):
    r.append(chain_pipeline(prompt_template, model, parser, debug=False))

# Reached only after a complete run, so an interrupted run never overwrites an
# earlier full dump.
joblib.dump(r, "deepseek_100.pkl")

KeyboardInterrupt: 

In [None]:
# deprecated code snippet
# a = keywords.groupby("Category", group_keys=True).apply(lambda x: x.sample(2), include_groups=False) # .to_dict(orient="records")
# a.index = a.index.droplevel(1)
# a.reset_index(drop=False, inplace=True)
# a.to_dict(orient="records")

In [19]:
# Show the generated sentences as a one-column table.
pd.Series(r, name="generate").to_frame()

Unnamed: 0,generate
0,这位客人穿着简约时尚，对品牌如off white和诺悠翩雅有独到见解，偏好流行款和保值单品，...
1,这位客户，品味独特，钟情于纪梵希的设计，偏爱其标志性的棋盘格纹路，尤其对那款以鳄鱼皮制成、昵...
2,都市精英男士，注重品质与时尚，偏爱乔治阿玛尼的鳄鱼皮风琴包，追求保值与流行款，展现其独特的社...
3,拥有过Classic Flap，钟情于保值，偏爱小众设计师品牌，追求独特，喜欢蕾丝纹路，注重...
4,客人偏好Fendi，对保值的社交属性感兴趣，尤其是带有马赛克纹路的爆款，如诺悠翩雅的软箱系列...
...,...
95,喜欢夜光纹路的江诗丹顿手表，搭配18k金材质，彰显低调奢华。
96,这位顾客偏好经典与时尚并重的款式，她钟情于梵克雅宝的精致工艺，特别青睐那款采用鳄鱼皮制作的月...
97,喜欢棋盘格，喜欢简约风，喜欢羊皮材质，喜欢马蹄包，喜欢低调奢华。
98,一位30岁的时尚博主，偏爱独特设计，对纪梵希的棋盘格系列情有独钟，尤其钟情于其羊绒材质的短袖...
