首先，提示词里，先判断是不是研究性的文章还是综述性的文章，如果是研究性的文章，那么判断是不是钠电正极的文章，如果是的，再判断是不是掺杂改性的，如果是的，再判断用的是什么元素改性的。以及是什么正极。最后将所有的这些东西保存至excel中。

In [29]:
import ollama  
import csv
import pandas as pd
from tqdm import tqdm  # 导入进度条库

# 提取和解析结果
def parse_analysis_result(result, is_research, is_sodium):
    print(f"模型返回: {result}")  # 调试：输出模型原始返回内容
    dopant_elements = extract_after_keyword(result, "Dopant elements used:") or "未指定"  # 提取掺杂元素
    is_doped = "未指定" not in dopant_elements  # 判断是否为掺杂文章

    parsed_data = {
        "Dopant_Elements": dopant_elements,  # 掺杂元素
        "Cathode_Material": extract_after_keyword(result, "Cathode material used:") or "未指定",  # 正极材料
        "Model_Response": result  # 保留模型的原始输出
    }
    
    # 根据文章类型判断，返回不同的信息
    if not is_research:
        return {**parsed_data, "Article_Type": "综述性文章", "Is_Sodium": "否", "Is_Doped": "否"}
    if not is_sodium:
        return {**parsed_data, "Article_Type": "非钠电正极文章", "Is_Sodium": "否", "Is_Doped": "未知"}
    return {**parsed_data, 
            "Article_Type": "研究性文章", 
            "Is_Sodium": "是", 
            "Is_Doped": "是" if is_doped else "否"}

# 直接查找关键词并返回其后的内容
def extract_after_keyword(text, keyword):
    try:
        start_index = text.lower().find(keyword.lower())  # 忽略大小写查找关键词
        if start_index != -1:
            return text[start_index + len(keyword):].split('\n')[0].strip()  # 提取关键词后的内容
        return "未知"  # 如果找不到关键词，返回“未知”
    except Exception as e:
        print(f"提取 {keyword} 后出错: {e}")
        return "未知"

# 调用 Ollama 模型进行分析
def analyze_abstract_with_ollama(prompt):
    response = ollama.chat(
        model="llama3.1:latest",
        stream=False,
        messages=[{"role": "user", "content": prompt}],
        options={"temperature": 0}
    )
    if "message" in response and "content" in response["message"]:
        return response["message"]["content"]  # 返回有效内容
    else:
        return "未返回有效内容"

# 生成提示词
def generate_prompt(abstract):
    prompt = f"""
    You are a research assistant specialized in analyzing scientific papers about sodium-ion battery cathodes and elemental doping strategies.
    Your task is to extract the following precise information from the research article abstract provided below:

    1. Article Type: Determine if this is a research article or a review article.
    2. If it is a research article, determine if it discusses sodium-ion battery cathodes.
    3. If it discusses sodium-ion battery cathodes, identify the dopant elements used. List only the elements used for doping (e.g., Al, Cu).
    4. Also, identify the cathode material used in the study.

    Provide only the information requested, nothing more, and make the response concise.

    Abstract: {abstract}
    """
    return prompt

# 处理文献
def process_documents(documents):
    results = []
    for doc in tqdm(documents, desc="处理文献", unit="篇"):  # 显示进度条
        prompt = generate_prompt(doc["abstract"])  # 生成提示词
        analysis_result = analyze_abstract_with_ollama(prompt)  # 调用模型分析
        
        # 解析返回的结果并判断类型
        is_research = "research article" in analysis_result.lower()  # 判断是否为研究性文章
        is_sodium = "sodium" in analysis_result.lower()  # 判断是否涉及钠
        parsed_result = parse_analysis_result(analysis_result, is_research, is_sodium)  # 解析结果
        
        results.append({
            "标题": doc["title"],  # 添加标题
            "摘要": doc["abstract"],  # 添加摘要
            "掺杂元素": parsed_result["Dopant_Elements"],  # 添加掺杂元素
            "正极材料": parsed_result["Cathode_Material"],  # 添加正极材料
            "文章类型": parsed_result["Article_Type"],  # 添加文章类型
            "是否是钠电正极材料": parsed_result["Is_Sodium"],  # 添加是否是钠电正极材料
            "是否使用了元素掺杂": parsed_result["Is_Doped"],  # 添加是否使用了元素掺杂
            "模型响应": parsed_result["Model_Response"]  # 添加模型响应
        })
    return results

# 保存结果到 CSV 文件
def save_results_to_csv(results, output_file):
    df = pd.DataFrame(results)  # 将结果转换为 DataFrame
    df.to_csv(output_file, index=False, encoding='utf-8-sig')  # 保存为 CSV 文件，支持中文字符

# 读取文献文件（假设文献格式为 "标题\t摘要"）
def read_documents(file_path):
    documents = []
    with open(file_path, 'r', encoding='utf-8') as file:
        reader = csv.reader(file, delimiter='\t')  # 使用制表符分隔
        for row in reader:
            title, abstract = row  # 获取标题和摘要
            documents.append({"title": title, "abstract": abstract})  # 添加到文献列表
    return documents

# 主流程
if __name__ == "__main__":
    documents = read_documents('Sodium_cathode_doping_Layered_oxide.txt')  # 读取文献
    results = process_documents(documents)  # 处理文献
    save_results_to_csv(results, 'Sodium_cathode_doping_Layered_oxide_ollama文献分析结果.csv')  # 保存结果到 CSV 文件

    print("文献分析已完成，结果已保存到 ollama文献分析结果.csv")  # 输出完成信息


模型返回: I'm ready to assist you. Please provide the research article abstract. I'll extract the precise information as per your request.


处理文献:  99%|█████████▉| 723/729 [20:50<00:06,  1.06s/篇]

模型返回: I'm ready to assist you. Please provide the research article abstract. I'll extract the precise information as per your request.


处理文献:  99%|█████████▉| 724/729 [20:51<00:05,  1.19s/篇]

模型返回: 1. Article Type: Research article
2. Discusses sodium-ion battery cathodes: No
3. Dopant elements used: None mentioned
4. Cathode material used in the study: CoFeP (CoFe phosphide)


处理文献:  99%|█████████▉| 725/729 [20:53<00:05,  1.40s/篇]

模型返回: 1. Article Type: Review article
2. No, it does not discuss sodium-ion battery cathodes (it discusses lithium-ion batteries)
3. None (no dopant elements mentioned)
4. LiNi(1/3)Mn(1/3)Co(1/3)O(2)


处理文献: 100%|█████████▉| 726/729 [20:55<00:04,  1.43s/篇]

模型返回: Here is the extracted information:

1. Article Type: Research article
2. Sodium-ion battery cathodes: No (discusses lithium-ion batteries)
3. Dopant elements: None mentioned
4. Cathode material: Not discussed (article discusses anode materials)


处理文献: 100%|█████████▉| 727/729 [20:55<00:02,  1.19s/篇]

模型返回: I'm ready to assist you. Please provide the research article abstract. I'll extract the precise information as per your request.


处理文献: 100%|█████████▉| 728/729 [20:57<00:01,  1.24s/篇]

模型返回: 1. Article Type: Research article
2. Discusses sodium-ion battery cathodes: No
3. Dopant elements: None mentioned
4. Cathode material: Na2Zn2TeO6 (NZTO)


处理文献: 100%|██████████| 729/729 [20:59<00:00,  1.73s/篇]

模型返回: Here is the extracted information:

1. Article Type: Review article
2. Research article (if applicable): No
3. Dopant elements used: N, P
4. Cathode material used in the study: Phosphorene-graphene hybrid material
文献分析已完成，结果已保存到 ollama文献分析结果.csv





In [31]:
import pandas as pd
import re
from tkinter import Tk
from tkinter.filedialog import askopenfilename

# 读取CSV文件
def select_file():
    Tk().withdraw()  # 隐藏主窗口
    file_path = askopenfilename(title="选择CSV文件", filetypes=[("CSV Files", "*.csv")])  # 弹出文件选择窗口
    return file_path

# 使用正则表达式提取元素符号，忽略括号中的内容
def extract_element_symbols(text):
    # 使用正则表达式提取大写字母开头，跟着小写字母的元素符号，如Al, Cr, Fe等
    return re.findall(r'\b[A-Z][a-z]?\b', text)

# 主流程
if __name__ == "__main__":
    # 弹出对话框让用户选择文件
    file_path = select_file()
    
    if file_path:  # 如果用户选择了文件
        df = pd.read_csv(file_path)

        # 将掺杂元素列应用正则表达式进行提取
        df['提取的掺杂元素'] = df['掺杂元素'].apply(extract_element_symbols)

        # 将所有元素展平成单一列表
        all_elements = [elem for sublist in df['提取的掺杂元素'] for elem in sublist]

        # 统计每个元素出现的频率
        element_frequency = pd.Series(all_elements).value_counts()

        # 保存到CSV文件
        output_file = '正则化提取的掺杂元素频率统计.csv'
        element_frequency.to_csv(output_file, header=['Frequency'])

        print(f"正则化提取的掺杂元素频率已保存到文件: {output_file}")
    else:
        print("未选择文件，程序终止。")


归一化且拆分后的掺杂元素频率已保存到文件: Sodium_cathode_doping_Layered_oxide_ollama文献分析结果_归一化且拆分后的掺杂元素频率统计.csv


In [32]:
import pandas as pd
import re

# 读取CSV文件
file_path = 'Sodium_cathode_doping_Layered_oxide_ollama文献分析结果.csv'
df = pd.read_csv(file_path)

# 使用正则表达式提取元素符号，忽略括号中的内容
def extract_element_symbols(text):
    # 使用正则表达式提取大写字母开头，跟着小写字母的元素符号，如Al, Cr, Fe等
    return re.findall(r'\b[A-Z][a-z]?\b', text)

# 将掺杂元素列应用正则表达式进行提取
df['提取的掺杂元素'] = df['掺杂元素'].apply(extract_element_symbols)

# 将所有元素展平成单一列表
all_elements = [elem for sublist in df['提取的掺杂元素'] for elem in sublist]

# 统计每个元素出现的频率
element_frequency = pd.Series(all_elements).value_counts()

# 保存到CSV文件
output_file = '正则化提取的掺杂元素频率统计.csv'
element_frequency.to_csv(output_file, header=['Frequency'])

print(f"正则化提取的掺杂元素频率已保存到文件: {output_file}")


正则化提取的掺杂元素频率已保存到文件: 正则化提取的掺杂元素频率统计.csv


In [33]:
import pandas as pd
import re

# 读取CSV文件
file_path = 'Sodium_cathode_doping_Layered_oxide_ollama文献分析结果.csv'
df = pd.read_csv(file_path)

# 定义一个函数，从"模型响应"列的"3."之后提取元素
def extract_elements_from_model(text):
    # 使用正则表达式找到"3."后面的部分，直到出现"4."为止
    match = re.search(r'3\.\s*([^4]+)', text)
    if match:
        # 使用正则表达式提取元素符号（如Al、Cr等）
        return re.findall(r'\b[A-Z][a-z]?\b', match.group(1))
    return []

# 对"模型响应"列应用该函数来提取元素
df['提取的元素'] = df['模型响应'].apply(extract_elements_from_model)

# 将所有提取的元素展平成单一列表
all_elements_model = [elem for sublist in df['提取的元素'] for elem in sublist]

# 统计每个元素的频率
element_frequency_model = pd.Series(all_elements_model).value_counts()

# 保存结果到CSV文件
output_file = '提取的元素频率统计.csv'
element_frequency_model.to_csv(output_file, header=['Frequency'])

print(f"提取的元素频率已保存到文件: {output_file}")


提取的元素频率已保存到文件: 提取的元素频率统计.csv


In [None]:
import ollama
import csv
import pandas as pd

# 简化的解析函数，基于明确的关键词提取结果
def parse_analysis_result(result):
    print(f"Model Response: {result}")  # 调试：输出模型原始返回内容
    parsed_data = {
        "Dopant_Elements": extract_after_keyword(result, "Dopant Elements:"),
        "Cathode_Material": extract_after_keyword(result, "Cathode Material:"),
        "Doping_Strategy": extract_after_keyword(result, "Doping Strategy:"),
        "Performance_Improvement": extract_after_keyword(result, "Performance Improvement:"),
        "Model_Response": result  # 保留模型的原始输出
    }
    return parsed_data

# 直接查找关键词并返回其后的内容
def extract_after_keyword(text, keyword):
    try:
        # 查找关键词并返回其后面的内容
        start_index = text.lower().find(keyword.lower())  # 忽略大小写
        if start_index != -1:
            # 从关键词之后开始截取
            return text[start_index + len(keyword):].split('\n')[0].strip()
        return "Unknown"
    except Exception as e:
        print(f"Error extracting after {keyword}: {e}")
        return "Unknown"

# Step 2: 调用 Ollama 本地模型
def analyze_abstract_with_ollama(prompt):
    response = ollama.chat(
        model="llama3.1:latest",  # 替换为你下载的模型名称
        stream=False,
        messages=[{"role": "user", "content": prompt}],
        options={"temperature": 0}
    )

    # 从 response 中提取 'message' 和 'content' 字段
    if "message" in response and "content" in response["message"]:
        return response["message"]["content"]
    else:
        return "No valid content returned"

# Step 3: 生成提示词，确保只返回所需信息
def generate_prompt(abstract):
    prompt = f"""
    You are a research assistant specialized in analyzing scientific papers about sodium-ion battery cathodes and elemental doping strategies.
    Your task is to extract the following precise information from the research article abstract provided below:
    
    1. Dopant Elements: Only list the elements that were used as dopants, without additional explanation.
    2. Cathode Material: Only list the sodium cathode material used in the study.
    3. Doping Strategy: Briefly describe the doping method (e.g., solid-state reaction, etc.).
    4. Performance Improvement: Summarize the performance improvement in one line if applicable.
    
    Provide only the information requested, nothing more, and make the response concise.
    
    Abstract: {abstract}
    """
    return prompt

# Step 4: 批量处理文献
def process_documents(documents):
    results = []
    for doc in documents:
        prompt = generate_prompt(doc["abstract"])
        analysis_result = analyze_abstract_with_ollama(prompt)
        
        # 解析返回的结果
        parsed_result = parse_analysis_result(analysis_result)
        
        # 将每篇文献的标题、摘要和解析结果存储到结果列表
        results.append({
            "Title": doc["title"],
            "Abstract": doc["abstract"],
            "Dopant_Elements": parsed_result["Dopant_Elements"],
            "Cathode_Material": parsed_result["Cathode_Material"],
            "Doping_Strategy": parsed_result["Doping_Strategy"],
            "Performance_Improvement": parsed_result["Performance_Improvement"],
            "Model_Response": parsed_result["Model_Response"]  # 新增模型回答列
        })
    return results

# Step 5: 保存结果到 CSV 文件
def save_results_to_csv(results, output_file):
    df = pd.DataFrame(results)
    df.to_csv(output_file, index=False)

# 读取文献文件 (假设文献格式为 "标题\t摘要")
def read_documents(file_path):
    documents = []
    with open(file_path, 'r', encoding='utf-8') as file:
        reader = csv.reader(file, delimiter='\t')
        for row in reader:
            title, abstract = row
            documents.append({"title": title, "abstract": abstract})
    return documents

# 主流程
if __name__ == "__main__":
    # 假设文献存储在 'documents.txt' 中，文献格式为 "标题\t摘要"
    documents = read_documents('SIB.txt')
    
    # 处理文献
    results = process_documents(documents)
    
    # 将处理结果保存到 CSV
    save_results_to_csv(results, 'ollama_literature_analysis_with_model_response.csv')

    print("文献分析已完成，结果已保存到 ollama_literature_analysis_with_model_response.csv")


In [None]:
import ollama
import csv
import pandas as pd

# 解析 Ollama 的返回内容，提取每个问题的答案
def parse_analysis_result(result):
    parsed_data = {
        "HEAs": "Unknown",
        "HER": "Unknown",
        "Catalysis": "Unknown",
        "Article_Type": "Unknown"
    }

    # 查找每个问题的答案并更新相应字段
    if "High Entropy Materials" in result:
        parsed_data["HEAs"] = "Yes" if "Yes" in result or "Involved: Yes" in result else "No"
    
    if "Hydrogen Evolution Reaction" in result:
        parsed_data["HER"] = "Yes" if "Yes" in result or "Involved: Yes" in result else "No"
    
    if "Catalysis research" in result:
        parsed_data["Catalysis"] = "Yes" if "Yes" in result or "Involved: Yes" in result else "No"

    if "Type of Article" in result:
        if "Research Article" in result:
            parsed_data["Article_Type"] = "Research Article"
        elif "Review Article" in result:
            parsed_data["Article_Type"] = "Review Article"

    return parsed_data

# Step 2: 调用 Ollama 本地模型
def analyze_abstract_with_ollama(prompt):
    response = ollama.chat(
        model="llama3.1:latest",  # 替换为你下载的模型名称，如 llama3.1:latest
        stream=False,
        messages=[{"role": "user", "content": prompt}],
        options={"temperature": 0}
    )

    # 从 response 中提取 'message' 和 'content' 字段
    if "message" in response and "content" in response["message"]:
        return response["message"]["content"]
    else:
        return "No valid content returned"

# Step 3: 生成提示词
def generate_prompt(abstract):
    prompt = f"""
    You are a research chemist specialized in High Entropy Materials (HEAs), Hydrogen Evolution Reactions (HER), and catalysis research.
    Please analyze the following research article abstract and provide structured information about whether it involves:
    1. High Entropy Materials (HEAs)
    2. Hydrogen Evolution Reaction (HER)
    3. Catalysis research
    Additionally, please specify if it is a research article or a review article.

    Abstract: {abstract}
    """
    return prompt
# Step 4: 批量处理文献
def process_documents(documents):
    results = []
    for doc in documents:
        prompt = generate_prompt(doc["abstract"])
        analysis_result = analyze_abstract_with_ollama(prompt)
        
        # 解析返回的结果
        parsed_result = parse_analysis_result(analysis_result)
        
        # 将每篇文献的标题、摘要和解析结果存储到结果列表
        results.append({
            "title": doc["title"],
            "abstract": doc["abstract"],
            "HEAs": parsed_result["HEAs"],
            "HER": parsed_result["HER"],
            "Catalysis": parsed_result["Catalysis"],
            "Article_Type": parsed_result["Article_Type"]
        })
    return results

# Step 5: 保存结果到 CSV 文件
def save_results_to_csv(results, output_file):
    df = pd.DataFrame(results)
    df.to_csv(output_file, index=False)

# 主流程
if __name__ == "__main__":
    # 假设文献存储在 'documents.txt' 中，文献格式为 "标题\t摘要"
    documents = read_documents('documents.txt')
    
    # 处理文献
    results = process_documents(documents)
    
    # 将处理结果保存到 CSV
    save_results_to_csv(results, 'ollama_literature_analysis.csv')

    print("文献分析已完成，结果已保存到 ollama_literature_analysis.csv")
