In [227]:
from pyhanlp import *

# Keywords for ships and locations
ship_keywords = ["号", "轮", "船", "艘","舰", "军舰", "号船", "号轮", "船只", "舰艇", "船舶", "货轮", "客轮", "油轮", "潜艇", "驳船",
                 "渡船", "航母", "货船", "运输船", "航空母舰", "战舰", "核潜艇"]

location_keywords = ["港湾", "港口", "海域", "海滩", "海湾", "海岸线", "海峡", "海底", "岸边", "河口", "岛屿", "航线", "码头",
                     "航道", "海港", "水域", "水道", "河流", "湾区", "泊位", "湖泊", "湖区", "江口", "湖边", "沿海", "沿岸", "海路",
                     "领海","半岛"]

# Country mapping dictionary
country_mapping = {
    "中国": "中国", "美": "美国", "美国": "美国", "日本": "日本",  "韩国": "韩国", "韩": "韩国",
    "英国": "英国", "英": "英国", "加拿大": "加拿大", "德国": "德国", "法国": "法国", "印度": "印度", "巴西": "巴西",
    "墨西哥": "墨西哥", "俄罗斯": "俄罗斯", "俄": "俄罗斯", "意大利": "意大利", "西班牙": "西班牙", "荷兰": "荷兰",
    "瑞典": "瑞典", "挪威": "挪威", "丹麦": "丹麦", "芬兰": "芬兰", "爱尔兰": "爱尔兰", "奥地利": "奥地利", "瑞士": "瑞士",
    "新加坡": "新加坡", "马来西亚": "马来西亚", "印度尼西亚": "印度尼西亚", "泰国": "泰国", "越南": "越南", "菲律宾": "菲律宾",
    "南非": "南非", "埃及": "埃及", "肯尼亚": "肯尼亚"
}



In [235]:
import os

folder_name = input("Enter your foldername: ")

folder_path = "/Users/hutusheng/Desktop/database/"  + folder_name

report_texts = []

# Iterate over all files in the folder
for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)

    # Check if the path is a file
    if os.path.isfile(file_path):
        with open(file_path, "r") as file:
            report_text = file.read()
            report_text = report_text.replace("账号","")
            report_texts.append(report_text)

In [240]:
import re
from pyhanlp import HanLP

def extract_named_entities(text):
    segment = HanLP.newSegment().enableNameRecognize(True)
    term_list = segment.seg(text)
    named_entities = []
    for term in term_list:
        if term.nature.toString() == "nr"or "nt":
            named_entities.append(term.word)
    return named_entities

def extract_name(named_entities, ship_keywords, country_mapping):
    # List of predefined ship names
    predefined_ship_names = [
        "核潜艇", "罗纳德·里根","里根","斯坦塞姆", "尼米兹", "卡尔·文森", "罗斯福", "亚伯拉罕·林肯", "乔治·华盛顿", "斯坦尼斯", "蓝岭", "安提坦", "米利厄斯",
        "夏伊洛", "柯蒂斯·威尔伯", "费兹杰罗", "史塔森", "拉森", "麦坎贝尔", "马斯廷", "丹佛", "托图加", "日耳曼城", "复仇者",
        "防卫者", "战士", "爱国者", "守卫号", "米利诺基特", "福尔河", "布伦斯维克", "胜利", "能力", "效率", "忠诚", "无暇",
        "鲍迪奇", "汉森", "USNS MARYSEARS", "USNS Howard", "USNS GUAM", "蒙哥马利", "嘉贝丽.吉弗斯", "切斯劳维尔","考彭斯"
    ]

    # Check if any of the predefined ship names are in the named_entities list
    for ship_name in predefined_ship_names:
        if ship_name in named_entities:
            return ship_name

    # Continue with the original implementation
    for i in range(len(named_entities)):
        entity = named_entities[i]
        for keyword in ship_keywords:
            if keyword in entity:
                if i > 0:
                    previous_entity = named_entities[i - 1]
                    ship_name = previous_entity
                else:
                    ship_name = entity

                valid_ship_name = all(keyword not in ship_name for keyword in ship_keywords) and \
                                  all(ship_name not in country_mapping_keyword for country_mapping_keyword in country_mapping)

                if valid_ship_name:
                    return ship_name

    return ""



def extract_location(named_entities, location_keywords):
    entity_freq = {}  # Dictionary to store the frequency of each entity

    for i in range(len(named_entities)):
        entity = named_entities[i]
        for keyword in location_keywords:
            if keyword in entity:
                if i > 0 and keyword == entity:
                    previous_entity = named_entities[i - 1]
                    location_entity = previous_entity + entity
                else:
                    location_entity = entity

                # Count the frequency of the location entity
                if location_entity in entity_freq:
                    entity_freq[location_entity] += 1
                else:
                    entity_freq[location_entity] = 1

    if entity_freq:
        # Find the entity with the highest frequency
        max_freq_entity = max(entity_freq, key=entity_freq.get)
        return max_freq_entity

    return ""


def extract_nationality(named_entities, ship_name,country_mapping):
    closest_entity = None
    min_distance = float('inf')

    for entity in named_entities:
        if entity and ship_name:
            for keyword in country_mapping.keys():
                if entity == keyword:
                    distance = abs(named_entities.index(ship_name) - named_entities.index(entity))
                    if distance < min_distance:
                        min_distance = distance
                        closest_entity = entity

    return closest_entity if closest_entity else ""

def extract_date(text):
    pattern = r"(\d{4})-(\d{2})-(\d{2})"
    pattern2 = r"(\d{4})年(\d{2})月(\d{2})日"
    pattern3 = r"((\d{2})月(\d{2})日)"  # Corrected the closing parenthesis here
    match = re.search(pattern, text)
    match2 = re.search(pattern2, text)
    match3 = re.search(pattern3, text)
    if match:
        year = match.group(1)
        month = match.group(2)
        day = match.group(3) if match.group(3) else None
        return f"{year}-{month.zfill(2)}-{day.zfill(2)}" if day else f"{year}-{month.zfill(2)}"
    elif match2:
        year = match2.group(1)
        month = match2.group(2)
        day = match2.group(3) if match2.group(3) else None
        return f"{year}-{month.zfill(2)}-{day.zfill(2)}" if day else f"{year}-{month.zfill(2)}"
    elif match3:
        month = match3.group(1)
        day = match3.group(2) if match3.group(3) else None
        return f"{month.zfill(1)}-{day.zfill(2)}" if day else f"{month.zfill(1)}"
    return None



import re

def extract_sentences(report_texts,ship_keywords,location_keywords,country_mapping):
    result_dict_list = []

    for m, report_text in enumerate(report_texts):
        # Extract named entities
        named_entities = extract_named_entities(report_text)
        named_entities = [entity for entity in named_entities if entity.isalpha()]
        named_entities = [entity for entity in named_entities if entity and isinstance(entity, str)]


        # Extract ship name
        date = extract_date(report_text)

        pattern = r"(\d{4})-(\d{2})-(\d{2})"
        pattern2 = r"(\d{4})年(\d{2})月(\d{2})日"
        report_text = re.sub(pattern, '', report_text)
        report_text = re.sub(pattern2, '', report_text)
        # Extract location
        location = extract_location(named_entities,location_keywords)

        # Extract nationality
        ship_name = extract_name(named_entities,ship_keywords,country_mapping)
        nationality = extract_nationality(named_entities, ship_name,country_mapping)
        
        # Extract date

        # Split report text into sentences
        sentences = re.split(r'[.,\s]+', report_text)

        ship_sentence = ""
        location_sentence = ""
        nationality_sentence = ""
        date_sentence = ""

        for sentence in sentences:
            if ship_name and ship_name in sentence and not ship_sentence:
                ship_sentence = sentence
            if location and location in sentence and not location_sentence:
                location_sentence = sentence
            if nationality and nationality in sentence and not nationality_sentence:
                nationality_sentence = sentence
            if date is not None and date in sentence and not date_sentence:
                date_sentence = sentence

            if ship_sentence and location_sentence and nationality_sentence and date_sentence:
                break

        result_dict = {
            "Ship Name": ship_name,
            "Location": location,
            "Nationality": nationality,
            "Date": date,
            "File Name": f"File number {m}",
            "Ship Sentence": ship_sentence,
            "Location Sentence": location_sentence,
            "Nationality Sentence": nationality_sentence,
            "Date Sentence": date_sentence
        }

        result_dict_list.append(result_dict)

    return result_dict_list







In [241]:
output = extract_sentences(report_texts, ship_keywords, location_keywords, country_mapping)
for dictionary in output:
    for key, value in dictionary.items():
        print(f"{key}: {value}")
    print()

Ship Name: 核潜艇
Location: 韩国港口
Nationality: 韩国
Date: 2023-07-19
File Name: File number 0
Ship Sentence: 美军战略核潜艇42年来首次靠泊韩国，意欲何为？
Location Sentence: 7月18日证实，美国海军一艘战略核潜艇，目前正在韩国釜山港停靠。这是美国战略核潜艇自1981年3月以来，首次在韩国港口停靠。
Nationality Sentence: 美军战略核潜艇42年来首次靠泊韩国，意欲何为？
Date Sentence: 

Ship Name: 核潜艇
Location: 朝鲜半岛
Nationality: 美
Date: 2023-07-21
File Name: File number 1
Ship Sentence: 美战略核潜艇入韩
Location Sentence: 央视新闻消息，据朝中社20日报道，朝鲜国防相强纯男当天发表声明，谴责美韩18日召开“核磋商小组”会议，并强调美国战略核潜艇停泊釜山港系美国40多年来首次在朝鲜半岛部署战略核武器，对朝鲜构成“最露骨而直接”的核威胁。
Nationality Sentence: 美战略核潜艇入韩
Date Sentence: 

Ship Name: 核潜艇
Location: 朝鲜半岛
Nationality: 美
Date: 2023-07-20
File Name: File number 2
Ship Sentence: 美战略核潜艇停泊釜山，朝方警告：可能符合朝鲜使用核武器条件_凤凰网
Location Sentence: 据朝中社报道，7月20日，朝鲜国防相强纯男发表谈话警告称，美国派遣俄亥俄级战略核潜艇停泊在釜山港作战基地，是40多年来第一次在朝鲜半岛地区部署的战略核武器，是对朝鲜的最为露骨而直接的核威胁，而美军战略资产部署的可见性增大，可能会符合朝鲜使用的核武器的法律条件。
Nationality Sentence: 美战略核潜艇停泊釜山，朝方警告：可能符合朝鲜使用核武器条件_凤凰网
Date Sentence: 

Ship Name: 核潜艇
Location: 海峡两岸
Nationality: 韩国
Date: 2023-07-19
File Name: File numb