In [201]:
from pyhanlp import *

# Keywords for ships and locations
ship_keywords = ["号", "轮", "船", "艘","舰", "军舰", "号船", "号轮", "船只", "舰艇", "船舶", "货轮", "客轮", "油轮", "潜艇", "驳船",
                 "渡船", "航母", "货船", "运输船", "航空母舰", "战舰", "核潜艇"]

location_keywords = ["港湾", "港口", "海域", "海滩", "海湾", "海岸线", "海峡", "海底", "岸边", "河口", "岛屿", "航线", "码头",
                     "航道", "海港", "水域", "水道", "河流", "湾区", "泊位", "湖泊", "湖区", "江口", "湖边", "沿海", "沿岸", "海路"]

# Country mapping dictionary
country_mapping = {
    "中国": "中国", "美": "美国", "美国": "美国", "日本": "日本",  "韩国": "韩国", "韩": "韩国",
    "英国": "英国", "英": "英国", "加拿大": "加拿大", "德国": "德国", "法国": "法国", "印度": "印度", "巴西": "巴西",
    "墨西哥": "墨西哥", "俄罗斯": "俄罗斯", "俄": "俄罗斯", "意大利": "意大利", "西班牙": "西班牙", "荷兰": "荷兰",
    "瑞典": "瑞典", "挪威": "挪威", "丹麦": "丹麦", "芬兰": "芬兰", "爱尔兰": "爱尔兰", "奥地利": "奥地利", "瑞士": "瑞士",
    "新加坡": "新加坡", "马来西亚": "马来西亚", "印度尼西亚": "印度尼西亚", "泰国": "泰国", "越南": "越南", "菲律宾": "菲律宾",
    "南非": "南非", "埃及": "埃及", "肯尼亚": "肯尼亚"
}


In [202]:
import os

folder_name = input("Enter your foldername: ")

folder_path = "/Users/hutusheng/Desktop/database/"  + folder_name

report_texts = []

# Iterate over all files in the folder
for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)

    # Check if the path is a file
    if os.path.isfile(file_path):
        with open(file_path, "r") as file:
            report_text = file.read()
            report_text = report_text.replace("账号","")
            report_texts.append(report_text)

In [203]:
import re
from pyhanlp import HanLP

def extract_named_entities(text):
    segment = HanLP.newSegment().enableNameRecognize(True)
    term_list = segment.seg(text)
    named_entities = []
    for term in term_list:
        if term.nature.toString() == "nr"or "nt":
            named_entities.append(term.word)
    return named_entities

def extract_name(named_entities, ship_keywords, country_mapping):
    for i in range(len(named_entities)):
        entity = named_entities[i]
        for keyword in ship_keywords:
            if keyword in entity:
                if i > 0:
                    previous_entity = named_entities[i - 1]
                    ship_name = previous_entity
                else:
                    ship_name = entity

               
                valid_ship_name = all(keyword not in ship_name for keyword in ship_keywords) and \
                                  all(ship_name not in country_mapping_keyword for country_mapping_keyword in country_mapping)

                if valid_ship_name:
                    return ship_name

    return ""


def extract_location(named_entities, location_keywords):
    entity_freq = {}  # Dictionary to store the frequency of each entity

    for i in range(len(named_entities)):
        entity = named_entities[i]
        for keyword in location_keywords:
            if keyword in entity:
                if i > 0 and keyword == entity:
                    previous_entity = named_entities[i - 1]
                    location_entity = previous_entity + entity
                else:
                    location_entity = entity

                # Count the frequency of the location entity
                if location_entity in entity_freq:
                    entity_freq[location_entity] += 1
                else:
                    entity_freq[location_entity] = 1

    if entity_freq:
        # Find the entity with the highest frequency
        max_freq_entity = max(entity_freq, key=entity_freq.get)
        return max_freq_entity

    return ""


def extract_nationality(named_entities, ship_name,country_mapping):
    closest_entity = None
    min_distance = float('inf')

    for entity in named_entities:
        if entity and ship_name:
            for keyword in country_mapping.keys():
                if entity == keyword:
                    distance = abs(named_entities.index(ship_name) - named_entities.index(entity))
                    if distance < min_distance:
                        min_distance = distance
                        closest_entity = entity

    return closest_entity if closest_entity else ""

def extract_date(text):
    pattern = r"(\d{4})-(\d{2})-(\d{2})"
    pattern2 = r"(\d{4})年(\d{2})月(\d{2})日"
    match = re.search(pattern, text)
    match2 = re.search(pattern2, text)
    if match:
        year = match.group(1)
        month = match.group(2)
        day = match.group(3) if match.group(3) else None
        return f"{year}-{month.zfill(2)}-{day.zfill(2)}" if day else f"{year}-{month.zfill(2)}"
    elif match2:
        year = match2.group(1)
        month = match2.group(2)
        day = match2.group(3) if match2.group(3) else None
        return f"{year}-{month.zfill(2)}-{day.zfill(2)}" if day else f"{year}-{month.zfill(2)}"
    return None


import re

def extract_sentences(report_texts,ship_keywords,location_keywords,country_mapping):
    result_dict_list = []

    for m, report_text in enumerate(report_texts):
        # Extract named entities
        named_entities = extract_named_entities(report_text)
        named_entities = [entity for entity in named_entities if entity.isalpha()]
        named_entities = [entity for entity in named_entities if entity and isinstance(entity, str)]


        # Extract ship name
        date = extract_date(report_text)

        pattern = r"(\d{4})-(\d{2})-(\d{2})"
        pattern2 = r"(\d{4})年(\d{2})月(\d{2})日"
        report_text = re.sub(pattern, '', report_text)
        report_text = re.sub(pattern2, '', report_text)
        # Extract location
        location = extract_location(named_entities,location_keywords)

        # Extract nationality
        ship_name = extract_name(named_entities,ship_keywords,country_mapping)
        nationality = extract_nationality(named_entities, ship_name,country_mapping)
        
        # Extract date

        # Split report text into sentences
        sentences = re.split(r'[.,\s]+', report_text)

        ship_sentence = ""
        location_sentence = ""
        nationality_sentence = ""
        date_sentence = ""

        for sentence in sentences:
            if ship_name and ship_name in sentence and not ship_sentence:
                ship_sentence = sentence
            if location and location in sentence and not location_sentence:
                location_sentence = sentence
            if nationality and nationality in sentence and not nationality_sentence:
                nationality_sentence = sentence
            if date is not None and date in sentence and not date_sentence:
                date_sentence = sentence

            if ship_sentence and location_sentence and nationality_sentence and date_sentence:
                break

        result_dict = {
            "Ship Name": ship_name,
            "Location": location,
            "Nationality": nationality,
            "Date": date,
            "File Name": f"File number {m}",
            "Ship Sentence": ship_sentence,
            "Location Sentence": location_sentence,
            "Nationality Sentence": nationality_sentence,
            "Date Sentence": date_sentence
        }

        result_dict_list.append(result_dict)

    return result_dict_list







In [204]:
output = extract_sentences(report_texts, ship_keywords, location_keywords, country_mapping)
for dictionary in output:
    for key, value in dictionary.items():
        print(f"{key}: {value}")
    print()

Ship Name: 尼米兹
Location: 苏里高海峡
Nationality: 美
Date: 2023-02-20
File Name: File number 0
Ship Sentence: 美尼米兹号航母打击群驶离南海
Location Sentence: AIS信号显示，2月20日，尼米兹号航母打击群成员“迪云”号与“韦恩·迈耶”号驱逐舰经苏里高海峡进入菲律宾海。2月19日，打击群成员“迪卡特”号与“韦恩·迈耶”号驱逐舰在苏禄海活动，同日，一架美海军P-8A反潜巡逻机自菲律宾克拉克机场起飞，提供ISR(情报、侦察、监视)支援。据三艘驱逐舰航行轨迹推断，尼米兹号航母打击群已于2月18日自巴拉巴克海峡进入苏禄海，结束此次南海部署。
Nationality Sentence: 美尼米兹号航母打击群驶离南海
Date Sentence: 

Ship Name: 尼米兹
Location: 东部海域
Nationality: 美国
Date: 2023-04-17
File Name: File number 1
Ship Sentence: 尼米兹号航母进入南海
Location Sentence: 韩军：美国“里根”号航母重返朝鲜半岛东部海域
Nationality Sentence: 【文/观察者网王世纯】根据美国国防部下属的信息发布网站DVID在4月16日报道，4月16号，美国海军“尼米兹”号（CVN-68）及其护航舰艇又一次进入南海，进行水面、空中以及水下作战训练，以及固定翼舰载机和舰载直升机的飞行训练。这是“尼米兹”号航母打击群自去年12月部署以来第3次进入南海。
Date Sentence: 

Ship Name: 称
Location: 东部海域
Nationality: 日本
Date: 2016-07-10
File Name: File number 2
Ship Sentence: 日媒称，美国太平洋舰队日前宣称，部署在日本神奈川县横须贺基地的“罗纳德·里根”号核动力航母等第七舰队舰船在南海开展了警戒活动。消息发布于6月30日。
Location Sentence: 报道称，里根号6月在菲律宾东部海域的西太平洋上，与“约翰·斯坦尼斯”号实施了双航母联合训练。由里根号及随行的宙斯盾驱逐舰等组成的第五航母打击群指挥官约翰·亚历山大少将