In [1]:
import csv
import re
import nltk
from nltk.stem import WordNetLemmatizer
import spacy

# You may need to run `python3 -m spacy download en_core_web_sm` first.
nlp = spacy.load("en_core_web_sm")  # or other models


def pre_process(text: str) -> set[str]:
    """Return the lemmas of the alphabetic tokens in ``text``, minus proper nouns.

    The text is parsed with the module-level spaCy pipeline ``nlp``. Every
    token tagged ``PROPN`` contributes its surface form and the lower-cased
    version of that form to an exclusion set; any lemma that equals one of
    those spellings is dropped from the result.

    Args:
        text: Raw text to analyse.

    Returns:
        The set of remaining lemmas.
    """
    doc = nlp(text)

    # Spellings to exclude: each proper noun as written, plus its lower-case form.
    excluded = set()
    for token in doc:
        if token.pos_ == "PROPN":
            excluded.add(token.text)
            excluded.add(token.text.lower())

    # Only tokens flagged as alphabetic are lemmatized and kept.
    return {
        token.lemma_
        for token in doc
        if token.is_alpha and token.lemma_ not in excluded
    }


def check_none_testwords_in_text(file_path: str, text: set[str]) -> set[str]:
    """Return the words in ``text`` that are missing from a CSV word list.

    Args:
        file_path: Path to a CSV file. The first column of each row is read
            as one word of the word list; other columns are ignored.
        text: Set of words to check against the word list.

    Returns:
        A new set holding the elements of ``text`` that do not occur in the
        first column of the CSV file.

    Raises:
        OSError: If ``file_path`` cannot be opened.
    """
    with open(file_path, newline="") as file:
        # csv.reader yields an empty list for a blank line, so `if row` keeps
        # row[0] from raising IndexError. Stripping lets entries such as
        # "apple " still match the word "apple".
        words_for_test = {row[0].strip() for row in csv.reader(file) if row}

    # Set difference: keep only the words that the word list does not contain.
    return text - words_for_test


def set_none_words(file_path: str, text: str) -> set[str]:
    """Pre-process raw text and return its words that are not in a CSV word list.

    Chains ``pre_process`` and ``check_none_testwords_in_text``.

    Args:
        file_path: Path to the CSV word list, forwarded to
            ``check_none_testwords_in_text``.
        text: Raw text (a string, not a set), forwarded to ``pre_process``.

    Returns:
        Whatever ``check_none_testwords_in_text`` returns for the pre-processed
        text: the set of words not found in the word list.
    """
    return check_none_testwords_in_text(file_path, pre_process(text))



In [2]:

# Paths to the CSV word lists
file_path_high = "../word_list/csv_folder/word_list_j_high_school.csv"
file_path_center = "../word_list/csv_folder/word_list_center_test.csv"
file_path_uni = "../word_list/csv_folder/word_list_2zi_test.csv"


# Open the input file and read its whole content
with open("input_text.txt", "r") as file:
    text = file.read()

# Compare the text against each word list. The result is computed once per
# list and reused for both the count and the set, instead of running the
# whole pipeline twice. The second label now names the center-test list; it
# was previously a copy of the high-school label.
for word_list_label, word_list_path in (
    ("高校受験用の単語集合と比較", file_path_high),
    ("センター試験用の単語集合と比較", file_path_center),
    ("大学受験用の単語集合と比較", file_path_uni),
):
    unknown_words = set_none_words(word_list_path, text)
    print(word_list_label, len(unknown_words), unknown_words)

高校受験用の単語集合と比較 7 {'okay', 'collapse', 'boss', 'drunk', 'awareness', 'co', 'reveal'}
高校受験用の単語集合と比較 5 {'okay', 'boss', 'drunk', 'co', 'reveal'}
大学受験用の単語集合と比較 4 {'drunk', 'boss', 'co', 'okay'}


In [3]:
def pick_propm(text:str)->set[str]:
    """Return the distinct surface forms of the tokens tagged as proper nouns.

    Args:
        text: Raw text, parsed with the module-level spaCy pipeline ``nlp``.

    Returns:
        The set of ``token.text`` values whose part-of-speech tag is ``PROPN``.
    """
    return {token.text for token in nlp(text) if token.pos_ == "PROPN"}

# Show the proper nouns found in `text` (the input text read in an earlier cell).
print(pick_propm(text))


{'Hakata', 'Station', 'Yamauchi', 'Daikichi', 'Shinagawa', 'Daigo'}


In [4]:
def lem(text:str)->set[str]:
    """Return the distinct lemmas of every token in ``text``.

    No token filtering is applied: each token produced by the module-level
    spaCy pipeline ``nlp`` contributes its ``lemma_``.

    Args:
        text: Raw text to parse.

    Returns:
        The set of lemmas of all tokens.
    """
    return {token.lemma_ for token in nlp(text)}
# Show every lemma in `text`, unfiltered, to inspect what the lemmatizer produces.
print(lem(text))

{'make', 'a', 'star', 'share', 'to', 'old', 'will', 'ago', 'collapse', 'explain', 'have', 'his', 'friend', 'say', 'officer', 'such', 'recently', 'happen', 'boss', 'third', 'story', 'do', ',', 'reveal', 'Daigo', 'an', 'Hakata', 'when', 'guess', 'look', 'night', 'it', 'immediately', 'problem', 'like', 'help', 'two', '"', 'Yamauchi', 'station', 'there', 'you', 'the', 'be', 'opinion', 'week', 'drunk', 'might', 'get', 'close', 'answer', 'on', 'they', 'of', 'ask', 'Shinagawa', 'laugh', 'no', 'co', '-', 'if', 'still', 'well', 'think', 'surely', 'Station', 'he', 'late', '\n\n', 'okay', '.', 'that', 'see', 'surprised', 'Daikichi', 'near', 'also', 'at', 'then', 'go', 'awareness', 'and', 'bit', 'this', 'unexpected', '?', 'thing', 'someone', 'police', 'by'}
