In [None]:
from hanlp_restful import HanLPClient
import json
import os
import time
from gensim.models import Word2Vec
import glob

# Create the HanLP RESTful client for Chinese.
# The API key is taken from the HANLP_AUTH environment variable so that it never
# has to be pasted into (and saved with) the notebook. When the variable is not
# set this falls back to the empty string that was hard-coded here before.
HanLP = HanLPClient('https://www.hanlp.com/api', auth=os.environ.get('HANLP_AUTH', ''), language='zh')

# Smoke test: request 863-tagset POS tags for two short inputs. Left as the last
# expression of the cell so the notebook displays the returned document.
HanLP.parse(["商品和服务。晓美焰来到北京立方庭参观自然语义科技公司。", "我爸是李刚"], tasks='pos/863')



{'tok/fine': [['商品',
   '和',
   '服务',
   '。',
   '晓美焰',
   '来到',
   '北京',
   '立方庭',
   '参观',
   '自然',
   '语义',
   '科技',
   '公司',
   '。'],
  ['我', '爸', '是', '李刚']],
 'pos/863': [['n',
   'c',
   'v',
   'w',
   'nh',
   'v',
   'ns',
   'ni',
   'v',
   'n',
   'n',
   'n',
   'n',
   'w'],
  ['r', 'n', 'v', 'nh']]}

In [3]:
import os
import re
import json
import time
from gensim.models import Word2Vec

# === Setup ===
# Input directory scanned for .txt files, and a model output path.
root = "dataclean3"
output_model_path = "filtered_word2vec_pS.model"

# Mutable run state shared with the helper functions defined below.
query_timestamps = []  # time.time() stamp appended for every HanLP request
texts = []             # accumulated filtered token lists, one per kept sentence
file_stats = {}

# Stopwords: one entry per line; lines that are blank after stripping are ignored.
stopword_file = 'stopwords.txt'

with open(stopword_file, 'r', encoding='utf-8') as f:
    stopwords = {entry for entry in map(str.strip, f) if entry}

print(f"✅ Loaded {len(stopwords)} stopwords.")

# A token is kept only when its pos/863 tag is one of these
# (presumably noun, verb, adjective and place name in the 863 tag set — confirm).
valid_pos_tags = {'n', 'v', 'a', 'ns'}

# Request sizing and throttling knobs.
MAX_BATCH_SIZE = 200        # sentences sent per HanLP request
MAX_CALLS_PER_MINUTE = 70   # requests allowed inside one TIME_WINDOW
TIME_WINDOW = 60            # length of the throttling window, in seconds

# === Helpers ===

def split_into_sentences(text):
    """Split Chinese text into sentences, keeping the end punctuation attached.

    A boundary is placed after any of ``。！？”」``, but only when the next
    character is not itself one of those marks. This keeps a closing quote with
    the sentence it closes (``好。”`` stays together instead of producing a
    stray ``”`` sentence) and keeps runs such as ``？！`` in one piece.

    Args:
        text: The raw text to split.

    Returns:
        A list of sentences with surrounding whitespace stripped; pieces that
        are empty after stripping are dropped. Empty input yields ``[]``.
    """
    # Lookbehind: we are just past a terminator / closing quote.
    # Negative lookahead: the mark run has ended, so it is safe to cut here.
    sentences = re.split(r'(?<=[。！？”」])(?![。！？”」])', text)
    return [s.strip() for s in sentences if s.strip()]

def batch_sentences(sentences, batch_size=200):
    """Lazily yield consecutive slices of ``sentences``.

    Each yielded slice holds at most ``batch_size`` items; only the final
    slice may be shorter. Nothing is yielded for an empty sequence.
    """
    offsets = range(0, len(sentences), batch_size)
    yield from (sentences[begin:begin + batch_size] for begin in offsets)

def respect_rate_limit():
    """Block until another request fits inside the sliding rate-limit window.

    Looks at the module-level ``query_timestamps`` list. Once at least
    ``MAX_CALLS_PER_MINUTE`` requests have been recorded, the age of the
    ``MAX_CALLS_PER_MINUTE``-th most recent one is compared with
    ``TIME_WINDOW``; if it is younger, the function sleeps for the difference.
    Afterwards the list is trimmed to its newest ``MAX_CALLS_PER_MINUTE``
    entries.
    """
    recorded = len(query_timestamps)

    if recorded >= MAX_CALLS_PER_MINUTE:
        # Timestamp of the oldest request that still counts against the limit.
        window_start = query_timestamps[-MAX_CALLS_PER_MINUTE]
        gap = time.time() - window_start
        if gap < TIME_WINDOW:
            sleep_time = TIME_WINDOW - gap
            print(f"\tSleeping {sleep_time:.2f} seconds to respect rate limit...")
            time.sleep(sleep_time)

    # Older entries can no longer influence the check above; discard them.
    if recorded > MAX_CALLS_PER_MINUTE:
        del query_timestamps[:-MAX_CALLS_PER_MINUTE]

def flatten(list_of_lists):
    """Concatenate the items of every inner iterable into one new list, in order."""
    flat = []
    for inner in list_of_lists:
        flat.extend(inner)
    return flat

def filter_tokens(data):
    """Return the tokens of ``data`` that pass the POS and stopword filters.

    ``data`` must provide ``'tok/fine'`` and ``'pos/863'``, each a list of
    per-sentence lists. Both are flattened and paired position by position; a
    token is kept when its tag is in ``valid_pos_tags`` and the token itself
    is not in ``stopwords``. Order of the kept tokens follows the input.
    """
    token_tag_pairs = zip(flatten(data['tok/fine']), flatten(data['pos/863']))
    return [
        token
        for token, pos in token_tag_pairs
        if pos in valid_pos_tags and token not in stopwords
    ]

def process_batch(batch):
    """Tokenise and POS-tag one batch with HanLP, then filter every sentence.

    The request time is appended to ``query_timestamps`` before the call so the
    rate limiter can account for it.

    Args:
        batch: List of sentence strings sent to HanLP in a single request.

    Returns:
        A list with one filtered token list per sentence that kept at least
        one token. If anything raises while requesting or filtering, the error
        is printed and an empty list is returned for the whole batch.
    """
    try:
        query_timestamps.append(time.time())
        data = HanLP.parse(batch, tasks=['tok/fine', 'pos/863'])

        # filter_tokens expects lists of sentences, so each sentence's tokens
        # and tags are wrapped in a one-element list before being passed on.
        per_sentence = (
            filter_tokens({
                'tok/fine': [data['tok/fine'][position]],
                'pos/863': [data['pos/863'][position]],
            })
            for position in range(len(batch))
        )
        # Sentences in which nothing survived the filters are left out.
        return [kept for kept in per_sentence if kept]

    except Exception as e:
        print(f"❗ Error processing batch: {e}")
        return []

# === Main Loop ===
# For every .txt file under `root`: read it, split it into sentences, send the
# sentences to HanLP in rate-limited batches, and collect the filtered token
# lists in `texts`. Running totals are printed per file and at the end.

total_tokens_all_files = 0
total_sentences_all_files = 0

print("🔍 Starting to process files...")

# os.listdir() returns names in arbitrary order. Sorting makes the corpus order
# (and therefore the saved JSON) identical on every run, and filtering before
# enumerate() means the printed index counts only the files actually processed.
txt_file_names = sorted(name for name in os.listdir(root) if name.endswith('.txt'))

for file_idx, file_name in enumerate(txt_file_names, 1):
    print(f"\n📄 ({file_idx}) Processing {file_name}...")
    file_path = os.path.join(root, file_name)

    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()

    sentences = split_into_sentences(text)
    file_sentences = []

    for batch in batch_sentences(sentences, batch_size=MAX_BATCH_SIZE):
        # Wait if needed so the next request stays within the rate limit.
        respect_rate_limit()
        filtered_batch = process_batch(batch)

        # process_batch returns [] when the request failed or nothing was kept.
        if filtered_batch:
            file_sentences.extend(filtered_batch)

    texts.extend(file_sentences)
    total_sentences_all_files += len(file_sentences)
    file_tokens = sum(len(sentence) for sentence in file_sentences)
    total_tokens_all_files += file_tokens

    print(f"📊 {file_name}: {len(file_sentences)} sentences, {file_tokens} tokens kept.")

print("\n✅ Token filtering complete.")
print(f"📈 Total sentences: {total_sentences_all_files}")
print(f"📈 Total tokens: {total_tokens_all_files}")

# === Save filtered sentences (optional) ===
# ensure_ascii=False keeps the Chinese tokens readable in the JSON file.
with open("filtered_sentences2.json", "w", encoding="utf-8") as f:
    json.dump(texts, f, ensure_ascii=False, indent=2)
print("✅ Filtered sentences saved to 'filtered_sentences2.json'.")



✅ Loaded 29 stopwords.
🔍 Starting to process files...

📄 (1) Processing 不可思议.txt...
📊 不可思议.txt: 30 sentences, 238 tokens kept.

📄 (2) Processing 中央亚非利加之蛮地探险.txt...
📊 中央亚非利加之蛮地探险.txt: 199 sentences, 1688 tokens kept.

📄 (3) Processing 冒险夜行.txt...
📊 冒险夜行.txt: 34 sentences, 293 tokens kept.

📄 (4) Processing 冒险奇谈.txt...
📊 冒险奇谈.txt: 40 sentences, 390 tokens kept.

📄 (5) Processing 冒险小说孤店村.txt...
📊 冒险小说孤店村.txt: 80 sentences, 845 tokens kept.

📄 (6) Processing 冒险小说柏麦船长航海谈.txt...
📊 冒险小说柏麦船长航海谈.txt: 80 sentences, 837 tokens kept.

📄 (7) Processing 冒险小说火山岛.txt...
📊 冒险小说火山岛.txt: 75 sentences, 821 tokens kept.

📄 (8) Processing 冒险小说航海家儿.txt...
📊 冒险小说航海家儿.txt: 281 sentences, 2713 tokens kept.

📄 (9) Processing 冒险小说金沙窟.txt...
📊 冒险小说金沙窟.txt: 90 sentences, 1440 tokens kept.

📄 (10) Processing 冒险小说鬼域.txt...
📊 冒险小说鬼域.txt: 181 sentences, 1279 tokens kept.

📄 (11) Processing 冒险成功之商界伟人.txt...
📊 冒险成功之商界伟人.txt: 246 sentences, 2213 tokens kept.

📄 (12) Processing 冢中怪.txt...
📊 冢中怪.txt: 84 sentences, 996 token

In [None]:
# Load a previously saved list of filtered sentences from JSON into `texts`,
# replacing whatever `texts` currently holds in the kernel.
# NOTE(review): this reads "filtered_sentences1.json", while the filtering cell
# above writes "filtered_sentences2.json" — confirm this is the intended file.
with open("filtered_sentences1.json", "r", encoding="utf-8") as f:
    texts = json.load(f)


In [None]:
from gensim.models import word2vec
import logging
# Print log records (timestamp, level, message) at INFO level and above.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)
# sentences = word2vec.Text8Corpus('texts')  # store the corpus in `sentences`

# Build the word-vector model from the token lists in `texts`:
# skip-gram (sg=1) trained with hierarchical softmax (hs=1) and negative
# sampling switched off (negative=0).
model = word2vec.Word2Vec(
    texts,
    sg=1,
    vector_size=100,
    window=5,
    min_count=2,
    negative=0,
    sample=0.001,
    hs=1,
    workers=4,
    epochs=20,
)
# Save the trained model to disk.
model.save('cre_all_word2vec1cS.model')


In [22]:
from gensim.models import word2vec
import logging
# Print log records (timestamp, level, message) at INFO level and above.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)
# sentences = word2vec.Text8Corpus('texts')  # store the corpus in `sentences`

# Build the word-vector model from the token lists in `texts`:
# skip-gram (sg=1) trained with hierarchical softmax (hs=1) and negative
# sampling switched off (negative=0), this time for 40 epochs.
model = word2vec.Word2Vec(
    texts,
    sg=1,
    vector_size=100,
    window=5,
    min_count=2,
    negative=0,
    sample=0.001,
    hs=1,
    workers=4,
    epochs=40,
)
# Save the trained model to disk.
model.save('cre_all_word2vec1cS1.model')

2025-04-21 23:56:57,549 : INFO : collecting all words and their counts
2025-04-21 23:56:57,550 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2025-04-21 23:56:57,574 : INFO : PROGRESS: at sentence #10000, processed 92828 words, keeping 15917 word types
2025-04-21 23:56:57,589 : INFO : collected 22484 word types from a corpus of 169082 raw words and 17707 sentences
2025-04-21 23:56:57,589 : INFO : Creating a fresh vocabulary
2025-04-21 23:56:57,638 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=2 retains 11221 unique words (49.91% of original 22484, drops 11263)', 'datetime': '2025-04-21T23:56:57.638369', 'gensim': '4.3.2', 'python': '3.10.13 | packaged by Anaconda, Inc. | (main, Sep 11 2023, 13:24:38) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22631-SP0', 'event': 'prepare_vocab'}
2025-04-21 23:56:57,638 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=2 leaves 157819 word corpus (93.34% of original 169082, drops

In [26]:
import pandas as pd
from gensim.models import Word2Vec

# Reload the saved Word2Vec model from disk.
model = Word2Vec.load("cre_all_word2vec1cS1.model")

# Query words; any that are missing from the model vocabulary are reported
# and skipped in the loop below.
target_words = [
    '中國', '中華', '支那', '神洲', '日本', '東洋',
    '英國', '英倫', '英吉利', '世界', '萬國', '天下',
]
topn = 30

# Maps each query word found in the vocabulary to its list of
# "neighbour (score)" strings, best match first.
results_dict = {}

for word in target_words:
    # Guard clause: nothing to look up for out-of-vocabulary words.
    if word not in model.wv:
        print(f"⚠️ Skipping: '{word}' not in vocabulary")
        continue

    similar = model.wv.most_similar(word, topn=topn)
    formatted = [f"{token} ({score:.4f})" for token, score in similar]
    results_dict[word] = formatted

# One column per query word that was found, one row per rank.
df = pd.DataFrame(results_dict)

# Write the table to an Excel workbook without the row index.
df.to_excel("word2vec_top30_filtered_S7.xlsx", index=False)

# Show the first rows as a quick check.
print("\n✅ Final DataFrame:")
print(df.head())

2025-04-21 23:58:27,369 : INFO : loading Word2Vec object from cre_all_word2vec1cS1.model
2025-04-21 23:58:27,464 : INFO : loading wv recursively from cre_all_word2vec1cS1.model.wv.* with mmap=None
2025-04-21 23:58:27,464 : INFO : setting ignored attribute cum_table to None
2025-04-21 23:58:27,464 : INFO : Word2Vec lifecycle event {'fname': 'cre_all_word2vec1cS1.model', 'datetime': '2025-04-21T23:58:27.464132', 'gensim': '4.3.2', 'python': '3.10.13 | packaged by Anaconda, Inc. | (main, Sep 11 2023, 13:24:38) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22631-SP0', 'event': 'loaded'}


⚠️ Skipping: '神洲' not in vocabulary
⚠️ Skipping: '英吉利' not in vocabulary

✅ Final DataFrame:
            中國           中華            支那            日本            東洋  \
0  義士 (0.6281)  落成 (0.5019)   帆船 (0.5389)  正經事 (0.5836)    車 (0.4739)   
1  敬重 (0.5586)  慶祝 (0.4963)   水手 (0.4871)   留學 (0.5741)   回到 (0.4294)   
2  節令 (0.5353)  版圖 (0.4168)  印度洋 (0.4578)   律例 (0.5064)  巴不得 (0.4279)   
3  改用 (0.5217)  五洲 (0.4156)    輪 (0.4249)   買辦 (0.4808)   跳上 (0.4250)   
4  百姓 (0.5159)  彩球 (0.4048)    船 (0.4236)  小老婆 (0.4498)  雜貨行 (0.4233)   

            英國             英倫           世界             萬國           天下  
0  郵船 (0.5936)    商輪 (0.5424)  齷齪 (0.5479)  整整齊齊 (0.5200)  探險 (0.4606)  
1  上學 (0.5269)     爲 (0.4704)  光明 (0.5462)    公罪 (0.4993)  五洲 (0.4522)  
2  紐約 (0.5003)    赫赫 (0.4598)  文明 (0.5237)    照會 (0.4503)   國 (0.4473)  
3  士官 (0.4949)    失敗 (0.4556)  地質 (0.5132)    捕頭 (0.4348)   願 (0.3836)  
4  必到 (0.4765)  南太平洋 (0.4538)  骯髒 (0.4813)    保護 (0.4333)  寥寥 (0.3762)  


In [None]:


# Reload the saved Word2Vec model from disk.
model = Word2Vec.load("cre_all_word2vec1cS1.model")

# Query words (region names); any that are missing from the model vocabulary
# are reported and skipped in the loop below.
target_words = [
    '非洲', '亞非利加', '斐洲', '阿非利加', '菲州',
    '南洋', '西域', '歐洲', '歐羅巴洲', '亞東',
    '亞洲', '亞細亞', '亞西亞洲', '亞西亞', '亞細亞洲',
]
topn = 30

# Maps each query word found in the vocabulary to its list of
# "neighbour (score)" strings, best match first.
results_dict = {}

for word in target_words:
    # Guard clause: nothing to look up for out-of-vocabulary words.
    if word not in model.wv:
        print(f"⚠️ Skipping: '{word}' not in vocabulary")
        continue

    similar = model.wv.most_similar(word, topn=topn)
    formatted = [f"{token} ({score:.4f})" for token, score in similar]
    results_dict[word] = formatted

# One column per query word that was found, one row per rank.
df = pd.DataFrame(results_dict)

# Excel export is currently disabled for this cell.
#df.to_excel("word2vec_top30_filtered_S_regions1.xlsx", index=False)

# Show the first rows as a quick check.
print("\n✅ Final DataFrame:")
print(df.head())

In [None]:
# Reload a saved Word2Vec model from disk.
model = Word2Vec.load("cre_all_word2vec1cS.model")

# Query words (city names); any that are missing from the model vocabulary
# are reported and skipped in the loop below.
target_words = [
    '上海', '紐約', '倫敦',
    '孟買', '香港', '東京',
]
topn = 30

# Maps each query word found in the vocabulary to its list of
# "neighbour (score)" strings, best match first.
results_dict = {}

for word in target_words:
    # Guard clause: nothing to look up for out-of-vocabulary words.
    if word not in model.wv:
        print(f"⚠️ Skipping: '{word}' not in vocabulary")
        continue

    similar = model.wv.most_similar(word, topn=topn)
    formatted = [f"{token} ({score:.4f})" for token, score in similar]
    results_dict[word] = formatted

# One column per query word that was found, one row per rank.
df = pd.DataFrame(results_dict)

# Excel export is currently disabled for this cell.
#df.to_excel("word2vec_top30_filtered_S_cities.xlsx", index=False)

# Show the first rows as a quick check.
print("\n✅ Final DataFrame:")
print(df.head())