In [4]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

In [5]:
# Download the NLTK resources used by the preprocessing step: the stop-word
# list, WordNet (for the lemmatizer) and the Punkt tokenizer models required
# by nltk.word_tokenize. Newer NLTK releases look the tokenizer models up
# under the name 'punkt_tab', so both names are requested.
for resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# Load the data: parse only the first 100000 rows instead of reading the
# whole CSV and slicing it afterwards.
data = pd.read_csv('abcnews-date-text.csv', nrows=100000)

# Preprocessing: keep one row per distinct headline. The result is assigned
# back rather than dropped in place, so no slice of another frame is mutated;
# the index labels of the surviving rows are left untouched.
data = data.drop_duplicates(subset='headline_text')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/notyoursmac/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/notyoursmac/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
# Lazily-initialised resources shared by every preprocess_text call.
_STOP_WORDS = None
_LEMMATIZER = None
# Translation table that deletes every character in string.punctuation.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def _text_resources():
    """Return the (stop-word set, lemmatizer) pair, loading it on first use."""
    global _STOP_WORDS, _LEMMATIZER
    if _STOP_WORDS is None:
        # Loaded once: preprocess_text is applied to every headline, and
        # re-reading the stop-word list per call is loop-invariant work.
        _STOP_WORDS = frozenset(stopwords.words('english'))
        _LEMMATIZER = WordNetLemmatizer()
    return _STOP_WORDS, _LEMMATIZER


def preprocess_text(text):
    """Normalise a piece of text for TF-IDF indexing and querying.

    Steps, in order: lowercase, delete ASCII punctuation
    (``string.punctuation``), tokenize with ``nltk.word_tokenize``, drop
    English stop words, lemmatize each remaining token.

    Args:
        text: The string to normalise.

    Returns:
        The surviving lemmatized tokens joined by single spaces; an empty
        string when no token survives.
    """
    stop_words, lemmatizer = _text_resources()
    cleaned = text.lower().translate(_PUNCT_TABLE)
    tokens = nltk.word_tokenize(cleaned)
    return ' '.join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    )

# Replace every headline with its normalised form (element-wise over the column).
data['headline_text'] = data['headline_text'].map(preprocess_text)

In [7]:
# Represent the corpus with TF-IDF: fit the vectorizer on the preprocessed
# headlines and keep the resulting document-term matrix in X
# (one row per headline, in the same order as the rows of `data`).
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data.loc[:, 'headline_text'])

# Query handling
def search(query, top_n=10):
    """Return the headlines most similar to ``query``, best match first.

    The query goes through the same ``preprocess_text`` normalisation as the
    corpus, is projected with the fitted ``vectorizer`` and compared to every
    row of ``X`` by cosine similarity.

    Args:
        query: Free-text search string.
        top_n: Maximum number of rows to return. ``None`` means no limit;
            zero or negative values yield an empty result.

    Returns:
        Rows of ``data`` ordered by decreasing similarity. Rows whose
        similarity is zero are never returned, so the result may hold fewer
        than ``top_n`` rows (and is empty when nothing matches).
    """
    query_vec = vectorizer.transform([preprocess_text(query)])

    # Cosine similarity of the query against every document, as a 1-D array.
    scores = cosine_similarity(query_vec, X).ravel()

    # A negative limit would otherwise slice from the wrong end of the ranking.
    limit = None if top_n is None else max(top_n, 0)

    # Stable sort on the negated scores: descending similarity, ties kept in
    # corpus order so the ranking is deterministic.
    ranked = np.argsort(-scores, kind='stable')[:limit]

    # Zero similarity means no shared term with the query: not a hit.
    ranked = ranked[scores[ranked] > 0]
    return data.iloc[ranked]

In [10]:
# Try the search engine on a sample query and show the ranked headlines.
query = "china usa trade war"
results = search(query=query)
print(results)

       publish_date                         headline_text
25606      20030622             brazil hunt usa head home
48343      20031012       usa claim world cup third place
14743      20030501           clean clear pound warns usa
50354      20031022            govt look trade deal china
60150      20031209           trade dominate china u talk
41007      20030905  business group china highlight trade
50397      20031022       market focus china trade dollar
87391      20040428           china trade talk fast track
85956      20040420           vaile head china trade talk
37359      20030818  china australia hold free trade talk
