In [1]:
%%time
import pandas as pd
import re

from nltk import pos_tag # 词性
from nltk.tokenize import word_tokenize # 分词
from nltk.corpus import stopwords # 停用词
from nltk.stem import PorterStemmer, WordNetLemmatizer # 词干化，词态统一
from tqdm import tqdm  # 显示处理进度

import nltk

import pyarrow.parquet as pq
# run only once 第一次运行没下载要下载
#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('wordnet') 
#nltk.download('averaged_perceptron_tagger')

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

  from pandas.core.computation.check import NUMEXPR_INSTALLED


CPU times: user 2.64 s, sys: 603 ms, total: 3.24 s
Wall time: 3.77 s


In [9]:
%%time
# 加载数据
data = pd.read_parquet('nyt_data_part1.parquet')

CPU times: user 9.56 s, sys: 4.67 s, total: 14.2 s
Wall time: 12.3 s


In [10]:
%%time
# 取数据集的0.01%
data_sample = data.sample(frac=0.0001, random_state=42)
# 查看数据集的样本大小
print("Sample size:", len(data_sample))

# 查看数据的前几行
print(data_sample.head())

Sample size: 869
         year                                              title  \
6458866  1951  THIS being the week on which the Fourth of Jul...   
7196896  1954  H E Salisbury on restaurants News of Food: Sov...   
3179383  1936      Freed arrested EX-CAPONE AIDE HELD IN SLAYING   
7615113  1956  CHICAGO, July 8 (AP)--Detroit's revived Tigers...   
7140569  1954  Health Bd approves marketing of equal mixture ...   

                                                   excerpt  
6458866                                                     
7196896  Restaurant Cuisine in Moscow Still Has Little ...  
3179383  Edward Freed Questioned in Murder of Audrey Va...  
7615113  Detroit Collects 36 Hits in Sending Wilson, Pi...  
7140569  sale delayed pending rules NEW CREAM BLEND APP...  
CPU times: user 433 ms, sys: 72.3 ms, total: 505 ms
Wall time: 538 ms


In [11]:
%%time
# 查看数据的基本信息，包括列名、数据类型、非空值数量
data_sample.info()

<class 'pandas.core.frame.DataFrame'>
Index: 869 entries, 6458866 to 3392847
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   year     869 non-null    int64 
 1   title    869 non-null    object
 2   excerpt  869 non-null    object
dtypes: int64(1), object(2)
memory usage: 27.2+ KB
CPU times: user 11.6 ms, sys: 16.3 ms, total: 27.9 ms
Wall time: 44.4 ms


In [12]:

%%time
# 清洗文本数据
def clean_text(text):
    # 去除特殊字符和标点符号
    text = re.sub(r'[^A-Za-z0-9\s]', '', text)
    # 小写化文本
    text = text.lower()
    return text

# 分词
def tokenize(text):
    words = nltk.word_tokenize(text)
    return words

# 分词和词形还原
def tokenize_and_lemmatize(text):
    tqdm.pandas(desc="Tokenizing")
    # 分词
    words = nltk.word_tokenize(text)
    # 词形还原
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return lemmatized_words

# 去停用词
def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    return filtered_tokens

# 词干化
def stem_tokens(tokens):
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(word) for word in tokens]  # 添加进度条
    return stemmed_tokens

# NLTK词性标注函数
def nltk_tagger(text):
    # 分词
    tokens = word_tokenize(text)
    # 词性标注
    tagged_tokens = pos_tag(tokens)
    return tagged_tokens

CPU times: user 6 µs, sys: 1 µs, total: 7 µs
Wall time: 10 µs


In [13]:
%%time
# 一次性解决四个任务，优化代码，减少循环次数。
def preprocess_text(text):
    # 基本处理
    cleaned_text = clean_text(text)
    # 词形还原
    lemmatized_tokens = tokenize_and_lemmatize(cleaned_text)
    # 去停用词加词干化
    filtered_tokens = [word for word in lemmatized_tokens if word not in stop_words]
    return filtered_tokens

# 就这么用
# data_sample['cleaned_excerpt'] = data['excerpt'].apply(preprocess_text)

CPU times: user 4 µs, sys: 2 µs, total: 6 µs
Wall time: 8.82 µs


In [14]:
%%time
# 先将词性标注应用到数据集的每个'excerpt'
data_sample['tagged_excerpt'] = data_sample['excerpt'].apply(nltk_tagger)

CPU times: user 1.01 s, sys: 75.1 ms, total: 1.09 s
Wall time: 1.12 s


In [15]:
%%time
# 文本本身的处理
data_sample['cleaned_excerpt'] = data_sample['excerpt'].apply(preprocess_text)

CPU times: user 1.95 s, sys: 265 ms, total: 2.22 s
Wall time: 2.37 s


In [14]:
# %%time
# # 应用清洗函数
# data_sample['cleaned_excerpt'] = data_sample['excerpt'].apply(clean_text)

In [15]:
# %%time
# # 应用分词函数
# tqdm.pandas(desc="Tokenizing")# 使用tqdm来显示分词进度
# data_sample['tokenized_excerpt'] = data_sample['cleaned_excerpt'].progress_apply(tokenize_and_lemmatize)

In [11]:
# %%time
# # 去停用词
# data_sample['filtered_excerpt'] = data_sample['tokenized_excerpt'].apply(remove_stopwords)

In [12]:
# %%time
# # 词干化
# data_sample['stemmed_excerpt'] = data_sample['filtered_excerpt'].apply(stem_tokens)

In [18]:
%%time
# 查看处理后的数据
print(data_sample[['excerpt', 'cleaned_excerpt','tagged_excerpt']].head(1))

        excerpt cleaned_excerpt tagged_excerpt
6458866                      []             []
CPU times: user 3.76 ms, sys: 617 µs, total: 4.38 ms
Wall time: 5.83 ms


In [20]:
%%time
# 加载Loughran-McDonald Master Dictionary
dictionary_path = 'Loughran-McDonald_MasterDictionary_1993-2021.xlsx'
lm_dict = pd.read_excel(dictionary_path)

CPU times: user 22.1 s, sys: 592 ms, total: 22.7 s
Wall time: 23.6 s


In [21]:
%%time
# 将词典索引小写化
lm_dict['Word'] = lm_dict['Word'].str.lower()
lm_dict.set_index('Word', inplace=True)

# 然后在sentiment_score函数中使用小写的单词进行匹配
def sentiment_score(tagged_tokens, lm_dict):
    score = 0
    for word, tag in tagged_tokens:
        key = word.lower()  # 使用小写的词汇进行查找
        # 仅考虑词典中有的词
        if key in lm_dict.index:
            if lm_dict.loc[key, 'Positive'] > 0:
                score += 1
            elif lm_dict.loc[key, 'Negative'] > 0:
                score -= 1
    return score

# 注意确保传入的是词性标注过的列
data_sample['sentiment_score'] = data_sample['tagged_excerpt'].apply(lambda x: sentiment_score(x, lm_dict))
# 查看结果
print(data_sample[['excerpt', 'sentiment_score']].head())

                                                   excerpt  sentiment_score
6458866                                                                   0
7196896  Restaurant Cuisine in Moscow Still Has Little ...                1
3179383  Edward Freed Questioned in Murder of Audrey Va...               -1
7615113  Detroit Collects 36 Hits in Sending Wilson, Pi...               -1
7140569  sale delayed pending rules NEW CREAM BLEND APP...               -1
CPU times: user 52.1 s, sys: 1.04 s, total: 53.2 s
Wall time: 55 s


In [22]:
# 金融词频（总词频）统计，你想要的可能是每一年的词频统计，那么请你筛选出年份，然后把下面的data_sample替换为你筛选好的年份即可
from collections import Counter
# 提取词典中的所有词汇
all_words = lm_dict.index.tolist()
# 初始化一个词频统计字典
all_words_count = Counter()

In [23]:
%%time
# 遍历数据集中的文本
for excerpt in data_sample['cleaned_excerpt']:
    # 假设cleaned_excerpt已经是分词后的结果
    words = excerpt  # 如果cleaned_excerpt不是分词后的列表，需要先分词
    # 更新统计字典
    all_words_count.update(w for w in words if w in all_words)

# 输出最常见的词及其频率
print(all_words_count.most_common())

[('new', 43), ('say', 30), ('plan', 19), ('ha', 18), ('city', 18), ('report', 18), ('president', 18), ('would', 18), ('score', 17), ('company', 17), ('take', 16), ('union', 16), ('win', 16), ('program', 15), ('game', 15), ('group', 15), ('state', 15), ('two', 14), ('pay', 14), ('work', 13), ('system', 13), ('charge', 13), ('call', 13), ('board', 12), ('gain', 12), ('also', 12), ('first', 12), ('tell', 12), ('play', 12), ('may', 12), ('back', 12), ('get', 12), ('day', 12), ('set', 11), ('see', 11), ('year', 11), ('make', 11), ('rise', 11), ('give', 11), ('men', 11), ('general', 11), ('asks', 11), ('woman', 10), ('price', 10), ('official', 10), ('season', 10), ('issue', 10), ('head', 10), ('urge', 10), ('today', 10), ('seek', 10), ('line', 10), ('army', 10), ('trade', 10), ('held', 9), ('problem', 9), ('fund', 9), ('country', 9), ('fight', 9), ('record', 9), ('high', 9), ('week', 9), ('student', 9), ('aid', 9), ('world', 9), ('act', 9), ('war', 9), ('point', 9), ('open', 9), ('house', 9)

                                                    excerpt  sentiment_score
871974    will sail from Rye, June 11; plans and personn...                0
3563372   Straus Says Project, Held Up for Investigation...               -1
11774055                                                                   0
11142793  rally, presided over by Gen Eurico Jesus Deus ...               -3
10942590  lost record in the majors. When he was tired t...               -8
