In [8]:
import requests
import pandas as pd
import datetime
import time

def get_gdelt_news(date_str, keywords, max_records=1):
  
    
    base_url = "https://api.gdeltproject.org/api/v2/doc/doc"
    query_keywords = " OR ".join(keywords)  # GDELT查询关词用OR连接

    params = {
        'query': query_keywords,
        'mode': 'ArtList',       # 返回文章列表
        'format': 'json',        # JSON格式
        'maxrecords': max_records,
        'startdatetime': date_str + "000000",
        'enddatetime': date_str + "235959",
        'sort': 'datedesc'
    }

    try:
        response = requests.get(base_url, params=params)
        data = response.json()

        articles = data.get('articles', [])
        titles = [article['title'] for article in articles]
        return titles
    except Exception as e:
        print(f"Error fetching GDELT data for {date_str}: {e}")
        return []

def generate_gdelt_news_file(start_date, end_date, keywords):

    dates = pd.date_range(start_date, end_date)
    rows = []

    for date in dates:
        date_str = date.strftime("%Y%m%d")
        titles = get_gdelt_news(date_str, keywords)
        row = [date.strftime("%Y-%m-%d")] + titles
        rows.append(row)
        print(f"Fetched {len(titles)} news for {date.strftime('%Y-%m-%d')}")
        time.sleep(1)  # 避免API请求过快，轻度限速

    # 为了统一列数，补齐空白列（假设最多10条新闻）
    max_len = 11  # 1列日期 + 10条新闻
    for i in range(len(rows)):
        while len(rows[i]) < max_len:
            rows[i].append("")

    columns = ["Date"] + [f"News {i+1}" for i in range(max_len-1)]
    df = pd.DataFrame(rows, columns=columns)
    df.to_csv("gdelt_news.csv", index=False)
    print("Saved news to JPY_news.csv")

# 示例调用：抓取2022年Q3期间，包含USD或CNY或exchange rate的新闻标题
generate_gdelt_news_file("2024-07-10", "2025-07-10",["THB"])


Fetched 1 news for 2024-07-10
Fetched 1 news for 2024-07-11
Fetched 1 news for 2024-07-12
Fetched 1 news for 2024-07-13
Fetched 1 news for 2024-07-14
Fetched 1 news for 2024-07-15
Fetched 1 news for 2024-07-16
Fetched 1 news for 2024-07-17
Fetched 1 news for 2024-07-18
Fetched 1 news for 2024-07-19
Fetched 1 news for 2024-07-20
Fetched 1 news for 2024-07-21
Fetched 1 news for 2024-07-22
Fetched 1 news for 2024-07-23
Fetched 1 news for 2024-07-24
Fetched 1 news for 2024-07-25
Fetched 1 news for 2024-07-26
Fetched 1 news for 2024-07-27
Fetched 1 news for 2024-07-28
Fetched 1 news for 2024-07-29
Fetched 1 news for 2024-07-30
Fetched 1 news for 2024-07-31
Fetched 1 news for 2024-08-01
Fetched 1 news for 2024-08-02
Fetched 1 news for 2024-08-03
Fetched 1 news for 2024-08-04
Fetched 1 news for 2024-08-05
Fetched 1 news for 2024-08-06
Fetched 1 news for 2024-08-07
Fetched 1 news for 2024-08-08
Fetched 1 news for 2024-08-09
Fetched 1 news for 2024-08-10
Fetched 1 news for 2024-08-11
Fetched 1 

In [7]:
import pandas as pd 
import numpy as np
def FinBERT_sentiment_score(news_list):
    """
    传入新闻标题列表，返回平均情感分数
    """
    from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
    tokenizer = AutoTokenizer.from_pretrained('ProsusAI/finbert')
    finbert = AutoModelForSequenceClassification.from_pretrained('ProsusAI/finbert')
    nlp = pipeline("sentiment-analysis", model=finbert, tokenizer=tokenizer)
    # news_list 必须是 List[str]
    # 过滤掉非字符串和空字符串
    news_list = [str(i) for i in news_list if isinstance(i, str) and i.strip() != ""]
    if not news_list:  # 空列表直接返回0
        return 0
    results = nlp(news_list)
    scores = []
    for result in results:
        if result['label'] == "positive":
            scores.append(result['score'])
        elif result['label'] == "neutral":
            scores.append(0)
        else:
            scores.append(-result['score'])
    # 返回平均分
    return np.mean(scores)


def VADER_sentiment_score(heading):
    """
    compute sentiment score using pretrained VADER on -1 to 1 scale. -1 being negative and 1 being positive
    """
    import nltk
    from nltk.sentiment.vader import SentimentIntensityAnalyzer
    nltk.download('vader_lexicon')
    from nltk.sentiment.vader import SentimentIntensityAnalyzer
    analyzer = SentimentIntensityAnalyzer()
    result = analyzer.polarity_scores(heading)
    if result['pos'] == max(result['neg'], result['neu'], result['pos']):
        return result['pos']
    if result['neg'] == max(result['neg'], result['neu'], result['pos']):
        return (0 - result['neg'])
    else:
        return 0

news_df = pd.read_csv(r"C:\Users\wwwwang\Downloads\news to sentiment\JPY_news.csv")



BERT_sentiment = []


for i in range(len(news_df)):
    news_list = news_df.iloc[i, 1:].tolist()
    news_list = [i for i in news_list if i != '0']
    score_BERT = FinBERT_sentiment_score(news_list)
    BERT_sentiment.append(score_BERT)


# print(news_df.iloc[129])

news_df['FinBERT score'] = BERT_sentiment

news_df.to_csv("sentiment.csv")

Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
