In [5]:
# 依赖库，按照所列，pip安装即可
import requests
import os
import json
import pandas as pd
import csv
import datetime
import dateutil.parser
import unicodedata
import time

In [6]:
# Twitter API密钥（需要学术版）
def auth(token):
    return token

# 创建HTTP请求头
def create_headers(bearer_token):
    headers = {"Authorization": "Bearer {}".format(bearer_token)}
    return headers

# 创建请求地址
def create_url(keyword, start_date, end_date, max_results = 10):
    
    # 所调用的Twitter API终端，根据需求而定，详见Twitter API官方文档
    search_url = "https://api.twitter.com/2/tweets/search/all" 

    # 请求参数，此处需要根据自己需求调整，详见Twitter API官方文档
    query_params = {'query': keyword,
                    'start_time': start_date,
                    'end_time': end_date,
                    'max_results': max_results,
                    'expansions': 'author_id,in_reply_to_user_id,geo.place_id',
                    'tweet.fields': 'id,text,author_id,in_reply_to_user_id,geo,conversation_id,created_at,lang,public_metrics,referenced_tweets,reply_settings,source',
                    'user.fields': 'id,name,username,created_at,description,entities,location,pinned_tweet_id,profile_image_url,protected,public_metrics,verified,url,withheld',
                    'place.fields': 'full_name,id,country,country_code,geo,name,place_type',
                    'next_token': {}
                   }
    
    return (search_url, query_params)

# 发起请求，返回调用结果
def connect_to_endpoint(url, headers, params, next_token = None):
    params['next_token'] = next_token   #params object received from create_url function
    response = requests.request("GET", url, headers = headers, params = params)
    print("Endpoint Response Code: " + str(response.status_code))
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()

# 将结果添加到本地csv
def append_to_csv(json_response, fileName, count): # 传参为connect_to_endpoint所返回的结果，以及保存路径还有计数器

    # 计数变量，不用管
    counter = 0

    # 打开csv文件文件并准备写入
    csvFile = open(fileName, "a", newline="", encoding='utf-8')
    csvWriter = csv.writer(csvFile)

    # 遍历请求反返回的数据
    for i in range(len(json_response['data'])):
        # 每条tweet和user
        tweet = json_response['data'][i]
        for item in json_response['includes']['users']:
            if item['id'] == tweet['author_id']:
                user = item
                break
        # 解析JSON，即提取出我们所需的字段
        tweet_id = tweet['id'] if tweet['id'] else 'None'
        text = tweet['text']+ ' ' if tweet['text'] else 'None'
        author_id = tweet['author_id'] if tweet['author_id'] else 'None'
        name = user['name']+ ' ' if user['name'] else 'None'
        username = user['username']+ ' ' if user['username'] else 'None'
        if ('in_reply_to_user_id' in tweet):   
            in_reply_to_user_id = tweet['in_reply_to_user_id']
        else:
            in_reply_to_user_id = "None"
        if ('geo' in tweet):   
            geo = tweet['geo']['place_id']
        else:
            geo = "None"
        if ('conversation_id' in tweet):   
            conversation_id = tweet['conversation_id']
        else:
            conversation_id = "None"
        created_at = tweet['created_at'] if tweet['created_at'] else 'None'
        lang = tweet['lang'] if tweet['lang'] else 'None'
        like_count = tweet['public_metrics']['like_count'] if tweet['public_metrics']['like_count'] else 0
        quote_count = tweet['public_metrics']['quote_count'] if tweet['public_metrics']['quote_count'] else 0
        reply_count = tweet['public_metrics']['reply_count'] if tweet['public_metrics']['reply_count'] else 0
        retweet_count = tweet['public_metrics']['retweet_count'] if tweet['public_metrics']['retweet_count'] else 0
        if ('referenced_tweets' in tweet):
            referenced_tweets_id = tweet['referenced_tweets'][0]['id']
            tweet_type = tweet['referenced_tweets'][0]['type']
        else:
            referenced_tweets_id = 'None'
            tweet_type = 'origin'          
        reply_settings = tweet['reply_settings'] if tweet['reply_settings'] else 'None'
        source = tweet['source'] if tweet['source'] else 'None'
        user_created_at = user['created_at'] if user['created_at'] else 'None'
        description = user['description']+ ' ' if user['description'] else 'None'
        if ('location' in user):
            location = user['location'] + ' '
        else:
            location = "None"        
        if ('profile_image_url' in user):   
            profile_image_url = user['profile_image_url']
        else:
            profile_image_url = "None"
        protected = user['protected'] if user['protected'] else 'False'
        followers_count = user['public_metrics']['followers_count'] if user['public_metrics']['followers_count'] else 0
        following_count = user['public_metrics']['following_count'] if user['public_metrics']['following_count'] else 0
        listed_count = user['public_metrics']['listed_count'] if user['public_metrics']['listed_count'] else 0
        tweet_count = user['public_metrics']['tweet_count'] if user['public_metrics']['tweet_count'] else 0
        verified = user['verified'] if user['verified'] else 'False'
        url = user['url'] if user['url'] else 'None'
        
        # 将提取的字段数据合并为一行
        res = [tweet_id,text,author_id,name,username,in_reply_to_user_id,geo,conversation_id,created_at,lang,like_count,quote_count,reply_count,retweet_count,referenced_tweets_id,tweet_type,reply_settings,source,
                user_created_at,description,location,profile_image_url,protected,followers_count,following_count,listed_count,tweet_count,verified,url]
        
        # 写入到csv文件当中
        csvWriter.writerow(res)
        counter += 1

    # 关闭文件写入
    csvFile.close()

    # 输出已添加的推文数量
    print("# of Tweets added from this response: ", counter) 

In [7]:
# 参数填写

# 获取api密钥
bearer_token = auth('AAAAAAAAAAAAAAAAAAAAABxIRQEAAAAAL7fzqTTYXbPyMEQRVjxkHx3VCRk%3DzsMFUXwoFFHb5vBoVvxIcLRsekLpjGmk1kr5DTRejmBCzPXv5Z')
# 创建HTTP请求头
headers = create_headers(bearer_token)
# 查询语句(很重要)，Twitter API官方文档有一套专门的查询语言：
keyword = "#Tencent OR #Alibaba OR #Huawei OR #ByteDance OR #CATL lang:en"
# 查询的日期，这里注意，start_list和end_list为一对一的关系，即2018-12-07T11:43:04.000Z到2019-01-06T00:00:00.000Z为一个查询区间，以此类推
start_list =    [
                 '2018-12-07T11:43:04.000Z',
                 '2019-01-13T00:00:00.000Z',
                 '2020-05-17T00:00:00.000Z',
                 '2021-01-01T00:00:00.000Z'
                ]

end_list =      [
                 '2019-01-06T00:00:00.000Z',
                 '2019-02-10T00:00:00.000Z',
                 '2020-05-31T00:00:00.000Z',
                 '2021-07-05T00:00:00.000Z'
                ]
# 单次请求的最大推文数量，Twitter官方建议是100，不过我用的学术版好像不止
max_results = 400
# 计数变量，统计所有推文数量，不用管
total_tweets = 0

# 创建文件并准备写入，请使用csv格式
csvFile = open("data2.csv", "a", newline="", encoding='utf-8-sig') # 此处填写路径
csvWriter = csv.writer(csvFile)

# 输入表格头
csvWriter.writerow(['tweet_id','text','author_id','name','username','in_reply_to_user_id','geo','conversation_id','created_at','lang','like_count','quote_count','reply_count','retweet_count','referenced_tweets_id','tweet_type','reply_settings','source',
                'user_created_at','description','location','profile_image_url','protected','followers_count','following_count','listed_count','tweet_count','verified','url'
                ])
csvFile.close()

In [8]:
# 执行部分

# 遍历日期
for i in range(0,len(start_list)):

    # 参数
    count = 0 # 记录数量的变量
    flag = True # 判断是否结束的变量
    next_token = None # 下一轮调用的密钥，前面已经解释过了
    counter = 0 # 记录第几次请求的变量
    
    # 判断是否获取结束，未结束则一直循环执行
    while flag:
        print("-------------------")
        print("Token: ", next_token)
        url = create_url(keyword, start_list[i],end_list[i], max_results)
        json_response = connect_to_endpoint(url[0], headers, url[1], next_token)
        tempStr = json.dumps(json_response)
        f = open(f'./archive1/{count}new_json{counter}.json', 'w') # 原始数据备份，以免意外
        f.write(tempStr)
        f.close()
        result_count = json_response['meta']['result_count']

        # 存在next_token说明还没获取完
        if 'next_token' in json_response['meta']:
            # 保存好next_token
            next_token = json_response['meta']['next_token']
            print("Next Token: ", next_token)
            if result_count is not None and result_count > 0 and next_token is not None:
                print("Start Date: ", start_list[i])
                append_to_csv(json_response, "data2.csv", count) # 写入csv文件
                count += result_count
                total_tweets += result_count
                print("Total # of Tweets added: ", total_tweets)
                print("-------------------")
                time.sleep(5) # 暂停几秒，以免超出速率限制   
        
        # 不存在next_token说明获取完了
        else:
            if result_count is not None and result_count > 0:
                print("-------------------")
                print("Start Date: ", start_list[i])
                append_to_csv(json_response, "data2.csv", count)
                count += result_count
                total_tweets += result_count
                print("Total # of Tweets added: ", total_tweets)
                print("-------------------")
                time.sleep(5)
            
            # 完毕，改变flag
            flag = False
            next_token = None
        time.sleep(5)
        counter += 1

print("Total number of results: ", total_tweets)

-------------------
Token:  None
Endpoint Response Code: 200
Next Token:  b26v89c19zqg8o3fn0yk88bcj6uls93a797zb2j6mf88t
Start Date:  2018-12-07T11:43:04.000Z
# of Tweets added from this response:  391
Total # of Tweets added:  391
-------------------
-------------------
Token:  b26v89c19zqg8o3fn0yk88bcj6uls93a797zb2j6mf88t
Endpoint Response Code: 200
Next Token:  b26v89c19zqg8o3fn0yk888nies8gj3vmilsjbg6ozurh
Start Date:  2018-12-07T11:43:04.000Z
# of Tweets added from this response:  395
Total # of Tweets added:  786
-------------------


KeyboardInterrupt: 